This outline contains a template of a typical DESeq2 workflow - as discussed during the course.
For specific projects, the parts on data exploration, quality controls and filtering are frequently more iterative and could require tweaking.

Load Libraries


Data, Metadata and Design

# Raw count table: one row per gene, one column per sample.
dfile <- "data/mpp/mpp_counts.tsv"
data <- read_tsv(dfile)
## Rows: 24417 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (1): gene_id
## dbl (12): Young_HSC_1, Young_HSC_2, Young_MPP1_1, Young_MPP1_2, Young_MPP2_1...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Move the gene identifiers into the row names so only count columns remain.
data <- column_to_rownames(data, var = "gene_id")

# Sample sheet: one row per sample with its cell type and condition.
mfile <- "data/mpp/mpp_meta.tsv"
metadata <- read_tsv(mfile)
## Rows: 12 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): sample, celltype, condition
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sample names become row names so they can be compared to the count columns.
metadata <- column_to_rownames(metadata, var = "sample")
# Turn both design variables into factors (as.factor() sorts the levels).
factor_cols <- c("condition", "celltype")
metadata[factor_cols] <- lapply(metadata[factor_cols], as.factor)

# Model: main effects of cell type and condition plus their interaction.
my_design <- ~ celltype + condition + celltype:condition

# Sanity check: stop right here unless the samples in the metadata are exactly
# the count columns, in the same order. identical() also catches a length
# mismatch, which an element-wise `==` would hide through recycling.
stopifnot(identical(rownames(metadata), colnames(data)))

# Bundle counts, sample annotation and design into one DESeqDataSet.
dds <- DESeqDataSetFromMatrix(
  countData = data,
  colData = metadata,
  design = my_design
)
## converting counts to integer mode

Data Exploration and Filters

# data exploration and QC: not shown
# ...

# remove genes whose total count over all samples is 0 or 1
keep_genes <- rowSums(counts(dds)) > 1
dds <- dds[keep_genes, ]

# process/filter samples as required
# ...

# rerun Data Exploration and QC on the rlog-transformed counts
rld <- rlog(dds)
plotPCA(rld, intgroup = c("condition", "celltype"))

# keep transformed matrix A for later visualization
# scale rows (genes) for heatmap = transpose; (column) scale; transpose
# NOTE: `normalized` is an argument of counts(), not of assay(); the rlog
# values are taken from the assay as they are, so the argument is dropped here.
A <- assay(rld) %>%
  t() %>%
  scale() %>%
  t()

Run DESeq

# Fit the model; the individual steps are reported in the messages below.
dds <- DESeq(object = dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# The sample annotation now carries an additional sizeFactor column.
colData(dds)
## DataFrame with 12 rows and 3 columns
##              celltype condition sizeFactor
##              <factor>  <factor>  <numeric>
## Young_HSC_1      HSC      Young   0.874166
## Young_HSC_2      HSC      Young   1.090789
## Young_MPP1_1     MPP1     Young   1.050110
## Young_MPP1_2     MPP1     Young   1.059677
## Young_MPP2_1     MPP2     Young   0.930771
## ...               ...       ...        ...
## Aged_HSC_2       HSC       Aged    1.07411
## Aged_MPP1_1      MPP1      Aged    0.98275
## Aged_MPP1_2      MPP1      Aged    1.10200
## Aged_MPP2_1      MPP2      Aged    1.00093
## Aged_MPP2_2      MPP2      Aged    1.04783
# Diagnostic plot of the dispersion estimates.
plotDispEsts(object = dds, main = "Dispersion plot")

# Optional: the per-gene modeling results live in the row metadata.
# mcols(dds)


Verify the (base) levels and the available coefficients (resultsNames).
Define the model matrix and contrasts systematically.

##  [1] Young Young Young Young Young Young Aged  Aged  Aged  Aged  Aged  Aged 
## Levels: Aged Young
## Levels: HSC MPP1 MPP2
## [1] "Intercept"                   "celltype_MPP1_vs_HSC"       
## [3] "celltype_MPP2_vs_HSC"        "condition_Young_vs_Aged"    
## [5] "celltypeMPP1.conditionYoung" "celltypeMPP2.conditionYoung"
# Build the model matrix from the samples that are actually in dds, so its rows
# line up with the indices computed from dds below even if samples were
# removed after the metadata table was read.
mod_mat <- model.matrix(my_design, as.data.frame(colData(dds)))

# define contrast systematically:
# row indices of the two groups to compare
Aged_MPP2 <- which(dds$condition == "Aged" & dds$celltype == "MPP2")
Young_MPP2 <- which(dds$condition == "Young" & dds$celltype == "MPP2")
# an empty group would silently yield NaN weights, so fail early instead
stopifnot(length(Aged_MPP2) > 0, length(Young_MPP2) > 0)

# contrast = mean design row of the Young group minus that of the Aged group;
# drop = FALSE keeps a one-sample group as a matrix so colMeans() still works
YvA_MPP2 <- colMeans(mod_mat[Young_MPP2, , drop = FALSE]) -
  colMeans(mod_mat[Aged_MPP2, , drop = FALSE])
YvA_MPP2                                    # show the contrast weights

##                 (Intercept)                celltypeMPP1 
##                           0                           0 
##                celltypeMPP2              conditionYoung 
##                           0                           1 
## celltypeMPP1:conditionYoung celltypeMPP2:conditionYoung 
##                           0                           1


# Shrunken log2 fold changes for the numeric contrast defined above.
res <- lfcShrink(dds, contrast = YvA_MPP2, type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
# Look at the result table: the first five genes ...
head(res, 5)
## log2 fold change (MMSE): 0,0,0,+1,0,+1 
## Wald test p-value: 0,0,0,+1,0,+1 
## DataFrame with 5 rows and 5 columns
##          baseMean log2FoldChange     lfcSE      pvalue        padj
##         <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## Plekhg2  133.8963       0.116174  0.185884 3.10302e-01 5.60298e-01
## Plekhg3  195.2731       0.134482  0.171969 2.46270e-01 4.89375e-01
## Plekhg1  120.6119       0.807279  0.548689 5.10819e-04 4.86566e-03
## Plekhg6   77.5309       0.335161  0.517385 1.56011e-02 7.23021e-02
## Plekhg5  168.7902       0.872573  0.276774 1.28764e-06 2.82492e-05
# ... and the overall up/down counts across all genes.
summary(res)
## out of 16478 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1482, 9%
## LFC < 0 (down)     : 1818, 11%
## outliers [1]       : 0, 0%
## low counts [2]     : 2876, 17%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# MA-plot as provided by DESeq2.
plotMA(res, ylim = c(-3, 3))

# Minimal volcano plot: effect size against -log10 adjusted p-value.
plot(res$log2FoldChange, -log10(res$padj), cex = 0.3)

# A nicer volcano could go here:
#my_volcano(res)

# plot top-gene (smallest adjusted p-value); the gene is passed by name so the
# lookup does not depend on res and dds sharing the same row order
top_gene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = top_gene, intgroup = c("condition", "celltype"))

# plot heatmap for top 15 genes
ntop <- 15
sorted_res <- res %>% data.frame() %>% arrange(padj)   # sort results by padj
top_genes <- sorted_res %>% head(ntop) %>% row.names() # top gene names
col_sel <- c(Young_MPP2, Aged_MPP2)                    # select only samples in contrast
# `annotation` is a deprecated pheatmap argument; annotation_col is the
# supported way to annotate the columns (samples) of the heatmap
pheatmap(
  A[top_genes, col_sel],
  main = "Top DE Genes (rlog)",
  annotation_col = metadata
)

# Export the sorted result table; gene names move from the row names into a
# regular "Gene" column first so they end up in the file.
fn <- "results.tsv"                     # consider a more descriptive filename
out_tbl <- rownames_to_column(sorted_res, var = "Gene")
write_tsv(out_tbl, file = fn)


