| | 212 | |
| | 213 | === Detect genes that are differentially expressed between conditions === |
| | 214 | Often one is interested in how a treatment or condition alters the gene expression profile in a selected cell type. In the desirable case where multiple biological replicates are present for each condition, K. D. Zimmerman, M. A. Espeland and Carl D. Langefeld have recently [https://www.nature.com/articles/s41467-021-21038-1 highlighted] the importance of properly taking account of the correlation present in such hierarchically structured data. One strategy, the so-called pseudobulk approach, is to aggregate counts across cells from the same biological sample or subject. Mixed-effects modeling, where sample is treated as a random effect, is another strategy. The code below uses mixed effects modeling within [https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0844-5 MAST] and has been adapted from [https://github.com/kdzimm/PseudoreplicationPaper K. D. Zimmerman et al.] |
| | 215 | |
| | 216 | * Testing for differential expression with mixed-effects models in MAST: |
| | 217 | {{{ |
| | 218 | require(MAST) |
| | 219 | require(dplyr) |
| | 220 | require(gdata) |
| | 221 | require(reshape2) |
| | 222 | |
| | 223 | # Assume that differential expression is being tested with in a single |
| | 224 | # cluster/cell type from a single cell RNA-seq data-set. The counts matrix |
| | 225 | # is the starting point. From a Seurat object, it can be extracted as follows: |
| | 226 | allgenesCT<-t(as.data.frame(clusterCells@assays$RNA@counts)) # Un-normalized |
| | 227 | allgenesCT<-as.data.frame(allgenesCT) |
| | 228 | ngenes<-ncol(allgenesCT) |
| | 229 | allgenesCT$Sample<-clusterCells@meta.data$Sample |
| | 230 | allgenesCT$Treat<-clusterCells@meta.data$Treat |
| | 231 | allgenesCT$wellKey<-paste(clusterCells@meta.data$Sample, rownames(clusterCells@meta.data), sep = "_") |
| | 232 | rownames(allgenesCT)<-allgenesCT$wellKey |
| | 233 | allgenesCT<-allgenesCT[,c(ngenes+3,ngenes+2,ngenes+1,1:ngenes)] |
| | 234 | genecountsRaw<-as.matrix(t(allgenesCT[,c(-1,-2,-3)])) |
| | 235 | genecounts<-log2(genecountsRaw + 1) # Normalization with pseudocount. |
| | 236 | coldata<-allgenesCT[,1:3] |
| | 237 | coldata$Sample<-as.factor(coldata$Sample) |
| | 238 | |
| | 239 | # Filter out genes expressed in <10% of cells |
| | 240 | genecounts<-genecounts[rowSums(genecounts>0)/(ncol(genecounts))>=0.1,] |
| | 241 | genecounts<-genecounts[,rownames(coldata)] |
| | 242 | |
| | 243 | fDataCT<-data.frame(primerid=rownames(genecounts)) |
| | 244 | sca<-MAST::FromMatrix(exprsArray=genecounts, cData=coldata, fData=fDataCT) |
| | 245 | |
| | 246 | # CDR is the proportion of genes detected in each cell and is defined in |
| | 247 | # G. Finak et al., Genome Biology 16, 278 (2015). CDR functions as a |
| | 248 | # regularizer in the model. |
| | 249 | cdr2<-colSums(SummarizedExperiment::assay(sca)>0) |
| | 250 | |
| | 251 | # Reassign cellular detection rate. |
| | 252 | SummarizedExperiment::colData(sca)$ngeneson<-scale(cdr2) |
| | 253 | |
| | 254 | # Assign predictors as factors. |
| | 255 | SummarizedExperiment::colData(sca)$Sample<-factor(SummarizedExperiment::colData(sca)$Sample) |
| | 256 | SummarizedExperiment::colData(sca)$Treat<-factor(SummarizedExperiment::colData(sca)$Treat) |
| | 257 | |
| | 258 | # In the model below, "Treat" is a categorical fixed effect, while "Sample" |
| | 259 | # is a categorical random effect with intercept varying by sample. |
| | 260 | options(mc.cores = 4) # The number of available cores is hardware-dependent. |
| | 261 | zlmCond<-zlm(~ Treat + ngeneson + (1 | Sample), sca, method='glmer', ebayes = FALSE, fitArgsD = list(nAGQ = 0), strictConvergence = FALSE, parallel=TRUE) |
| | 262 | |
| | 263 | # Tabulate statistics for for treatment ("TreatT") versus control ("TreatC" is |
| | 264 | # the reference level of "Treat"). |
| | 265 | summaryCond<-MAST::summary(zlmCond,doLRT=TRUE) |
| | 266 | summaryDt<-summaryCond$datatable |
| | 267 | fcHurdle<-merge(summaryDt[contrast=='TreatT' & component=='H',.(primerid, `Pr(>Chisq)`)], summaryDt[contrast=='TreatT' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') |
| | 268 | fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')] # Correct for multiple testing. |
| | 269 | fcHurdle<-stats::na.omit(as.data.frame(fcHurdle)) # Omit NAs, if present. |
| | 270 | fcHurdle<-fcHurdle[order(fcHurdle$fdr),] |
| | 271 | |
| | 272 | # Examine the top differentially expressed genes, ranked by FDR. |
| | 273 | head(fcHurdle) |
| | 274 | }}} |