| 212 | |
| 213 | === Detect genes that are differentially expressed between conditions === |
| 214 | Often one is interested in how a treatment or condition alters the gene expression profile in a selected cell type. In the desirable case where multiple biological replicates are present for each condition, K. D. Zimmerman, M. A. Espeland and Carl D. Langefeld have recently [https://www.nature.com/articles/s41467-021-21038-1 highlighted] the importance of properly taking account of the correlation present in such hierarchically structured data. One strategy, the so-called pseudobulk approach, is to aggregate counts across cells from the same biological sample or subject. Mixed-effects modeling, where sample is treated as a random effect, is another strategy. The code below uses mixed effects modeling within [https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0844-5 MAST] and has been adapted from [https://github.com/kdzimm/PseudoreplicationPaper K. D. Zimmerman et al.] |
| 215 | |
| 216 | * Testing for differential expression with mixed-effects models in MAST: |
| 217 | {{{ |
| 218 | require(MAST) |
| 219 | require(dplyr) |
| 220 | require(gdata) |
| 221 | require(reshape2) |
| 222 | |
| 223 | # Assume that differential expression is being tested with in a single |
| 224 | # cluster/cell type from a single cell RNA-seq data-set. The counts matrix |
| 225 | # is the starting point. From a Seurat object, it can be extracted as follows: |
| 226 | allgenesCT<-t(as.data.frame(clusterCells@assays$RNA@counts)) # Un-normalized |
| 227 | allgenesCT<-as.data.frame(allgenesCT) |
| 228 | ngenes<-ncol(allgenesCT) |
| 229 | allgenesCT$Sample<-clusterCells@meta.data$Sample |
| 230 | allgenesCT$Treat<-clusterCells@meta.data$Treat |
| 231 | allgenesCT$wellKey<-paste(clusterCells@meta.data$Sample, rownames(clusterCells@meta.data), sep = "_") |
| 232 | rownames(allgenesCT)<-allgenesCT$wellKey |
| 233 | allgenesCT<-allgenesCT[,c(ngenes+3,ngenes+2,ngenes+1,1:ngenes)] |
| 234 | genecountsRaw<-as.matrix(t(allgenesCT[,c(-1,-2,-3)])) |
| 235 | genecounts<-log2(genecountsRaw + 1) # Normalization with pseudocount. |
| 236 | coldata<-allgenesCT[,1:3] |
| 237 | coldata$Sample<-as.factor(coldata$Sample) |
| 238 | |
| 239 | # Filter out genes expressed in <10% of cells |
| 240 | genecounts<-genecounts[rowSums(genecounts>0)/(ncol(genecounts))>=0.1,] |
| 241 | genecounts<-genecounts[,rownames(coldata)] |
| 242 | |
| 243 | fDataCT<-data.frame(primerid=rownames(genecounts)) |
| 244 | sca<-MAST::FromMatrix(exprsArray=genecounts, cData=coldata, fData=fDataCT) |
| 245 | |
| 246 | # CDR is the proportion of genes detected in each cell and is defined in |
| 247 | # G. Finak et al., Genome Biology 16, 278 (2015). CDR functions as a |
| 248 | # regularizer in the model. |
| 249 | cdr2<-colSums(SummarizedExperiment::assay(sca)>0) |
| 250 | |
| 251 | # Reassign cellular detection rate. |
| 252 | SummarizedExperiment::colData(sca)$ngeneson<-scale(cdr2) |
| 253 | |
| 254 | # Assign predictors as factors. |
| 255 | SummarizedExperiment::colData(sca)$Sample<-factor(SummarizedExperiment::colData(sca)$Sample) |
| 256 | SummarizedExperiment::colData(sca)$Treat<-factor(SummarizedExperiment::colData(sca)$Treat) |
| 257 | |
| 258 | # In the model below, "Treat" is a categorical fixed effect, while "Sample" |
| 259 | # is a categorical random effect with intercept varying by sample. |
| 260 | options(mc.cores = 4) # The number of available cores is hardware-dependent. |
| 261 | zlmCond<-zlm(~ Treat + ngeneson + (1 | Sample), sca, method='glmer', ebayes = FALSE, fitArgsD = list(nAGQ = 0), strictConvergence = FALSE, parallel=TRUE) |
| 262 | |
| 263 | # Tabulate statistics for for treatment ("TreatT") versus control ("TreatC" is |
| 264 | # the reference level of "Treat"). |
| 265 | summaryCond<-MAST::summary(zlmCond,doLRT=TRUE) |
| 266 | summaryDt<-summaryCond$datatable |
| 267 | fcHurdle<-merge(summaryDt[contrast=='TreatT' & component=='H',.(primerid, `Pr(>Chisq)`)], summaryDt[contrast=='TreatT' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') |
| 268 | fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')] # Correct for multiple testing. |
| 269 | fcHurdle<-stats::na.omit(as.data.frame(fcHurdle)) # Omit NAs, if present. |
| 270 | fcHurdle<-fcHurdle[order(fcHurdle$fdr),] |
| 271 | |
| 272 | # Examine the top differentially expressed genes, ranked by FDR. |
| 273 | head(fcHurdle) |
| 274 | }}} |