21 min to read
scRNA-seq下游分析
原理
实验层面
细胞测序需要检测支原体。
简介
相比于bulk RNA-seq,scRNA-seq数据的存在稀疏,高噪声,高维度的特征。
以Nanoliter Droplets方法为例[03]
首先是组织处理得到单细胞,包裹在单个microparticle里面,而microparticle里面又存有包含polyT的beads,于是可以结合mRNA反转成为cDNA,建成pool进行PCR扩增,最后混合所有的STAMPs高通量测序得到数据。
每个micro particle上面的序列由四个部分组成:
-
一段一样的序列,PCR handle用于后续的PCR扩增
-
bead特异性的barcode,用来区分单个细胞
-
UMI,Unique Molecular Identifier,4 - 8bp,用来区分transcripts
-
30bp的oligo-dT,用来捕捉mRNA完成反转录
问题
-
扩增偏差,单个细胞初始转录本的捕捉效率和低输入
-
基因丢失(Gene dropouts),有些基因会在某个细胞里检测到具有中等表达水平却在其它细胞里面没有被发现
-
tag-based的优势在于它可以结合UMI(前面介绍过)来提高定量的水平,缺点在于未捕捉完全的转录本序列,在比对的时候无法区分iosform (Archer et al. 2016)
实战
简介
S4对象中有一系列的slot变量,其类型就是我们设置的类型,可以为S4对象定义方法。
SingleCellExperiment类是用于单细胞基因组数据的轻量级容器。行代表特征,列代表细胞。SingleCellExperiment类包含:int_elementMetadata,用于保存内部元数据的每行;int_colData,用于保存内部元数据的每列;int_metadata,用于保存内部元数据的全部对象;reducedDims,用于保存降维结果;int_前缀不适合用户直接操作的内部slots。
常用的R包
scater
实操
library(DropletUtils)
library(biomaRt)
library(dplyr)
library(scater)
path="C:\\Users\\Administrator\\Desktop\\hg19"
sce = read10xCounts(path, col.names=TRUE)
###获取基因注释信息
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
attr.string = c('ensembl_gene_id', 'hgnc_symbol', 'chromosome_name')
attr.string = c(attr.string, 'start_position', 'end_position', 'strand')
attr.string = c(attr.string, 'description', 'percentage_gene_gc_content')
attr.string = c(attr.string, 'gene_biotype')
rowData(sce)[1:2,]
gene.annotation = getBM(attributes=attr.string,
filters = 'ensembl_gene_id',
values = rowData(sce)$ID,
mart = ensembl)
t1 = table(gene.annotation$ensembl_gene_id)
t2 = t1[t1 > 1]
gene.annotation[which(gene.annotation$ensembl_gene_id %in% names(t2)),]
gene.annotation = distinct(gene.annotation, ensembl_gene_id,
.keep_all = TRUE)
gene.missing = setdiff(rowData(sce)$ID, gene.annotation$ensembl_gene_id)
w2kp = match(gene.annotation$ensembl_gene_id, rowData(sce)$ID)
sce = sce[w2kp,]
dim(sce)
table(gene.annotation$ensembl_gene_id == rowData(sce)$ID)
rowData(sce) = gene.annotation
rownames(sce) = uniquifyFeatureNames(rowData(sce)$ensembl_gene_id,
rowData(sce)$hgnc_symbol)
###Identify low quallity cells
bcrank = barcodeRanks(counts(sce))
# Only showing unique points for plotting speed.
uniq = !duplicated(bcrank$rank)
par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=bcrank$inflection, col="darkgreen", lty=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2)
legend("left", legend=c("Inflection", "Knee"), bty="n",
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)
bcrank$inflection
bcrank$knee
summary(bcrank$total)
table(bcrank$total >= bcrank$knee)
table(bcrank$total >= bcrank$inflection)
set.seed(100)
e.out = emptyDrops(counts(sce))
e.out
is.cell = (e.out$FDR <= 0.01)
par(mar=c(5,4,1,1), mfrow=c(1,2), bty="n")
plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),
xlab="Total UMI count", ylab="-Log Probability", cex=0.2)
abline(v = bcrank$inflection, col="darkgreen")
abline(v = bcrank$knee, col="dodgerblue")
legend("bottomright", legend=c("Inflection", "Knee"), bty="n",
col=c("darkgreen", "dodgerblue"), lty=1, cex=1.2)
plot(e.out$Total, -e.out$LogProb, col=ifelse(is.cell, "red", "black"),
xlab="Total UMI count", ylab="-Log Probability", cex=0.2, xlim=c(0,2000), ylim=c(0,2000))
abline(v = bcrank$inflection, col="darkgreen")
abline(v = bcrank$knee, col="dodgerblue")
table(colnames(sce) == rownames(e.out))
table(e.out$FDR <= 0.01, useNA="ifany")
table(is.cell, e.out$Total >= bcrank$inflection)
w2kp = which(is.cell & e.out$Total >= bcrank$inflection)
sce = sce[,w2kp]
dim(sce)
library(data.table)
ribo.file = "C:\\Users\\Administrator\\Desktop\\ribosome_genes.txt"
ribo = fread(ribo.file)
dim(ribo)
is.mito = which(rowData(sce)$chromosome_name == "MT")
is.ribo = which(rowData(sce)$hgnc_symbol %in% ribo$`Approved symbol`)
length(is.ribo)
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is.mito, Ri=is.ribo))
colnames(colData(sce))
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="",
breaks=20, col="grey80", ylab="Number of cells")
hist(log10(sce$total_features), xlab="log10(# of expressed genes)",
main="", breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
ylab="Number of cells", breaks=40, main="", col="grey80")
hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)",
ylab="Number of cells", breaks=80, main="", col="grey80")
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
smoothScatter(log10(sce$total_counts), log10(sce$total_features),
xlab="log10(Library sizes)", ylab="log10(# of expressed genes)",
nrpoints=500, cex=0.5)
smoothScatter(log10(sce$total_counts), sce$pct_counts_Ri,
xlab="log10(Library sizes)", ylab="Ribosome prop. (%)",
nrpoints=500, cex=0.5)
abline(h=10, lty=1)
smoothScatter(log10(sce$total_counts), sce$pct_counts_Mt,
xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)",
nrpoints=500, cex=0.5)
abline(h=5, lty=1)
smoothScatter(sce$pct_counts_Ri, sce$pct_counts_Mt,
xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)",
nrpoints=500, cex=0.5)
abline(h=5, lty=1)
abline(v=10, lty=1)
table(sce$pct_counts_Mt > 5, sce$pct_counts_Ri < 10)
sce.lq = sce[,which(sce$pct_counts_Mt > 5 | sce$pct_counts_Ri < 10)]
sce = sce[,which(sce$pct_counts_Mt <= 5 | sce$pct_counts_Ri >= 10)]
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80", main="",
breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_by_counts+1), col="grey80", main="",
breaks=40, xlab="log10(# of expressed cells + 1)")
smoothScatter(log10(rowData(sce)$mean_counts+1e-6),
log10(rowData(sce)$n_cells_by_counts + 1),
xlab="log10(ave # of UMI + 1e-6)",
ylab="log10(# of expressed cells + 1)")
tb1 = table(rowData(sce)$n_cells_by_counts)
names(rowData(sce))[6] = "strand_n"
sce = sce[which(rowData(sce)$n_cells_by_counts > 1),]
dim(sce)
par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1,
names.arg=rowData(sce)$hgnc_symbol[od1[20:1]],
horiz=TRUE, cex.names=0.5, cex.axis=0.7,
xlab="ave # of UMI")
library(scran)
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors",
cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
sce = normalize(sce)
new.trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6),
xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new.trend(x), col="red", lwd=2, add=TRUE)
legend("topright", legend=c("Poisson noise", "observed trend"),
lty=1, lwd=2, col=c("red", "orange"), bty="n")
fit$trend = new.trend
dec = decomposeVar(fit=fit)
top.dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top.dec)[1:10])
sce = denoisePCA(sce, technical=new.trend, approx=TRUE)
plot(log10(attr(reducedDim(sce), "percentVar")), xlab="PC",
ylab="log10(Prop of variance explained)", pch=20, cex=0.6,
col=rgb(0.8, 0.2, 0.2, 0.5))
abline(v=ncol(reducedDim(sce, "PCA")), lty=2, col="red")
df_pcs = data.frame(reducedDim(sce, "PCA"))
df_pcs$log10_total_features = colData(sce)$log10_total_features
gp1 = ggplot(df_pcs, aes(PC1,PC2,col=log10_total_features)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1
sce = runTSNE(sce, use_dimred="PCA", perplexity=30, rand_seed=100)
df_tsne = data.frame(reducedDim(sce, "TSNE"))
df_tsne$log10_total_features = colData(sce)$log10_total_features
gp1 = ggplot(df_tsne, aes(X1,X2,col=log10_total_features)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1
library(svd)
library(Rtsne)
dec1 = dec
dec1$bio[which(dec$bio < 1e-8)] = 1e-8
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100
par(mfrow=c(1,2))
hist(log10(dec1$bio), breaks=100, main="")
hist(log10(dec1$FDR), breaks=100, main="")
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.01)
sce_sub = sce[w2kp,]
edat = t(as.matrix(logcounts(sce_sub)))
edat = scale(edat)
ppk = propack.svd(edat,neig=50)
pca = t(ppk$d*t(ppk$u))
df_pcs = data.frame(pca)
df_pcs$log10_total_features = colData(sce_sub)$log10_total_features
df_pcs = data.frame(pca)
df_pcs$log10_total_features = colData(sce_sub)$log10_total_features
gp1 = ggplot(df_pcs, aes(X1,X2,col=log10_total_features)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1
colnames(colData(sce_sub))
set.seed(100)
tsne = Rtsne(pca, pca = FALSE)
df_tsne = data.frame(tsne$Y)
df_tsne$log10_total_features_by_counts = colData(sce_sub)$log10_total_features_by_counts
gp1 = ggplot(df_tsne, aes(X1,X2,col=log10_total_features_by_counts)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
scale_colour_gradient(low="lightblue",high="red") +
guides(color = guide_legend(override.aes = list(size=3)))
gp1
reducedDims(sce_sub) = SimpleList(PCA=pca, TSNE=tsne$Y)
k10_50_pcs = kmeans(reducedDim(sce_sub, "PCA"), centers=10,
iter.max=150, algorithm="MacQueen")
df_tsne$cluster_kmean = as.factor(k10_50_pcs$cluster)
cols = c("#FB9A99","#FF7F00","yellow","orchid","grey",
"red","dodgerblue2","tan4","green4","#99c9fb")
gp1 = ggplot(df_tsne, aes(X1,X2,col=cluster_kmean)) +
geom_point(size=0.2,alpha=0.6) + theme_classic() +
scale_color_manual(values=cols) +
guides(color = guide_legend(override.aes = list(size=3)))
gp1
snn.gr = buildSNNGraph(sce_sub, use.dimred="PCA")
clusters = igraph::cluster_walktrap(snn.gr)
参考资料
[01]. 第六章 scRNA-seq数据分析 Chapter 6: single cell RNA-seq analysis
[02]. Conquer-对单细胞数据差异表达分析的重新审视
[04]. Single Cell RNA-seq Analysis 学习记录(一)原理理解
[05]. Getting Started with Seurat
[06]. 【Analysis of single cell RNA-seq data】
[07]. singleCelSeq
[08]. Analysis of single cell RNA-seq data
[09]. 【A workflow for single cell RNA-seq data analysis】
[11]. An introduction to the SingleCellExperiment class
[12]. 单细胞转录组3大R包之monocle2
[13]. 单细胞转录组3大R包之Seurat
Comments