散发性双侧肾透明细胞癌细胞分子特征的scRNA-seq鉴定
考虑到咱们生信技能树粉丝对单细胞数据挖掘的需求,我开通了一个专栏《100个单细胞转录组数据降维聚类分群图表复现》,也亲自示范了几个,不过自己带娃,读博,时间精力有限,所以把剩余的90多个任务安排了学徒,实习生,学员。真的是太棒了,群策群力!
今天分享的文章是2021年6月 发表在《Front. Oncol 》杂志的《Single-Cell RNA-seq Identification of the Cellular Molecular Characteristics of Sporadic Bilateral Clear Cell Renal Cell Carcinoma》文章链接是:https://www.frontiersin.org/articles/10.3389/fonc.2021.659251/full
该研究揭示双侧肾细胞癌的大多数细胞类型之间的基因表达高度相似,但不同部位肿瘤细胞之间基因表达的显着差异。
一共两个10x的单细胞转录组样品
GSM5222644 ccRCC1
GSM5222645 ccRCC2
提供了10x的标准数据文件:
本次要复现的文献图表如下所示:
是单细胞转录组数据分析的标准降维聚类分群,进行生物学注释后的结果,并统计细胞比例。
下面我们开始复现:
对每个样本创建一个在运行环境,并创建一个outputs
的文件夹,里面包含对应的三个10x文件,需要改成标准的文件名:
ccRCC1
step1:读取表达矩阵文件,然后构建Seurat对象
构建Seurat对象后保存为 sce.all.filt.Rdata
文件,供后续使用
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
## =============1.Load dataset
#1.1 读入数据,文件命名只剩下三个文件名类别和后缀
sce= Read10X(data.dir = './outputs')
sce=CreateSeuratObject(counts = sce,
project = "ccRCC1")
#计算线粒体基因比例
sce=PercentageFeatureSet(sce, "^MT-", col.name = "percent_mito")
#计算核糖体基因比例
sce=PercentageFeatureSet(sce, "^RP[SL]", col.name = "percent_ribo")
#计算红血细胞基因比例
sce=PercentageFeatureSet(sce, "^HB[^(P)]", col.name = "percent_hb")
#根据上述指标,过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce, expression = nFeature_RNA > 300)
selected_f <- rownames(sce)[Matrix::rowSums(sce@assays$RNA@counts > 0 ) > 3]
sce.all.filt <- subset(sce, features = selected_f, cells = selected_c)
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 20)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
save(sce.all.filt,file = "sce.all.filt.Rdata")
step2: 降维聚类分群
sce.all=sce.all.filt
sce.all = NormalizeData(sce.all)
sce.all <- FindVariableFeatures(sce.all, nfeatures = 3000)
# 默认后续使用 HVGs 进行 特征寻找
sce.all = ScaleData(sce.all
#,vars.to.regress = c("nFeature_RNA", "percent_mito")
) #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce.all <- RunPCA(sce.all, npcs = 30)
sce.all <- RunTSNE(sce.all, dims = 1:30)
sce.all <- RunUMAP(sce.all, dims = 1:30)
# 聚类分析
sce.all <- FindNeighbors(sce.all, dims = 1:30)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
sce.all <- FindClusters(sce.all, graph.name = "RNA_snn", resolution = res, algorithm = 1)
}
clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
#选择分辨率为0.3进行后续分析
sel.clust = "RNA_snn_res.0.3"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident)
#首先进行分群结果可视化
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.3", label = T)& NoAxes()
step3:细胞亚群的生物学注释
我们根据文章提供的marker gene 进行注释
Marker2=c("UBE2C","TOP2A","MKI67","KIT","TPSB2","TPSAB1","CTLA4",
"MS4A1","CD79A","PLVAP","FCGR3A","PTPRB","KDR","CDH5",
"PECAM1","CD14","S100A8","LYZ","NDUFA4L2","CA9","ACTA2",
"TAGLN","PDGFRB","COL1A1","NKG7","GNLY","KLRD1","KLRB1",
"MSR1","GPNMB","SLC40A1","CD163","CD68","IL7R","CD8B",
"CD8A","CD3E","CD3D")
DotPlot(sce.all, features = Marker2,assay='RNA',group.by = "RNA_snn_res.0.3")+
theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8))
celltype=data.frame(ClusterID=0:14,
celltype='unkown')
celltype[celltype$ClusterID %in% c(0),2]='Tumor cells'
celltype[celltype$ClusterID %in% c(1),2]='CD8+ T cells'
celltype[celltype$ClusterID %in% c(2),2]='TAM'
celltype[celltype$ClusterID %in% c(3),2]='CD4+ T cells'
celltype[celltype$ClusterID %in% c(4),2]='CAF'
celltype[celltype$ClusterID %in% c(5),2]='NK cells'
celltype[celltype$ClusterID %in% c(6,11),2]='Endothelial cells 1'
celltype[celltype$ClusterID %in% c(7),2]='NK-T cells'
celltype[celltype$ClusterID %in% c(8),2]='Monocytes'
celltype[celltype$ClusterID %in% c(9),2]='Exhausted T cells'
celltype[celltype$ClusterID %in% c(10),2]='Mast cells'
celltype[celltype$ClusterID %in% c(12),2]='B cells'
celltype[celltype$ClusterID %in% c(13),2]='Proliferative CD8+ T cells'
celltype[celltype$ClusterID %in% c(14),2]='Endothelial cells 2 '
celltype
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.3 == celltype$ClusterID[i])
,'celltype'] <- celltype$celltype[i]}
DimPlot(sce.all, reduction = "umap", group.by = "celltype", label = T,pt.size = 1.5)
step4:查看细胞比例
tb=as.data.frame(table(sce.all@meta.data$celltype),stringsAsFactors = F)
tb=tb[order(tb$Freq,decreasing=F),]
colnames(tb) <- c('Celltype', 'Freq')
percentage <- scales::percent(tb$Freq / sum(tb$Freq))
percentage
labs <- paste0(tb$Celltype,'(', percentage, ')')#设置标签名
labs
tb$cell=factor(tb$Celltype,levels=tb$Celltype)
p.pie = ggplot(tb, aes(x = "", y = Freq, fill = cell)) +
geom_bar(stat = "identity",width = 2,colour="black") +
scale_fill_discrete(breaks = tb$cell, labels = labs) +
labs(x = "", y = "", title = "") +
coord_polar(theta = "y") +
theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)+ scale_fill_manual(values = getPalette(colourCount))
p.pie
ccRCC2
对于ccRCC2数据的过滤、降维的代码与ccRCC1一致,区别在于后面的分群以及细胞生物学注释
# 聚类分析
sce.all <- FindNeighbors(sce.all, dims = 1:30)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
sce.all <- FindClusters(sce.all, graph.name = "RNA_snn", resolution = res, algorithm = 1)
}
clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
#选择分辨率为0.2进行后续分析
sel.clust = "RNA_snn_res.0.2"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident)
#首先进行分群结果可视化
DimPlot(sce.all, reduction = "umap", group.by = "RNA_snn_res.0.2", label = T,pt.size = 2)& NoAxes()
#根据文中提供的marker gene
Marker=c("VCAM1","ACKR1","CP","CD14","S100A8","LYZ",
"MS4A1","CD79A","KIT","TPSB2","TPSAB1",
"ACTA2","TAGLN","PDGFRB","COL1A1","SLC40A1",
"CD163","CD68","PLVAP","CLEC14A","KDR",
"CDH5","PECAM1",'NKG7','GNLY','KLRD1',
'KLRB1',"NDUFA4L2",'CA9',"CD8B",'CD8A',
"IL7R","CD3E","CD3D")
DotPlot(sce.all, features = Marker,assay='RNA',group.by = "RNA_snn_res.0.2")+
theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8))
#与文中的图标进行比较对每个cluster进行命名
celltype=data.frame(ClusterID=0:13,
celltype='unkown')
celltype[celltype$ClusterID %in% c(0),2]='CD4+ T cells'
celltype[celltype$ClusterID %in% c(1),2]='Endothelial cells 1'
celltype[celltype$ClusterID %in% c(2),2]='Tumor cells 1'
celltype[celltype$ClusterID %in% c(3),2]='NK cells'
celltype[celltype$ClusterID %in% c(4),2]='C8+ T cells'
celltype[celltype$ClusterID %in% c(5),2]='TAM1'
celltype[celltype$ClusterID %in% c(6),2]='Endothelial cells 2'
celltype[celltype$ClusterID %in% c(7),2]='CAF'
celltype[celltype$ClusterID %in% c(8),2]='Tumor cells 2'
celltype[celltype$ClusterID %in% c(9),2]='TAM2'
celltype[celltype$ClusterID %in% c(10),2]='B cells'
celltype[celltype$ClusterID %in% c(11),2]='Mast cells'
celltype[celltype$ClusterID %in% c(12),2]='Endothelial cells 2'
celltype
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.2 == celltype$ClusterID[i])
,'celltype'] <- celltype$celltype[i]}
DimPlot(sce.all, reduction = "umap", group.by = "celltype", label = T,pt.size = 1.5)
tb=as.data.frame(table(sce.all@meta.data$celltype),stringsAsFactors = F)
tb=tb[order(tb$Freq,decreasing=F),]
colnames(tb) <- c('Celltype', 'Freq')
percentage <- scales::percent(tb$Freq / sum(tb$Freq))
percentage
labs <- paste0(tb$Celltype,'(', percentage, ')')#设置标签名
labs
tb$cell=factor(tb$Celltype,levels=tb$Celltype)
p.pie = ggplot(tb, aes(x = "", y = Freq, fill = cell)) +
geom_bar(stat = "identity",width = 1,colour="black") +
scale_fill_discrete(breaks = tb$cell, labels = labs) +
labs(x = "", y = "", title = "") +
coord_polar(theta = "y") +
theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold")
)
p.pie
虽然说上面的代码都是复制粘贴即可运行,但是如果要更好地完成上面的图表,通常是需要掌握5个R包,分别是: scater,monocle,Seurat,scran,M3Drop,需要熟练掌握它们的对象,:一些单细胞转录组R包的对象 而且分析流程也大同小异:
step1: 创建对象 step2: 质量控制 step3: 表达量的标准化和归一化 step4: 去除干扰因素(多个样本整合) step5: 判断重要的基因 step6: 多种降维算法 step7: 可视化降维结果 step8: 多种聚类算法 step9: 聚类后找每个细胞亚群的标志基因 step10: 继续分类
单细胞转录组数据分析的标准降维聚类分群,并且进行生物学注释后的结果。可以参考前面的例子:人人都能学会的单细胞聚类分群注释 ,我们演示了第一层次的分群。
如果你对单细胞数据分析还没有基础认知,可以看基础10讲:
写在文末
咱们现在这个专栏《100个单细胞转录组数据降维聚类分群图表复现》分享的代码是到此为止,但是一般来说单细胞文章数据分析还有很多进阶图表制作,比如inferCNV看肿瘤拷贝数变异,monocle看拟时序等等。如果你也需要,可以加入我们这个专栏《100个单细胞转录组数据降维聚类分群图表复现》创作团队,获取进阶指引哦!见:急!计划招募100个单细胞爱好者,免费学全套单细胞降维聚类分群和生物学亚群注释