单细胞水平解析肺间质细胞在发育和纤维化过程中的分类 / 四六文摘

考虑到咱们生信技能树粉丝对单细胞数据挖掘的需求，我开通了一个专栏《100个单细胞转录组数据降维聚类分群图表复现》，也亲自示范了几个，不过自己带娃，读博，时间精力有限，所以把剩余的90多个任务安排了学徒，实习生，学员。真的是太棒了，群策群力！

下面七月份学徒的投稿

今天介绍的文章是2021年6月发表在iScience杂志的《Categorization of lung mesenchymal cells in development and fibrosis》

文章链接是：https://www.sciencedirect.com/science/article/pii/S2589004221005198

文章的研究主要内容是：对肺间质亚型发育和纤维化的综合分析，在发育和纤维化过程中识别新的肺间充质细胞亚型，描述新的具有鉴别性和一致性的间充质亚型标记。

这篇文章包含的数据集很多，本次要复现的是文章的F1的ABC三张图。

是单细胞转录组数据分析的标准降维聚类分群，并且进行生物学注释，再提取mesenchymal细胞进行细分亚群。

使用的数据集是 GSE156329 ：

E17.5 mouse lungs, GSE156329

GSM4729129 E17.5 WT1 scRNA-seq GSM4729130 E17.5 WT2 scRNA-seq GSM4729131 E17.5 WT3 scRNA-seq

下载如下数据

下面我们开始复现

在运行环境下创建一个GSE156329文件夹，里面包含下面3个文件：

step1：读取表达矩阵文件，然后构建Seurat对象

rm(list=ls()) options(stringsAsFactors = F) library(Seurat) library(ggplot2) library(clustree) library(cowplot) library(dplyr) library(data.table) samples=list.files('GSE156329/') samples sceList = lapply(samples,function(x){ print(x) y=file.path('./GSE156329/',x ) a=fread(y,data.table = F) a[1:4,1:4] a = a[!duplicated(a$symbol),] rownames(a)=a[,2] a=a[,-c(1:2)] sce=CreateSeuratObject(a,project = substr(x,12,14)) return(sce) }) sce.all=merge(x=sceList[[1]], y=sceList[ -1 ]) #计算线粒体基因比例 sce.all=PercentageFeatureSet(sce.all, "^mt-", col.name = "percent_mito") #计算核糖体基因比例 sce.all=PercentageFeatureSet(sce.all, "^Rp[sl]", col.name = "percent_ribo") #计算红血细胞基因比例 sce.all=PercentageFeatureSet(sce.all, "^Hb[^(p)]", col.name = "percent_hb") #根据上述指标，过滤低质量细胞/基因 #过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因 selected_c <- WhichCells(sce.all, expression = nFeature_RNA > 300) selected_f <- rownames(sce.all)[Matrix::rowSums(sce.all@assays$RNA@counts > 0 ) > 3] sce.all.filt <- subset(sce.all, features = selected_f, cells = selected_c) #过滤指标2:线粒体/核糖体基因比例(根据上面的violin图) selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 20) selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3) selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1) sce.all.filt <- subset(sce.all.filt, cells = selected_mito) sce.all.filt <- subset(sce.all.filt, cells = selected_ribo) sce.all.filt <- subset(sce.all.filt, cells = selected_hb) save(sce.all.filt,file = "sce.all.filt.Rdata")

step2：降维聚类分群

sce.all=sce.all.filt sce.all = NormalizeData(sce.all) sce.all <- FindVariableFeatures(sce.all) # 默认后续使用 HVGs 进行特征寻找 sce.all = ScaleData(sce.all #,vars.to.regress = c("nFeature_RNA", "percent_mito") ) #加上vars.to.regress参数后运行so slow!!! # 降维处理 sce.all <- RunPCA(sce.all, npcs = 30) sce.all <- RunTSNE(sce.all, dims = 1:30) sce.all <- RunUMAP(sce.all, dims = 1:30) # 聚类分析 sce.all <- FindNeighbors(sce.all, dims = 1:30) for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { sce.all <- FindClusters(sce.all, graph.name = "RNA_snn", resolution = res, algorithm = 1) } clustree(sce.all@meta.data, prefix = "RNA_snn_res.")

p1 <- DimPlot(sce.all, reduction = "umap", group.by = "orig.ident", label = T, label.box = T)

下面我们对数据进行整合

######整合#### #拆分为个seurat子对象 sce.all.list <- SplitObject(sce.all.filt, split.by = "orig.ident") #数据整合之前要对每个样本的seurat对象进行数据标准化和选择高变基因 for (i in 1:length(sce.all.list)) { print(i) sce.all.list[[i]] <- NormalizeData(sce.all.list[[i]], verbose = FALSE) sce.all.list[[i]] <- FindVariableFeatures(sce.all.list[[i]], selection.method = "vst", nfeatures = 2000, verbose = FALSE) } ###以VariableFeatures为基础寻找锚点，运行时间较长 alldata.anchors <- FindIntegrationAnchors(object.list = sce.all.list, dims = 1:30, reduction = "cca") ##利用锚点整合数据，运行时间较长 sce.all.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA") sce.all.int=ScaleData(sce.all.int) sce.all.int=RunPCA(sce.all.int, npcs = 30) sce.all.int=RunTSNE(sce.all.int, dims = 1:30) sce.all.int=RunUMAP(sce.all.int, dims = 1:30) #查看整合前后数据 plot_grid(ncol = 3, DimPlot(sce.all, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA raw_data"), DimPlot(sce.all, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE raw_data"), DimPlot(sce.all, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP raw_data"), DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"), DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"), DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated") )

##整合前后区别不大，这里我们使用整合后的数据 # 聚类分析 sce.all.int=FindNeighbors(sce.all.int, dims = 1:30, k.param = 60, prune.SNN = 1/15) #设置不同的分辨率，观察分群效果 for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { sce.all.int=FindClusters(sce.all.int, graph.name = "CCA_snn", resolution = res, algorithm = 1) } apply(sce.all.int@meta.data[,grep("CCA_snn_res",colnames(sce.all.int@meta.data))],2,table) clustree(sce.all.int@meta.data, prefix = "CCA_snn_res.")

#选取0.3进行后续分析 DimPlot(sce.all.int, reduction = "tsne", group.by = "CCA_snn_res.0.3", label = T,label.box = T)

step3：细胞注释

根据文献，首先区分成为4个大的亚群：

#根据marker genne进行第一次注释 first_check=list( Mes=c("Ter119","Tagln","Scgb1a1","Lgr6","Lgr5","Actc1","Acta2","Cd34","Cd31","Cd11b"), Imm=c("AI607873", "Alox5ap", "Apobec1", "Apoe", "Arhgdib", "Bcl2a1d"), Epi=c("Epcam","Krt19","Cd24","Prom1","Aladh1a1","Gsta4","Lmo7","Ppap2c"), Endo=c("Ets1","Kdr","Cdh5","Pecam1","Gata2","Clec14a") ) first_check DotPlot(sce.all.int,features = first_check,assay='RNA',group.by = "CCA_snn_res.0.3") + theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8))

celltype=data.frame(ClusterID=0:11, celltype='un') #定义细胞亚群 celltype[celltype$ClusterID %in% c(0),2]='Endo' celltype[celltype$ClusterID %in% c(2,6,7,10),2]='Mes' celltype[celltype$ClusterID %in% c(1,5,8,9),2]='Imm' celltype[celltype$ClusterID %in% c(3,4,11),2]='Epi' celltype sce.all.int@meta.data$celltype = "un" for(i in 1:nrow(celltype)){ sce.all.int@meta.data[which(sce.all.int@meta.data$CCA_snn_res.0.3 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]} table(sce.all.int@meta.data$celltype) p2 <- DimPlot(sce.all.int, reduction = "umap", group.by = "celltype", label = T,label.box = T)

###提取mes进行二次分群 cells.use <- row.names(sce.all.int@meta.data)[which(sce.all.int@meta.data$celltype=='Mes')] length(cells.use) sce_second <-subset(sce.all.int, cells=cells.use) sce_second ##第二次分群 DefaultAssay(sce_second) <- "RNA" sce_second <- NormalizeData(sce_second, normalization.method = "LogNormalize", scale.factor = 1e4) sce_second <- FindVariableFeatures(sce_second, selection.method = 'vst', nfeatures = 2000) sce_second <- ScaleData(sce_second) sce_second <- RunPCA(sce_second, features = VariableFeatures(object = sce_second)) sce_second <- RunPCA(object = sce_second, pc.genes = VariableFeatures(sce_second)) sce_second <- RunTSNE(sce_second, dims = 1:30) sce_second <- RunUMAP(sce_second, dims = 1:30) sce_second <- FindNeighbors(sce_second, dims = 1:15) for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { sce_second <- FindClusters(sce_second, graph.name = "RNA_snn", resolution = res, algorithm = 1) } clustree(sce_second@meta.data, prefix = "RNA_snn_res.")

DimPlot(sce_second, reduction = "umap", group.by = "RNA_snn_res.0.8", label = T, label.box = T,pt.size = 2)

#这里根据文章提示，分成6个细胞亚群 genes_to_check = list( lipo=c('Plin2', 'Tcf21','Fgf10','G0s2','Gyg','Macf1','Wnt2','Col13a1'),#lipofibroblasts myo=c('Acta2', 'Pdgfra','Tagln','Myh11','Tgfbi','Hhip','Enpp2','Wnt5a'),#myofibroblasts Ebf1=c('Higd1b','Cox4i2','Notch3','Ebf1','Gucy1a3','Pdzd2','Postn'),#Ebf1 inter=c('Agtr2','Prss35','Igfbp7','Ptn','Heyl','Fstl1','Tm4sf1'),#inter proli=c('Hmmr','Mki67','Top2a','Spc25','Cdca3','Ccnb2','Hist1h2ap'),#proliferating fibroblasts meso=c('Wt1','Upk3b', 'Msln','Upk1b','Aldh1a2','Cpe','Gm12840','Aqp1')#mesothelial cells ) library(stringr) DotPlot(sce_second,features = genes_to_check,assay='RNA',group.by = "RNA_snn_res.0.8") + theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8))

celltype=data.frame(ClusterID=0:11, celltype='un') #根据marker基因定义细胞亚群 celltype[celltype$ClusterID %in% c(0,2,3),2]='lipo' celltype[celltype$ClusterID %in% c(1,6,9),2]='Myo' celltype[celltype$ClusterID %in% c(4,11),2]='Ebf1' celltype[celltype$ClusterID %in% c(5,8),2]='Inter ' celltype[celltype$ClusterID %in% c(7,11),2]='Proli' celltype[celltype$ClusterID %in% c(10),2]='Meso' sce_second@meta.data$celltype = "un" for(i in 1:nrow(celltype)){ sce_second@meta.data[which(sce_second@meta.data$RNA_snn_res.0.8 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]} table(sce_second@meta.data$celltype) p3 <- DimPlot(sce_second, reduction = "umap", group.by = "celltype", label = T,pt.size = 2) p3

p1+p2+p3

虽然说上面的代码都是复制粘贴即可运行，但是如果要更好地完成上面的图表，通常是需要掌握5个R包，分别是: scater,monocle,Seurat,scran,M3Drop，需要熟练掌握它们的对象，：一些单细胞转录组R包的对象而且分析流程也大同小异：

step1: 创建对象
step2: 质量控制
step3: 表达量的标准化和归一化
step4: 去除干扰因素(多个样本整合)
step5: 判断重要的基因
step6: 多种降维算法
step7: 可视化降维结果
step8: 多种聚类算法
step9: 聚类后找每个细胞亚群的标志基因
step10: 继续分类

单细胞转录组数据分析的标准降维聚类分群，并且进行生物学注释后的结果。可以参考前面的例子：人人都能学会的单细胞聚类分群注释，我们演示了第一层次的分群。

如果你对单细胞数据分析还没有基础认知，可以看基础10讲：

写在文末

咱们现在这个专栏《100个单细胞转录组数据降维聚类分群图表复现》分享的代码是到此为止，但是一般来说单细胞文章数据分析还有很多进阶图表制作，比如inferCNV看肿瘤拷贝数变异，monocle看拟时序等等。如果你也需要，可以加入我们这个专栏《100个单细胞转录组数据降维聚类分群图表复现》创作团队，获取进阶指引哦！见：急！计划招募100个单细胞爱好者，免费学全套单细胞降维聚类分群和生物学亚群注释

单细胞水平解析肺间质细胞在发育和纤维化过程中的分类

下面我们开始复现

step1：读取表达矩阵文件，然后构建Seurat对象

step2：降维聚类分群

step3：细胞注释

写在文末

单细胞服务列表

相关推荐