硬着头皮往下走PCA|GSEA
一直在生信菜鸟团发布的学徒数据挖掘第二期目录如下:
本期数据挖掘任务来自于Paper:Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer
要重复的图表是:Figures:
其实,是很简单的处理:
先做差异分析,然后提取top1000的差异基因进行PCA,要表达的意思是:tumor和TCGA的tumor能聚在一起;最后对差异基因进行GSEA的KEGG富集。
有了其实,必有但是;作者上传的矩阵是辣个样子的,
a. GEOquery拿不到矩阵;
b. 网页下载的矩阵是DESeq2-normalized counts;
c. 英文不翻:DESeq2 doesn’t actually use normalized counts; rather, it uses the raw counts and models the normalization inside the Generalized Linear Model (GLM). These normalized counts will be useful for downstream visualization of results, but cannot be used as input to DESeq2 or any other tools that perform differential expression analysis using the negative binomial model.
纯代码:
Step1-download
# ---- General setup ----
rm(list = ls())                    # start from an empty workspace
options(stringsAsFactors = FALSE)  # keep strings as character, not factor

# ---- Data download ----
library(GEOquery)
cache_file <- 'GSE103990.Rdata'
# getGPL = TRUE would also fetch the platform annotation, but that makes the
# download much slower, and the annotation file format is mostly less
# convenient than the Bioconductor annotation packages, so it is disabled.
if (!file.exists(cache_file)) {
  gset <- getGEO(
    'GSE103990',
    destdir = '.',
    AnnotGPL = FALSE,
    getGPL = FALSE
  )
  save(gset, file = cache_file)
}
# Always reload from the cache so `gset` is defined on every run.
load(cache_file)

# Expression matrix and sample annotation of the first (only used) series
# element.
ex <- exprs(gset[[1]])
pd <- pData(gset[[1]])

# The count table is a supplementary file that has to be fetched separately:
#system('nohup wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz &')
counts_nor <- read.table('GSE103990_Normalized_counts.txt.gz')

# Persist the count table together with the sample annotation for later steps.
save(counts_nor, pd, file = 'Nor.Rdata')
Step2-差异分析(数据不合适,但代码木有问题)
# Step 2: differential expression (org vs tumor) with DESeq2.
# Inputs : Nor.Rdata (counts_nor, pd) written by Step 1.
# Outputs: DEG.Rdata with DESeq2_DEG (full result table, NA rows dropped) and
#          nrDEG (two columns: log2FoldChange, pvalue).
rm(list = ls())
options(stringsAsFactors = FALSE)
load('Nor.Rdata')

# Keep only the first 15 characters of each gene ID
# (presumably strips the Ensembl version suffix -- confirm against the file).
rownames(counts_nor) <- substr(rownames(counts_nor), 1, 15)

# DESeq2 requires integer counts, so the values are rounded down.
exprSet <- floor(counts_nor)

# Reorder the sample annotation to follow the columns of the count table.
# Fail loudly on unmatched samples: otherwise match() yields NA rows and those
# samples would silently be labelled 'tumor' below.
sample_idx <- match(colnames(exprSet), pd$description.1)
if (anyNA(sample_idx)) {
  stop(
    "Samples not found in pd$description.1: ",
    paste(colnames(exprSet)[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}
pd <- pd[sample_idx, ]
group_list <- ifelse(grepl('org', pd$title), 'org', 'tumor')

suppressMessages(library(DESeq2))

#### 1. Build the DESeq2 data set
colData <- data.frame(row.names = colnames(exprSet), group_list = group_list)
dds <- DESeqDataSetFromMatrix(
  countData = exprSet,
  colData = colData,
  design = ~ group_list
)

#### 2. Run the differential expression analysis
dds2 <- DESeq(dds)
res <- results(dds2, contrast = c("group_list", "org", "tumor"))
resOrdered <- res[order(res$padj), ]
DEG <- as.data.frame(resOrdered)
DESeq2_DEG <- na.omit(DEG)

# Select columns by name instead of by position (previously c(2, 6)), so a
# change in the result layout cannot silently pick the wrong statistic.
# NOTE: the significance column is the *adjusted* p-value (padj); it is stored
# under the name 'pvalue' because the later steps read nrDEG$pvalue.
nrDEG <- DESeq2_DEG[, c("log2FoldChange", "padj")]
colnames(nrDEG) <- c('log2FoldChange', 'pvalue')
save(nrDEG, DESeq2_DEG, file = "DEG.Rdata")
Step3-PCA
# Step 3: PCA of the paper samples together with the TCGA samples on the top
# differentially expressed genes.
# Inputs: Nor.Rdata (counts_nor, pd), BLC.Rdata (expr_BLC; not created in this
#         script), DEG.Rdata (nrDEG, DESeq2_DEG).
rm(list = ls())
library(edgeR)
load('Nor.Rdata')
load('BLC.Rdata')
load('DEG.Rdata')
rownames(counts_nor) <- substr(rownames(counts_nor), 1, 15)

# Nor.Rdata stores pd in its original order, so align it with the columns of
# counts_nor before deriving any per-sample label.
sample_idx <- match(colnames(counts_nor), pd$description.1)
if (anyNA(sample_idx)) {
  stop(
    "Samples not found in pd$description.1: ",
    paste(colnames(counts_nor)[is.na(sample_idx)], collapse = ", "),
    call. = FALSE
  )
}
pd <- pd[sample_idx, ]

# log-CPM for both data sets so they are on a comparable scale.
nor_BLCA <- edgeR::cpm(expr_BLC, log = TRUE)
nor_paper <- edgeR::cpm(floor(counts_nor), log = TRUE)
inter_gene <- intersect(rownames(nor_BLCA), rownames(nor_paper))

# Genes passing the effect-size / significance thresholds.
choose_gene <- rownames(nrDEG)[
  abs(nrDEG$log2FoldChange) > 1.5 & nrDEG$pvalue < 0.01
]

# Rank by decreasing |log2FC|; break ties by *increasing* p-value.
nrDEG <- nrDEG[order(-abs(nrDEG$log2FoldChange), nrDEG$pvalue), ]

# Keep ranked genes that pass the thresholds and exist in both data sets.
# head() (rather than [1:1000]) avoids NA entries when fewer than 1000 remain.
ranked_genes <- rownames(nrDEG)
candidates <- ranked_genes[
  ranked_genes %in% choose_gene & ranked_genes %in% inter_gene
]
final_choose <- head(candidates, 1000)
if (length(final_choose) == 0) {
  stop("No gene passes the thresholds in both data sets.", call. = FALSE)
}

nr_paper <- nor_paper[final_choose, , drop = FALSE]
nr_BLC <- nor_BLCA[final_choose, , drop = FALSE]
nr_pca <- cbind(nr_paper, nr_BLC)

# One label per column of nr_pca, built in the same order as the cbind():
# paper samples first (org / tumor from the aligned pd), then TCGA samples.
paper_group <- ifelse(grepl('org', pd$title), 'org', 'tumor')
group_list <- c(paper_group, rep('TCGA', ncol(nr_BLC)))
stopifnot(length(group_list) == ncol(nr_pca))

library("FactoMineR")
library("factoextra")
####
# PCA() expects samples in rows, hence the transpose.
df.pca <- PCA(t(nr_pca), graph = FALSE)
fviz_pca_ind(
  df.pca,
  geom.ind = "point",
  col.ind = group_list,
  addEllipses = FALSE,
  legend.title = "Groups"
)
Step4-GSEA
如果对GSEA中的细节感兴趣,看https://www.jianshu.com/p/be8fe1318850
# Step 4: GSEA against KEGG using the log2 fold changes from Step 2.
# Input: DEG.Rdata (nrDEG with columns log2FoldChange and pvalue).
rm(list = ls())
load('DEG.Rdata')
library(org.Hs.eg.db)
library(clusterProfiler)

# Map Ensembl gene IDs to Entrez IDs. bitr() drops IDs it cannot map and may
# return several rows for one input ID, so its output is NOT parallel to nrDEG.
gene <- bitr(
  rownames(nrDEG),
  fromType = "ENSEMBL",
  toType = "ENTREZID",
  OrgDb = org.Hs.eg.db
)
gene$logfc <- nrDEG$log2FoldChange[match(gene$ENSEMBL, rownames(nrDEG))]

# A ranked gene list needs unique names: keep the first row per Entrez ID.
gene <- gene[!duplicated(gene$ENTREZID), ]

# Build the ranked list from the mapped table so every value carries the name
# of its own gene (previously the names were assigned onto the unfiltered
# nrDEG vector, which misaligned IDs and fold changes).
geneList <- gene$logfc
names(geneList) <- gene$ENTREZID
geneList <- sort(geneList, decreasing = TRUE)
head(geneList)

kk_gse <- gseKEGG(
  geneList = geneList,
  organism = 'hsa',
  nPerm = 1000,
  minGSSize = 10,
  pvalueCutoff = 0.9,
  verbose = FALSE
)

# Pathways to plot.
paper_choose <- c('Cell adhesion molecules (CAMs)',
                  'Cell cycle',
                  'ErbB signaling pathway')

# Skip (with a warning) any pathway that is absent from the result instead of
# failing on an empty geneSetID.
kegg_res <- kk_gse@result
is_found <- paper_choose %in% kegg_res$Description
if (!all(is_found)) {
  warning(
    "Pathways not present in the GSEA result: ",
    paste(paper_choose[!is_found], collapse = "; "),
    call. = FALSE
  )
}

# NOTE(review): gseaplot2() is exported by the enrichplot package; if it is not
# found here, attach enrichplot -- confirm for the installed versions.
a <- lapply(paper_choose[is_found], function(pathway_name) {
  gseaplot2(
    kk_gse,
    geneSetID = kegg_res$ID[kegg_res$Description == pathway_name],
    pvalue_table = TRUE
  )
})
a
Results:
写在最后,其实PCA的目的是为了说明:肿瘤样本和TCGA的肿瘤样本可以聚在一起,而org样本单独聚在一起;
如果感兴趣的话,就自己从上游走一遍RNA-seq流程,再走一遍我这里R的代码;再如果,你重复出来了,一定一定一定要回复告诉我,我给你个小福利;
image.png
有趣的是,基本上没有重复出来,所以里面留有一个彩蛋!