硬着头皮往下走PCA|GSEA

一直在生信菜鸟团发布的学徒数据挖掘第二期目录如下:

因为阅读量爆棚,所以提升到生信技能树发布

本期数据挖掘任务来自于Paper:Tumor Evolution and Drug Response in Patient-Derived Organoid Models of Bladder Cancer

要重复的图表是:Figures:

PCA
ssGSEA
其实,是很简单的处理:
  • 差异分析后提取top1000进行PCA,要表达的意思是,tumor和TCGA的tumor能聚在一起;

  • 对差异基因进行GSEA的KEGG富集;

  • 有了其实,必有但是;作者上传的矩阵是辣个样子的,

  • a.GEOquery拿不到矩阵;

  • b.网页下载的矩阵是DESeq2-normalized counts

  • c.英文不翻之DESeq2 doesn’t actually use normalized counts, rather it uses the raw counts and models the normalization inside the Generalized Linear Model (GLM). These normalized counts will be useful for downstream visualization of results, but cannot be used as input to DESeq2 or any other tools that peform differential expression analysis which use the negative binomial model.

纯代码:

Step1-download

###一些常规的设置
rm(list = ls())#清空环境变量
options(stringsAsFactors = F)##字符不作为因子读入
#####数据下载
library(GEOquery)
h<-'GSE103990.Rdata'
####getGPL获得平台的注释信息,但下载速度会慢很多
####而且注释文件格式大多不如bioconductor包好用
if(!file.exists(h)){
  gset<-getGEO('GSE103990',destdir='.',
               AnnotGPL=F,
               getGPL=F)
  save(gset,file=h)
}
load('GSE103990.Rdata')
ex<- exprs(gset[[1]])
pd <- pData(gset[[1]])
#system('nohup wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz &')
counts_nor <- read.table('GSE103990_Normalized_counts.txt.gz')
save(counts_nor,pd,file='Nor.Rdata')

Step2-差异分析(数据不合适,但代码木有问题)

rm(list = ls())
options(stringsAsFactors = F)
load('Nor.Rdata')
rownames(counts_nor)<- substr(rownames(counts_nor),1,15)
exprSet<- floor(counts_nor)
pd <- pd[match(colnames(exprSet),pd$description.1),]
group_list <- ifelse(grepl('org',pd$title),'org','tumor')
suppressMessages(library(DESeq2))
#### 第一步,构建DESeq2的DESeq对象
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
                              design = ~ group_list)
#### 第二步,进行差异表达分析
dds2 <- DESeq(dds)
res <- results(dds2,contrast=c("group_list","org","tumor"))
resOrdered <- res[order(res$padj),]
DEG <- as.data.frame(resOrdered)
DESeq2_DEG = na.omit(DEG)
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')  
save(nrDEG, DESeq2_DEG, file = "DEG.Rdata")

Step3-PCA

rm(list=ls())
library(edgeR)
load('Nor.Rdata')
load('BLC.Rdata')
load('DEG.Rdata')
rownames(counts_nor)<- substr(rownames(counts_nor),1,15)
nor_BLCA <- edgeR::cpm(expr_BLC,log=T)
nor_paper <- edgeR::cpm(floor(counts_nor),log=T)
inter_gene<- intersect(rownames(nor_BLCA),rownames(nor_paper))
choose_gene <- rownames(nrDEG)[abs(nrDEG$log2FoldChange)>1.5&nrDEG$pvalue<0.01]
nrDEG <- nrDEG[order(abs(nrDEG$log2FoldChange),nrDEG$pvalue,decreasing = T),]
final_choose<- rownames(nrDEG)[rownames(nrDEG)%in%inter_gene][1:1000]
nr_paper <- nor_paper[final_choose,]
nr_BLC <- nor_BLCA[final_choose,]
nr_pca <- cbind(nr_paper,nr_BLC)
group_list <- ifelse(grepl('TCGA',colnames(nr_pca)),'TCGA',ifelse(grepl('org',pd$title),'org','tumor'))
library("FactoMineR")
library("factoextra") 
####
df.pca <- PCA(t(nr_pca), graph = FALSE)
fviz_pca_ind(df.pca,
             geom.ind = "point",
             col.ind = group_list, 
             addEllipses = F, 
             legend.title = "Groups")

Step4-GSEA

如果对GSEA中的细节感兴趣,看https://www.jianshu.com/p/be8fe1318850

rm(list=ls())
load('DEG.Rdata')
library(org.Hs.eg.db)
library(clusterProfiler)
gene <- bitr(rownames(nrDEG), fromType = "ENSEMBL",
                toType =  "ENTREZID",
                OrgDb = org.Hs.eg.db)
gene$logfc <- nrDEG$log2FoldChange[match(gene$ENSEMBL,rownames(nrDEG))]
geneList=nrDEG$log2FoldChange
names(geneList)=gene$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)
library(clusterProfiler)
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = 'hsa',
                  nPerm        = 1000,
                  minGSSize    = 10,
                  pvalueCutoff = 0.9,
                  verbose      = FALSE)
paper_choose <- c('Cell adhesion molecules (CAMs)',
                  'Cell cycle',
                  'ErbB signaling pathway')
a <- list()
for (i in 1:3){
a[[i]]<- gseaplot2(kk_gse, 
         geneSetID = kk_gse@result$ID[kk_gse@result$Description==paper_choose[i]],
         pvalue_table = T)
  }
a

Results:
  • 写在最后,其实PCA的目的是为了说明,肿瘤和TCGA的肿瘤可以聚在一起,而org聚在一起;

  • 如果感兴趣的话,就自己从上游走一遍RNA-seq流程,再走一遍我这里R的代码;再如果,你重复出来了,一定一定一定要回复告诉我,我给你个小福利;

    image.png
image.png
ErbB
Cell cycle

有趣的是,基本上没有重复出来,所以里面留有一个彩蛋!

(0)

相关推荐