送你一篇TCGA数据挖掘文章
下载样本信息
首先从UCSC Xena数据库下载样本信息
UCSC Xena网址:https://xena.ucsc.edu/public-hubs/
根据上图的链接可以下载到样本信息,文件为
“TCGA-BRCA.GDC_phenotype.tsv.gz”
通过上图得到的三个重要信息:
数据下载链接
样本数:1217
count值计算方法:log2(count+1)
下载好样本信息及表达矩阵的数据之后,我们就可以开始处理数据了。
挑选出TNBC样本
首先根据样本信息找到三阴性乳腺癌的样本。
载入下载好的样本信息文件
phenotype_file <- read.table('TCGA-BRCA.GDC_phenotype.tsv.gz',header = T,sep = '\t',quote = '')
## 检查一下表头,其实Xena上有两个样本信息的文件,选择'TCGA-BRCA.GDC_phenotype.tsv.gz'的原因就在于另一个样本信息文件所包含的内容过少。大家可以下载下来看一下。
(phenotype_colnames <- asN.data.frame(colnames(phenotype_file)))
## 三阴性乳腺癌的患者不表达ER,PR,Her2,所以先检查一下样本信息中的这三列
table(phenotype_file$breast_carcinoma_estrogen_receptor_status) ## 雌激素受体(ER)
table(phenotype_file$breast_carcinoma_progesterone_receptor_status) ## 黄体酮受体(PR)
table(phenotype_file$lab_proc_her2_neu_immunohistochemistry_receptor_status) ## 人类表皮生长因子受体(HER2)
## 取出含有'receptor_status'信息的列
colnames_num <- grep('receptor_status',colnames(phenotype_file))
phenotype_colnames <- colnames(phenotype_file)[colnames_num]
eph <- phenotype_file[,colnames_num[1:3]]
## 之后的代码用到apply函数,会用跳过这部分就好
## apply函数需要三个参数,第一个参数是matrix
## 第二个参数如果是1,说明是按行取;第二个参数如果是2,说明是按列取
## 第三个参数是方法
## example
## > x <- cbind(x1 = 3, x2 = c(4:1))
## > dimnames(x)[[1]] <- letters[1:4]
## > x
## x1 x2
## a 3 4
## b 3 3
## c 3 2
## d 3 1
## > apply(x, 2, function(x) sum(x =='3'))
## x1 x2
## 4 1
## > apply(x, 2, function(x) sum(x =='3'))
## x1 x2
## 4 1
## 根据ER,PR,HER2将样本分组
tnbc_rownum <- apply(eph, 1, function(x) sum(x =='Negative'))
tnbc_sample <- phenotype_file[tnbc_rownum == 3, 1]
save(tnbc_s,file = 'tnbc_sample.Rdata')
到这里,我们就从1217个样本中挑出了118个tnbc样本,接下来就可以用在表达矩阵中取出这些样本了
从Xena下载到的矩阵不是可以直接用的,我们要先把它处理一下
library(data.table)
a=fread('TCGA-BRCA.htseq_counts.tsv',sep = '\t',header = T)
a=as.data.frame(a)
a[1:4,1:4]
rownames(a)=a[,1]
a=a[,-1]
genes=rownames(a)
a[1:4,1:4]
## 在数据的介绍页面上我们已经得知了数据的计算方法现在我们只要把它还原回去就可以了
a=2^a-1
a[1:4,1:4]
## 接下来我们就要取出表达数据,但是我们只要这118个tnbc样本中成对的样本,## 即,同一个tnbc病人既有正常样本,又有肿瘤样本的表达信息
load('tnbc_s.Rdata')
tnbc_p=substring(tnbc_s,1,12)
all_p=substring(colnames(a),1,12)
paired_p=names(table(all_p)[table(all_p)==2])
## 这样挑选过后,符合要求的样本就只有9个了
need_p=intersect(tnbc_p,paired_p)
exprSet=a[,all_p %in% need_p]
tmp=apply(exprSet,1,function(x){
sum(x==0) < 10
})
exprSet=exprSet[tmp,]
save(exprSet,file = 'tnbc_paired_exprSet.Rdata')
差异分析
## 接下来做差异分析
Rdata_dir='.'
load( file =
file.path(Rdata_dir,'tnbc_paired_exprSet.Rdata')
)
dim(exprSet)
## 根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果
group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'tumor','normal'))
table(group_list)
## 对表达差异结果数据取整
exprSet=ceiling(exprSet)
这个时候exprSet结果就可以进行差异分析了
## 方法一:DESeq2
if(T){
library(DESeq2)
(colData <- data.frame(row.names=colnames(exprSet),
group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ group_list)
tmp_f=file.path(Rdata_dir,'TCGA-KIRC-miRNA-DESeq2-dds.Rdata')
if(!file.exists(tmp_f)){
dds <- DESeq(dds)
save(dds,file = tmp_f)
}
load(file = tmp_f)
res <- results(dds,
contrast=c("group_list","tumor","normal"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG =as.data.frame(resOrdered)
DESeq2_DEG = na.omit(DEG)
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 方法二:edgeR
if(T){
library(edgeR)
d <- DGEList(counts=exprSet,group=factor(group_list))
keep <- rowSums(cpm(d)>1) >= 2
table(keep)
d <- d[keep, , keep.lib.sizes=FALSE]
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
d$samples
dge=d
design <- model.matrix(~0+factor(group_list))
rownames(design)<-colnames(dge)
colnames(design)<-levels(factor(group_list))
dge=d
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
# https://www.biostars.org/p/110861/
lrt <- glmLRT(fit, contrast=c(-1,1))
nrDEG=topTags(lrt, n=nrow(dge))
nrDEG=as.data.frame(nrDEG)
head(nrDEG)
edgeR_DEG =nrDEG
nrDEG=edgeR_DEG[,c(1,5)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 方法三:limma
if(T){
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log=TRUE, prior.count=3)
v <- voom(dge,design,plot=TRUE, normalize="quantile")
fit <- lmFit(v, design)
group_list
cont.matrix=makeContrasts(contrasts=c('tumor-normal'),levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
tempOutput = topTable(fit2, coef='tumor-normal', n=Inf)
DEG_limma_voom = na.omit(tempOutput)
head(DEG_limma_voom)
nrDEG=DEG_limma_voom[,c(1,4)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
}
## 比较一下这三个差异分析的结果
nrDEG1=DEG_limma_voom[,c(1,4)]
colnames(nrDEG1)=c('log2FoldChange','pvalue')
nrDEG2=edgeR_DEG[,c(1,5)]
colnames(nrDEG2)=c('log2FoldChange','pvalue')
nrDEG3=DESeq2_DEG[,c(2,6)]
colnames(nrDEG3)=c('log2FoldChange','pvalue')
mi=unique(c(rownames(nrDEG1),rownames(nrDEG1),rownames(nrDEG1)))
lf=data.frame(limma=nrDEG1[mi,1],
edgeR=nrDEG2[mi,1],
DESeq2=nrDEG3[mi,1])
cor(na.omit(lf))
## limma edgeR DESeq2
## limma 1.0000000 0.9134287 0.9402436
## edgeR 0.9134287 1.0000000 0.9568595
## DESeq2 0.9402436 0.9568595 1.0000000
注释
library( "clusterProfiler" )
library( "org.Hs.eg.db" )
nrDEG = DEG_limma_voom
colnames(DESeq2_DEG)[2] = 'logFC'
colnames(DESeq2_DEG)[5] = 'P.Value'
nrDEG = DESeq2_DEG
colnames(edgeR_DEG)[1] = 'logFC'
colnames(edgeR_DEG)[4] = 'P.Value'
nrDEG = edgeR_DEG
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )
logFC_cutoff
logFC_cutoff = 1.2
nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05 & abs(nrDEG$logFC) > logFC_cutoff,
ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )
table(nrDEG$change)
keytypes(org.Hs.eg.db)
library("stringr")
rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start = 1, end = 15)
nrDEG$ENSEMBL <- rownames( nrDEG )
df <- bitr( rownames( nrDEG ), fromType = "ENSEMBL", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )
head( df )
{
nrDEG$SYMBOL = rownames( nrDEG )
nrDEG = merge( nrDEG, df, by='ENSEMBL' )
}
head( nrDEG )
{
gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ]
gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
}
{
geneList = nrDEG$logFC
names( geneList ) = nrDEG$ENTREZID
geneList = sort( geneList, decreasing = T )
}
{
## KEGG pathway analysis
kk.up <- enrichKEGG( gene = gene_up ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.8 ,
qvalueCutoff = 0.8 )
kk.down <- enrichKEGG( gene = gene_down ,
organism = 'hsa' ,
universe = gene_all ,
pvalueCutoff = 0.05 )
}
head( kk.up )[ ,1:6 ]
head( kk.down )[ ,1:6 ]
library( "ggplot2" )
{
kegg_down_dt <- as.data.frame( kk.down )
kegg_up_dt <- as.data.frame( kk.up )
down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
down_kegg$group = -1
up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
up_kegg$group = 1
dat = rbind( up_kegg, down_kegg )
dat$pvalue = -log10( dat$pvalue )
dat$pvalue = dat$pvalue * dat$group
dat = dat[ order( dat$pvalue, decreasing = F ), ]
g_kegg <- ggplot( dat,
aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) +
geom_bar( stat = "identity" ) +
scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) +
scale_x_discrete( name = "Pathway names" ) +
scale_y_continuous( name = "log10P-value" ) +
coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +
ggtitle( "Pathway Enrichment" )
print( g_kegg )
ggsave( g_kegg, filename = 'kegg_up_down_edgeR.png' )
}
KEGG通路几乎没有差异,说明三个方法找到的差异基因很一致
# 挑选显著表达差异的基因热图可视化看看效果
load(file = 'tnbc_paired_exprSet.Rdata')
load('TCGA-BRCA-mRNA-DEG_results.Rdata')
exprSet[1:4,1:4]
exprSet=log2(edgeR::cpm(exprSet)+1)
pheatmap::pheatmap(exprSet[rownames(head(DESeq2_DEG)),])
## 挑选指定基因看它在同一个病人的配对表达情况
dim(exprSet)
group_list=factor(ifelse(as.numeric(substr(colnames(exprSet),14,15)) < 10,'tumor','normal'))
table(group_list)
library(ggpubr)
dat=data.frame(g=group_list,
p=colnames(exprSet),
v=as.numeric(exprSet[rownames(DESeq2_DEG[1,]),]))
ggpaired(dat, x = "g", y = "v",
color = "g", line.color = "gray", line.size = 0.4,
palette = "npg")+stat_compare_means()
第一个基因相当有差异性
■ ■ ■