GDCRNATools|ceRNA套路终结者Part1
欢迎来到医科研,这里是白介素2的读书笔记,跟我一起聊临床与科研的故事, 生物医学数据挖掘,R语言,TCGA、GEO数据挖掘。
1 GDCRNATools-TCGA数据挖掘
1.1 基本介绍
TCGA ceRNA网络构建神器-几乎要终结那个著名的ceRNA套路了
这是一个用户友好的R/Bioconductor软件包,可用于下载,组织和分析GDC中的RNA数据,亮点功能是能分析ceRNA调控网络。我们的软件包中使用了许多广泛使用的生物信息学工具和数据库。用户可以轻易的集成到自己的工作流程中。
1.2 GDCRNATools主要分析功能
基于limma,edgeR,DESeq2的差异分析
CoxPH and KM单因素生存分析
ceRNA网络分析
GO,KEGG,DO的功能富集分析
1.2.1 工作目录
1temp<-("D:/R/GDCRNATools")
2setwd(temp)
1.2.2 install pacakge
1## try http:// if https:// URLs are not supported
2if (!requireNamespace("BiocManager", quietly=TRUE))
3 install.packages("BiocManager")
4library(GDCRNATools)1##
1##
1## Registered S3 method overwritten by 'enrichplot':
2## method from
3## fortify.enrichResult DOSE1## ##############################################################################
2## Pathview is an open source software package distributed under GNU General
3## Public License version 3 (GPLv3). Details of GPLv3 is available at
4## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
5## formally cite the original Pathview paper (not just mention it) in publications
6## or products. For details, do citation("pathview") within R.
7##
8## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
9## license agreement (details at http://www.kegg.jp/kegg/legal.html).
10## ##############################################################################
1.3 简单案例分析-牛刀小试
1.3.1 count data 标准化
1library(DT)
2
3### load RNA counts data
4data(rnaCounts)
5rnaCounts[1:5,1:5]1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## ENSG00000003989 1520 960 2177
3## ENSG00000004799 7659 957 2295
4## ENSG00000005812 2246 1698 2454
5## ENSG00000007908 214 39 30
6## ENSG00000011105 5246 1664 4028
7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## ENSG00000003989 454 1211
9## ENSG00000004799 9393 908
10## ENSG00000005812 2377 1561
11## ENSG00000007908 237 51
12## ENSG00000011105 1310 19711dim(rnaCounts)
1## [1] 1000 45
1### load miRNAs counts data
2data(mirCounts)
3mirCounts[1:5,1:5]1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## hsa-let-7a-5p 165141 132094 210259
3## hsa-let-7a-3p 204 169 298
4## hsa-let-7a-2-3p 30 26 50
5## hsa-let-7b-5p 63905 38917 138022
6## hsa-let-7b-3p 50 60 138
7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## hsa-let-7a-5p 84663 117897
9## hsa-let-7a-3p 127 149
10## hsa-let-7a-2-3p 1 39
11## hsa-let-7b-5p 55664 101545
12## hsa-let-7b-3p 65 1251####### Normalization of RNAseq data #######
2rnaExpr <- gdcVoomNormalization(counts = rnaCounts, filter = FALSE)
3rnaExpr[1:5,1:5]1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## ENSG00000003989 10.435492 10.007138 10.990261
3## ENSG00000004799 12.768196 10.002625 11.066397
4## ENSG00000005812 10.998625 10.829542 11.163017
5## ENSG00000007908 7.609995 5.403277 4.832541
6## ENSG00000011105 12.222301 10.800370 11.877831
7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## ENSG00000003989 8.859192 10.094094
9## ENSG00000004799 13.228502 9.678858
10## ENSG00000005812 11.246285 10.460233
11## ENSG00000007908 7.922839 5.538016
12## ENSG00000011105 10.386957 10.7965941####### Normalization of miRNAs data #######
2mirExpr <- gdcVoomNormalization(counts = mirCounts, filter = FALSE)
3mirExpr[1:5,1:5]1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## hsa-let-7a-5p 14.781661 14.850358 15.066842
3## hsa-let-7a-3p 5.124275 5.244289 5.606618
4## hsa-let-7a-2-3p 2.379055 2.567068 3.043242
5## hsa-let-7b-5p 13.411970 13.087279 14.459575
6## hsa-let-7b-3p 3.106529 3.758011 4.498773
7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## hsa-let-7a-5p 13.937813 14.653520
9## hsa-let-7a-3p 4.562713 5.030348
10## hsa-let-7a-2-3p -1.846678 3.110128
11## hsa-let-7b-5p 13.332830 14.438114
12## hsa-let-7b-3p 3.601783 4.777890
1.3.2 解析metadata
1####### Parse and filter RNAseq metadata #######
2metaMatrix.RNA <- gdcParseMetadata(project.id = 'TCGA-CHOL',
3 data.type = 'RNAseq',
4 write.meta = FALSE)
5metaMatrix.RNA[1:5,1:5]1## # A tibble: 5 x 5
2## file_name file_id patient sample submitter_id
3## <chr> <chr> <chr> <chr> <chr>
4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~1colnames(metaMatrix.RNA)
1## [1] "file_name" "file_id"
2## [3] "patient" "sample"
3## [5] "submitter_id" "entity_submitter_id"
4## [7] "sample_type" "gender"
5## [9] "age_at_diagnosis" "tumor_stage"
6## [11] "tumor_grade" "days_to_death"
7## [13] "days_to_last_follow_up" "vital_status"
8## [15] "project_id"1## metadata过滤
2metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)1## Removed 0 samples
1metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)
1## Removed 0 samples
1metaMatrix.RNA[1:5,1:5]
1## # A tibble: 5 x 5
2## file_name file_id patient sample submitter_id
3## <chr> <chr> <chr> <chr> <chr>
4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~1table(metaMatrix.RNA$sample_type)
1##
2## PrimaryTumor SolidTissueNormal
3## 36 9
1.4 ceRNA网络分析
1.4.1 筛选差异mRNA,miRNA,lncRNA
1DEGAll <- gdcDEAnalysis(counts = rnaCounts, # rnacounts数据
2 group = metaMatrix.RNA$sample_type, # 分组
3 comparison = 'PrimaryTumor-SolidTissueNormal', # 比较
4 method = 'limma')
5## 展示所有基因
6DEGAll[1:5,1:5]1## # A tibble: 5 x 5
2## symbol group logFC AveExpr t
3## <fct> <fct> <dbl> <dbl> <dbl>
4## 1 NR1I3 protein_coding -6.92 7.02 -17.3
5## 2 ETFRF1 protein_coding -2.49 9.52 -16.1
6## 3 SOX5 protein_coding -4.87 6.23 -15.0
7## 4 ABCA8 protein_coding -5.65 7.52 -14.9
8## 5 ISOC1 protein_coding -2.37 10.5 -14.61dim(DEGAll)
1## [1] 565 8
1table(DEGAll$group)
1##
2## IG long_non_coding ncRNA protein_coding
3## 2 101 8 403
4## pseudogene TEC TR
5## 45 4 21range(DEGAll$logFC)
1## [1] -10.855558 5.919461
1### All DEGs
2deALL <- gdcDEReport(deg = DEGAll, gene.type = 'all')
3dim(deALL)1## [1] 283 8
1### DE long-noncoding
2deLNC <- gdcDEReport(deg = DEGAll, gene.type = 'long_non_coding')
3
4### DE protein coding genes
5dePC <- gdcDEReport(deg = DEGAll, gene.type = 'protein_coding')
1.4.2 构建差异基因ceRNA网络
1ceOutput <- gdcCEAnalysis(lnc = rownames(deLNC), # lncRNA
2 pc = rownames(dePC), # mRNA
3 lnc.targets = 'starBase',
4 pc.targets = 'starBase',
5 rna.expr = rnaExpr,
6 mir.expr = mirExpr)
71## Step 1/3: Hypergenometric test done !
1## Step 2/3: Correlation analysis done !
2## Step 3/3: Regulation pattern analysis done !1dim(ceOutput)
1## [1] 453 13
1ceOutput[1:5,1:13]
1## # A tibble: 5 x 13
2## lncRNAs Genes Counts listTotal popHits popTotal foldEnrichment
3## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
4## 1 ENSG00~ ENSG~ 2 2 95 277 2.91578947368~
5## 2 ENSG00~ ENSG~ 2 2 24 277 11.5416666666~
6## 3 ENSG00~ ENSG~ 2 2 8 277 34.625
7## 4 ENSG00~ ENSG~ 2 2 20 277 13.85
8## 5 ENSG00~ ENSG~ 2 2 28 277 9.89285714285~
9## # ... with 6 more variables: hyperPValue <chr>, miRNAs <chr>, cor <dbl>,
10## # corPValue <dbl>, regSim <dbl>, sppc <dbl>
1.4.3 导出ceRNA网络到cytoscape
1ceOutput2 <- ceOutput[ceOutput$hyperPValue<0.01 ## hyperPvalue少选
2 & ceOutput$corPValue<0.01 & ceOutput$regSim != 0,] ## corPvalue筛选
3
4## 导出edges
5edges <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'edges')
6str(edges)1## 'data.frame': 452 obs. of 3 variables:
2## $ fromNode : chr "ENSG00000234456" "ENSG00000234456" "ENSG00000234741" "ENSG00000255717" ...
3## $ toNode : chr "hsa-miR-374b-5p" "hsa-miR-374a-5p" "hsa-miR-137" "hsa-miR-377-3p" ...
4## $ altNode1Name: chr "MAGI2-AS3" "MAGI2-AS3" "GAS5" "SNHG1" ...1head(edges)
1## # A tibble: 6 x 3
2## fromNode toNode altNode1Name
3## <chr> <chr> <chr>
4## 1 ENSG00000234456 hsa-miR-374b-5p MAGI2-AS3
5## 2 ENSG00000234456 hsa-miR-374a-5p MAGI2-AS3
6## 3 ENSG00000234741 hsa-miR-137 GAS5
7## 4 ENSG00000255717 hsa-miR-377-3p SNHG1
8## 5 ENSG00000255717 hsa-miR-421 SNHG1
9## 6 ENSG00000255717 hsa-miR-326 SNHG11## 导出nodes
2nodes <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'nodes')
3str(nodes)1## 'data.frame': 187 obs. of 4 variables:
2## $ gene : Factor w/ 187 levels "ENSG00000003989",..: 1 2 3 4 5 6 7 8 9 10 ...
3## $ symbol : Factor w/ 187 levels "ABCC5","AC015813.1",..: 162 130 31 154 131 129 124 118 113 15 ...
4## $ type : Factor w/ 3 levels "lnc","mir","pc": 3 3 3 3 3 3 3 3 3 3 ...
5## $ numInteractions: num 2 5 3 3 3 2 2 2 2 1 ...1head(nodes)
1## # A tibble: 6 x 4
2## gene symbol type numInteractions
3## <fct> <fct> <fct> <dbl>
4## 1 ENSG00000003989 SLC7A2 pc 2
5## 2 ENSG00000004799 PDK4 pc 5
6## 3 ENSG00000021826 CPS1 pc 3
7## 4 ENSG00000047634 SCML1 pc 3
8## 5 ENSG00000049246 PER3 pc 3
9## 6 ENSG00000065320 NTN1 pc 2
1.4.4 参考资料
[GDCRNAtools说明文档]http://bioconductor.org/packages/devel/bioc/vignettes/GDCRNATools/inst/doc/GDCRNATools.html#data-download
[GDCRNATools: an R/Bioconductor package for integrative analysis of lncRNA, miRNA and mRNA data in GDC]https://academic.oup.com/bioinformatics/article/34/14/2515/4917355