欢迎来到医科研，这里是白介素2的读书笔记，跟我一起聊临床与科研的故事, 生物医学数据挖掘，R语言，TCGA、GEO数据挖掘。

1 GDCRNATools-TCGA数据挖掘

1.1 基本介绍

TCGA ceRNA网络构建神器-几乎要终结那个著名的ceRNA套路了
这是一个用户友好的R/Bioconductor软件包，可用于下载，组织和分析GDC中的RNA数据，亮点功能是能分析ceRNA调控网络。我们的软件包中使用了许多广泛使用的生物信息学工具和数据库。用户可以轻易的集成到自己的工作流程中。

1.2 GDCRNATools主要分析功能

基于limma，edgeR，DESeq2的差异分析
CoxPH and KM单因素生存分析
ceRNA网络分析
GO,KEGG,DO的功能富集分析

1.2.1 工作目录

1temp<-("D:/R/GDCRNATools") 2setwd(temp)

1.2.2 install pacakge

1## try http:// if https:// URLs are not supported 2if (!requireNamespace("BiocManager", quietly=TRUE)) 3 install.packages("BiocManager") 4library(GDCRNATools)1## 1## 1## Registered S3 method overwritten by 'enrichplot': 2## method from 3## fortify.enrichResult DOSE 1## ############################################################################## 2## Pathview is an open source software package distributed under GNU General 3## Public License version 3 (GPLv3). Details of GPLv3 is available at 4## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to 5## formally cite the original Pathview paper (not just mention it) in publications 6## or products. For details, do citation("pathview") within R. 7## 8## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG 9## license agreement (details at http://www.kegg.jp/kegg/legal.html). 10## ##############################################################################

1.3 简单案例分析-牛刀小试

1.3.1 count data 标准化

1library(DT) 2 3### load RNA counts data 4data(rnaCounts) 5rnaCounts[1:5,1:5] 1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01 2## ENSG00000003989 1520 960 2177 3## ENSG00000004799 7659 957 2295 4## ENSG00000005812 2246 1698 2454 5## ENSG00000007908 214 39 30 6## ENSG00000011105 5246 1664 4028 7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01 8## ENSG00000003989 454 1211 9## ENSG00000004799 9393 908 10## ENSG00000005812 2377 1561 11## ENSG00000007908 237 51 12## ENSG00000011105 1310 19711dim(rnaCounts)1## [1] 1000 451### load miRNAs counts data 2data(mirCounts) 3mirCounts[1:5,1:5] 1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01 2## hsa-let-7a-5p 165141 132094 210259 3## hsa-let-7a-3p 204 169 298 4## hsa-let-7a-2-3p 30 26 50 5## hsa-let-7b-5p 63905 38917 138022 6## hsa-let-7b-3p 50 60 138 7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01 8## hsa-let-7a-5p 84663 117897 9## hsa-let-7a-3p 127 149 10## hsa-let-7a-2-3p 1 39 11## hsa-let-7b-5p 55664 101545 12## hsa-let-7b-3p 65 1251####### Normalization of RNAseq data ####### 2rnaExpr <- gdcVoomNormalization(counts = rnaCounts, filter = FALSE) 3rnaExpr[1:5,1:5] 1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01 2## ENSG00000003989 10.435492 10.007138 10.990261 3## ENSG00000004799 12.768196 10.002625 11.066397 4## ENSG00000005812 10.998625 10.829542 11.163017 5## ENSG00000007908 7.609995 5.403277 4.832541 6## ENSG00000011105 12.222301 10.800370 11.877831 7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01 8## ENSG00000003989 8.859192 10.094094 9## ENSG00000004799 13.228502 9.678858 10## ENSG00000005812 11.246285 10.460233 11## ENSG00000007908 7.922839 5.538016 12## ENSG00000011105 10.386957 10.7965941####### Normalization of miRNAs data ####### 2mirExpr <- gdcVoomNormalization(counts = mirCounts, filter = FALSE) 3mirExpr[1:5,1:5] 1## TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01 2## hsa-let-7a-5p 14.781661 14.850358 15.066842 3## hsa-let-7a-3p 5.124275 5.244289 5.606618 4## hsa-let-7a-2-3p 2.379055 2.567068 3.043242 5## hsa-let-7b-5p 13.411970 13.087279 14.459575 6## hsa-let-7b-3p 3.106529 3.758011 4.498773 7## TCGA-3X-AAVC-01 TCGA-3X-AAVE-01 8## hsa-let-7a-5p 13.937813 14.653520 9## hsa-let-7a-3p 4.562713 5.030348 10## hsa-let-7a-2-3p -1.846678 3.110128 11## hsa-let-7b-5p 13.332830 14.438114 12## hsa-let-7b-3p 3.601783 4.777890

1.3.2 解析metadata

1####### Parse and filter RNAseq metadata ####### 2metaMatrix.RNA <- gdcParseMetadata(project.id = 'TCGA-CHOL', 3 data.type = 'RNAseq', 4 write.meta = FALSE) 5metaMatrix.RNA[1:5,1:5]1## # A tibble: 5 x 5 2## file_name file_id patient sample submitter_id 3## <chr> <chr> <chr> <chr> <chr> 4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~ 5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~ 6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~ 7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~ 8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~1colnames(metaMatrix.RNA)1## [1] "file_name" "file_id" 2## [3] "patient" "sample" 3## [5] "submitter_id" "entity_submitter_id" 4## [7] "sample_type" "gender" 5## [9] "age_at_diagnosis" "tumor_stage" 6## [11] "tumor_grade" "days_to_death" 7## [13] "days_to_last_follow_up" "vital_status" 8## [15] "project_id"1## metadata过滤 2metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)1## Removed 0 samples1metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)1## Removed 0 samples1metaMatrix.RNA[1:5,1:5]1## # A tibble: 5 x 5 2## file_name file_id patient sample submitter_id 3## <chr> <chr> <chr> <chr> <chr> 4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~ 5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~ 6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~ 7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~ 8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~1table(metaMatrix.RNA$sample_type)1## 2## PrimaryTumor SolidTissueNormal 3## 36 9

1.4 ceRNA网络分析

1.4.1 筛选差异mRNA,miRNA,lncRNA

1DEGAll <- gdcDEAnalysis(counts = rnaCounts, # rnacounts数据 2 group = metaMatrix.RNA$sample_type, # 分组 3 comparison = 'PrimaryTumor-SolidTissueNormal', # 比较 4 method = 'limma') 5## 展示所有基因 6DEGAll[1:5,1:5]1## # A tibble: 5 x 5 2## symbol group logFC AveExpr t 3## <fct> <fct> <dbl> <dbl> <dbl> 4## 1 NR1I3 protein_coding -6.92 7.02 -17.3 5## 2 ETFRF1 protein_coding -2.49 9.52 -16.1 6## 3 SOX5 protein_coding -4.87 6.23 -15.0 7## 4 ABCA8 protein_coding -5.65 7.52 -14.9 8## 5 ISOC1 protein_coding -2.37 10.5 -14.61dim(DEGAll)1## [1] 565 81table(DEGAll$group)1## 2## IG long_non_coding ncRNA protein_coding 3## 2 101 8 403 4## pseudogene TEC TR 5## 45 4 21range(DEGAll$logFC)1## [1] -10.855558 5.9194611### All DEGs 2deALL <- gdcDEReport(deg = DEGAll, gene.type = 'all') 3dim(deALL)1## [1] 283 81### DE long-noncoding 2deLNC <- gdcDEReport(deg = DEGAll, gene.type = 'long_non_coding') 3 4### DE protein coding genes 5dePC <- gdcDEReport(deg = DEGAll, gene.type = 'protein_coding')

1.4.2 构建差异基因ceRNA网络

1ceOutput <- gdcCEAnalysis(lnc = rownames(deLNC), # lncRNA 2 pc = rownames(dePC), # mRNA 3 lnc.targets = 'starBase', 4 pc.targets = 'starBase', 5 rna.expr = rnaExpr, 6 mir.expr = mirExpr) 71## Step 1/3: Hypergenometric test done !1## Step 2/3: Correlation analysis done ! 2## Step 3/3: Regulation pattern analysis done !1dim(ceOutput)1## [1] 453 131ceOutput[1:5,1:13] 1## # A tibble: 5 x 13 2## lncRNAs Genes Counts listTotal popHits popTotal foldEnrichment 3## <chr> <chr> <chr> <chr> <chr> <chr> <chr> 4## 1 ENSG00~ ENSG~ 2 2 95 277 2.91578947368~ 5## 2 ENSG00~ ENSG~ 2 2 24 277 11.5416666666~ 6## 3 ENSG00~ ENSG~ 2 2 8 277 34.625 7## 4 ENSG00~ ENSG~ 2 2 20 277 13.85 8## 5 ENSG00~ ENSG~ 2 2 28 277 9.89285714285~ 9## # ... with 6 more variables: hyperPValue <chr>, miRNAs <chr>, cor <dbl>, 10## # corPValue <dbl>, regSim <dbl>, sppc <dbl>

1.4.3 导出ceRNA网络到cytoscape

1ceOutput2 <- ceOutput[ceOutput$hyperPValue<0.01 ## hyperPvalue少选 2 & ceOutput$corPValue<0.01 & ceOutput$regSim != 0,] ## corPvalue筛选 3 4## 导出edges 5edges <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'edges') 6str(edges)1## 'data.frame': 452 obs. of 3 variables: 2## $ fromNode : chr "ENSG00000234456" "ENSG00000234456" "ENSG00000234741" "ENSG00000255717" ... 3## $ toNode : chr "hsa-miR-374b-5p" "hsa-miR-374a-5p" "hsa-miR-137" "hsa-miR-377-3p" ... 4## $ altNode1Name: chr "MAGI2-AS3" "MAGI2-AS3" "GAS5" "SNHG1" ...1head(edges)1## # A tibble: 6 x 3 2## fromNode toNode altNode1Name 3## <chr> <chr> <chr> 4## 1 ENSG00000234456 hsa-miR-374b-5p MAGI2-AS3 5## 2 ENSG00000234456 hsa-miR-374a-5p MAGI2-AS3 6## 3 ENSG00000234741 hsa-miR-137 GAS5 7## 4 ENSG00000255717 hsa-miR-377-3p SNHG1 8## 5 ENSG00000255717 hsa-miR-421 SNHG1 9## 6 ENSG00000255717 hsa-miR-326 SNHG11## 导出nodes 2nodes <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'nodes') 3str(nodes)1## 'data.frame': 187 obs. of 4 variables: 2## $ gene : Factor w/ 187 levels "ENSG00000003989",..: 1 2 3 4 5 6 7 8 9 10 ... 3## $ symbol : Factor w/ 187 levels "ABCC5","AC015813.1",..: 162 130 31 154 131 129 124 118 113 15 ... 4## $ type : Factor w/ 3 levels "lnc","mir","pc": 3 3 3 3 3 3 3 3 3 3 ... 5## $ numInteractions: num 2 5 3 3 3 2 2 2 2 1 ...1head(nodes)1## # A tibble: 6 x 4 2## gene symbol type numInteractions 3## <fct> <fct> <fct> <dbl> 4## 1 ENSG00000003989 SLC7A2 pc 2 5## 2 ENSG00000004799 PDK4 pc 5 6## 3 ENSG00000021826 CPS1 pc 3 7## 4 ENSG00000047634 SCML1 pc 3 8## 5 ENSG00000049246 PER3 pc 3 9## 6 ENSG00000065320 NTN1 pc 2

1.4.4 参考资料

[GDCRNAtools说明文档]http://bioconductor.org/packages/devel/bioc/vignettes/GDCRNATools/inst/doc/GDCRNATools.html#data-download
[GDCRNATools: an R/Bioconductor package for integrative analysis of lncRNA, miRNA and mRNA data in GDC]https://academic.oup.com/bioinformatics/article/34/14/2515/4917355

GDCRNATools|ceRNA套路终结者Part1