GDCRNATools|ceRNA套路终结者Part1

欢迎来到医科研,这里是白介素2的读书笔记,跟我一起聊临床与科研的故事, 生物医学数据挖掘,R语言,TCGA、GEO数据挖掘。


1 GDCRNATools-TCGA数据挖掘

1.1 基本介绍

  • TCGA ceRNA网络构建神器-几乎要终结那个著名的ceRNA套路了

  • 这是一个用户友好的R/Bioconductor软件包,可用于下载,组织和分析GDC中的RNA数据,亮点功能是能分析ceRNA调控网络。我们的软件包中使用了许多广泛使用的生物信息学工具和数据库。用户可以轻易的集成到自己的工作流程中。

1.2 GDCRNATools主要分析功能

  1. 基于limma,edgeR,DESeq2的差异分析

  2. CoxPH and KM单因素生存分析

  3. ceRNA网络分析

  4. GO,KEGG,DO的功能富集分析

1.2.1 工作目录

1temp<-("D:/R/GDCRNATools")
2setwd(temp)

1.2.2 install pacakge

1## try http:// if https:// URLs are not supported
2if (!requireNamespace("BiocManager", quietly=TRUE))
3    install.packages("BiocManager")
4library(GDCRNATools)
1## 
1## 
1## Registered S3 method overwritten by 'enrichplot':
2##   method               from
3##   fortify.enrichResult DOSE
1## ##############################################################################
2## Pathview is an open source software package distributed under GNU General
3## Public License version 3 (GPLv3). Details of GPLv3 is available at
4## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
5## formally cite the original Pathview paper (not just mention it) in publications
6## or products. For details, do citation("pathview") within R.
7## 
8## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
9## license agreement (details at http://www.kegg.jp/kegg/legal.html).
10## ##############################################################################

1.3 简单案例分析-牛刀小试

1.3.1 count data 标准化

1library(DT)
2
3### load RNA counts data
4data(rnaCounts)
5rnaCounts[1:5,1:5]
1##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## ENSG00000003989            1520             960            2177
3## ENSG00000004799            7659             957            2295
4## ENSG00000005812            2246            1698            2454
5## ENSG00000007908             214              39              30
6## ENSG00000011105            5246            1664            4028
7##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## ENSG00000003989             454            1211
9## ENSG00000004799            9393             908
10## ENSG00000005812            2377            1561
11## ENSG00000007908             237              51
12## ENSG00000011105            1310            1971
1dim(rnaCounts)
1## [1] 1000   45
1### load miRNAs counts data
2data(mirCounts)
3mirCounts[1:5,1:5]
1##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## hsa-let-7a-5p            165141          132094          210259
3## hsa-let-7a-3p               204             169             298
4## hsa-let-7a-2-3p              30              26              50
5## hsa-let-7b-5p             63905           38917          138022
6## hsa-let-7b-3p                50              60             138
7##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## hsa-let-7a-5p             84663          117897
9## hsa-let-7a-3p               127             149
10## hsa-let-7a-2-3p               1              39
11## hsa-let-7b-5p             55664          101545
12## hsa-let-7b-3p                65             125
1####### Normalization of RNAseq data #######
2rnaExpr <- gdcVoomNormalization(counts = rnaCounts, filter = FALSE)
3rnaExpr[1:5,1:5]
1##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## ENSG00000003989       10.435492       10.007138       10.990261
3## ENSG00000004799       12.768196       10.002625       11.066397
4## ENSG00000005812       10.998625       10.829542       11.163017
5## ENSG00000007908        7.609995        5.403277        4.832541
6## ENSG00000011105       12.222301       10.800370       11.877831
7##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## ENSG00000003989        8.859192       10.094094
9## ENSG00000004799       13.228502        9.678858
10## ENSG00000005812       11.246285       10.460233
11## ENSG00000007908        7.922839        5.538016
12## ENSG00000011105       10.386957       10.796594
1####### Normalization of miRNAs data #######
2mirExpr <- gdcVoomNormalization(counts = mirCounts, filter = FALSE)
3mirExpr[1:5,1:5]
1##                 TCGA-3X-AAV9-01 TCGA-3X-AAVA-01 TCGA-3X-AAVB-01
2## hsa-let-7a-5p         14.781661       14.850358       15.066842
3## hsa-let-7a-3p          5.124275        5.244289        5.606618
4## hsa-let-7a-2-3p        2.379055        2.567068        3.043242
5## hsa-let-7b-5p         13.411970       13.087279       14.459575
6## hsa-let-7b-3p          3.106529        3.758011        4.498773
7##                 TCGA-3X-AAVC-01 TCGA-3X-AAVE-01
8## hsa-let-7a-5p         13.937813       14.653520
9## hsa-let-7a-3p          4.562713        5.030348
10## hsa-let-7a-2-3p       -1.846678        3.110128
11## hsa-let-7b-5p         13.332830       14.438114
12## hsa-let-7b-3p          3.601783        4.777890

1.3.2 解析metadata

1####### Parse and filter RNAseq metadata #######
2metaMatrix.RNA <- gdcParseMetadata(project.id = 'TCGA-CHOL',
3                                   data.type  = 'RNAseq', 
4                                   write.meta = FALSE)
5metaMatrix.RNA[1:5,1:5]
1## # A tibble: 5 x 5
2##   file_name               file_id           patient  sample   submitter_id 
3##   <chr>                   <chr>             <chr>    <chr>    <chr>        
4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~
1colnames(metaMatrix.RNA)
1##  [1] "file_name"              "file_id"               
2##  [3] "patient"                "sample"                
3##  [5] "submitter_id"           "entity_submitter_id"   
4##  [7] "sample_type"            "gender"                
5##  [9] "age_at_diagnosis"       "tumor_stage"           
6## [11] "tumor_grade"            "days_to_death"         
7## [13] "days_to_last_follow_up" "vital_status"          
8## [15] "project_id"
1## metadata过滤
2metaMatrix.RNA <- gdcFilterDuplicate(metaMatrix.RNA)
1## Removed 0 samples
1metaMatrix.RNA <- gdcFilterSampleType(metaMatrix.RNA)
1## Removed 0 samples
1metaMatrix.RNA[1:5,1:5]
1## # A tibble: 5 x 5
2##   file_name               file_id           patient  sample   submitter_id 
3##   <chr>                   <chr>             <chr>    <chr>    <chr>        
4## 1 725eaa94-5221-4c22-bce~ 85bc7f81-51fb-44~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAV9~
5## 2 b6a2c03a-c8ad-41e9-8a1~ 42b8d463-6209-4e~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVA~
6## 3 c2765336-c804-4fd2-b45~ 6e2031e9-df75-48~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVB~
7## 4 8b20cba8-9fd5-4d56-bd0~ 19e8fd21-f6c8-49~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVC~
8## 5 4082f7d5-5656-476a-9aa~ 1ace0df3-9837-46~ TCGA-3X~ TCGA-3X~ TCGA-3X-AAVE~
1table(metaMatrix.RNA$sample_type)
1## 
2##      PrimaryTumor SolidTissueNormal 
3##                36                 9

1.4 ceRNA网络分析

1.4.1 筛选差异mRNA,miRNA,lncRNA

1DEGAll <- gdcDEAnalysis(counts     = rnaCounts, # rnacounts数据
2                        group      = metaMatrix.RNA$sample_type, # 分组
3                        comparison = 'PrimaryTumor-SolidTissueNormal', # 比较
4                        method     = 'limma')
5## 展示所有基因
6DEGAll[1:5,1:5]
1## # A tibble: 5 x 5
2##   symbol group          logFC AveExpr     t
3##   <fct>  <fct>          <dbl>   <dbl> <dbl>
4## 1 NR1I3  protein_coding -6.92    7.02 -17.3
5## 2 ETFRF1 protein_coding -2.49    9.52 -16.1
6## 3 SOX5   protein_coding -4.87    6.23 -15.0
7## 4 ABCA8  protein_coding -5.65    7.52 -14.9
8## 5 ISOC1  protein_coding -2.37   10.5  -14.6
1dim(DEGAll)
1## [1] 565   8
1table(DEGAll$group)
1## 
2##              IG long_non_coding           ncRNA  protein_coding 
3##               2             101               8             403 
4##      pseudogene             TEC              TR 
5##              45               4               2
1range(DEGAll$logFC)
1## [1] -10.855558   5.919461
1### All DEGs
2deALL <- gdcDEReport(deg = DEGAll, gene.type = 'all')
3dim(deALL)
1## [1] 283   8
1### DE long-noncoding
2deLNC <- gdcDEReport(deg = DEGAll, gene.type = 'long_non_coding')
3
4### DE protein coding genes
5dePC <- gdcDEReport(deg = DEGAll, gene.type = 'protein_coding')

1.4.2 构建差异基因ceRNA网络

1ceOutput <- gdcCEAnalysis(lnc         = rownames(deLNC), # lncRNA
2                          pc          = rownames(dePC), # mRNA
3                          lnc.targets = 'starBase', 
4                          pc.targets  = 'starBase', 
5                          rna.expr    = rnaExpr, 
6                          mir.expr    = mirExpr)
7
1## Step 1/3: Hypergenometric test done !
1## Step 2/3: Correlation analysis done !
2## Step 3/3: Regulation pattern analysis done !
1dim(ceOutput)
1## [1] 453  13
1ceOutput[1:5,1:13]
1## # A tibble: 5 x 13
2##   lncRNAs Genes Counts listTotal popHits popTotal foldEnrichment
3##   <chr>   <chr> <chr>  <chr>     <chr>   <chr>    <chr>         
4## 1 ENSG00~ ENSG~ 2      2         95      277      2.91578947368~
5## 2 ENSG00~ ENSG~ 2      2         24      277      11.5416666666~
6## 3 ENSG00~ ENSG~ 2      2         8       277      34.625        
7## 4 ENSG00~ ENSG~ 2      2         20      277      13.85         
8## 5 ENSG00~ ENSG~ 2      2         28      277      9.89285714285~
9## # ... with 6 more variables: hyperPValue <chr>, miRNAs <chr>, cor <dbl>,
10## #   corPValue <dbl>, regSim <dbl>, sppc <dbl>

1.4.3 导出ceRNA网络到cytoscape

1ceOutput2 <- ceOutput[ceOutput$hyperPValue<0.01 ## hyperPvalue少选
2    & ceOutput$corPValue<0.01 & ceOutput$regSim != 0,] ## corPvalue筛选
3
4## 导出edges
5edges <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'edges')
6str(edges)
1## 'data.frame':    452 obs. of  3 variables:
2##  $ fromNode    : chr  "ENSG00000234456" "ENSG00000234456" "ENSG00000234741" "ENSG00000255717" ...
3##  $ toNode      : chr  "hsa-miR-374b-5p" "hsa-miR-374a-5p" "hsa-miR-137" "hsa-miR-377-3p" ...
4##  $ altNode1Name: chr  "MAGI2-AS3" "MAGI2-AS3" "GAS5" "SNHG1" ...
1head(edges)
1## # A tibble: 6 x 3
2##   fromNode        toNode          altNode1Name
3##   <chr>           <chr>           <chr>       
4## 1 ENSG00000234456 hsa-miR-374b-5p MAGI2-AS3   
5## 2 ENSG00000234456 hsa-miR-374a-5p MAGI2-AS3   
6## 3 ENSG00000234741 hsa-miR-137     GAS5        
7## 4 ENSG00000255717 hsa-miR-377-3p  SNHG1       
8## 5 ENSG00000255717 hsa-miR-421     SNHG1       
9## 6 ENSG00000255717 hsa-miR-326     SNHG1
1## 导出nodes
2nodes <- gdcExportNetwork(ceNetwork = ceOutput2, net = 'nodes')
3str(nodes)
1## 'data.frame':    187 obs. of  4 variables:
2##  $ gene           : Factor w/ 187 levels "ENSG00000003989",..: 1 2 3 4 5 6 7 8 9 10 ...
3##  $ symbol         : Factor w/ 187 levels "ABCC5","AC015813.1",..: 162 130 31 154 131 129 124 118 113 15 ...
4##  $ type           : Factor w/ 3 levels "lnc","mir","pc": 3 3 3 3 3 3 3 3 3 3 ...
5##  $ numInteractions: num  2 5 3 3 3 2 2 2 2 1 ...
1head(nodes)
1## # A tibble: 6 x 4
2##   gene            symbol type  numInteractions
3##   <fct>           <fct>  <fct>           <dbl>
4## 1 ENSG00000003989 SLC7A2 pc                  2
5## 2 ENSG00000004799 PDK4   pc                  5
6## 3 ENSG00000021826 CPS1   pc                  3
7## 4 ENSG00000047634 SCML1  pc                  3
8## 5 ENSG00000049246 PER3   pc                  3
9## 6 ENSG00000065320 NTN1   pc                  2

1.4.4 参考资料

  1. [GDCRNAtools说明文档]http://bioconductor.org/packages/devel/bioc/vignettes/GDCRNATools/inst/doc/GDCRNATools.html#data-download

  2. [GDCRNATools: an R/Bioconductor package for integrative analysis of lncRNA, miRNA and mRNA data in GDC]https://academic.oup.com/bioinformatics/article/34/14/2515/4917355

(0)

相关推荐