TARGET临床数据学习
参考:共由小兑 https://www.jianshu.com/p/1be8c349dd9f
TARGET ( Therapeutically Applicable Research To Generate
Effective Treatments ),采用多组学方法来确定驱动儿童癌症的分子变化。
包含主要疾病数据
2018年才对中国开放,数据库还在不断更新中。
ALL (Acute Lymphoblastic Leukemia) 急性淋系白血病
AML (Acute Myeloid Leukemia) 急性髓性白血病
KT(Kidney Tumors)肾癌
MDLS (Model Systems)
NBL(Neuroblastoma)神经母细胞瘤
OS(Osteosarcoma)骨肉瘤
TARGET数据类型
临床数据
转录组测序数据
拷贝数变异数据
甲基化数据
miRNA数据
基因表达谱芯片数据
全基因组测序数目
靶向测序数据
TARGET临床数据下载
注意:蓝色的才有权限下载
如果想下载红色链接看下方其他数据下载流程
TARGET其他数据下载
第一步:登录dbGap
dbgap账号需要NCI/NIH认证资格,咨询实验室有无账号。
https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login
第二步:确定要下载的数据
https://ocg.cancer.gov/programs/target/data-matrix
点击链接到NCBI后找到数据集gap_parent_phs为phs000218,数据集大类gap_accession为phs000465,可以直接申请整个大类。
第三步:填写申请理由
https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000465.v19.p8
提交给SO后等待即可,同意后会发具体下载方式。
TARGET临床数据分析
学习来源:SCI狂人团队
整理数据并选择感兴趣的因素
拆分建模人群和验证人群
library(foregin)library(survival)library(caret)setwd("/Users/huangyue/Desktop/")target=read.table("/Users/huangyue/Desktop/target.txt",header = T,sep = "\t")#设定生成随机数的种子,种子是为了让结果具有重复性set.seed(300)targetd=createDataPartition(y=target$id,p=0.70,list=F)targetdev=target[targetd, ]targetver=target[-targetd,]write.csv(targetdev,"target_dev.csv")write.csv(targetver,"target_ver.csv")
多因素Cox回归模型
#将数据转换成因子#target$Vital_Status=factor(target$Vital_Status,labels = c("Alive","dead"))target=read.csv("target_dev.csv",header = T)target$Gender=factor(target$Gender,labels = c("Female","Male"))target$Age_group=factor(target$Age_group,labels = c("<18",">=18"))target$WBC_group=factor(target$WBC_group,labels = c("<200",">=200"))target$CNS.disease=factor(target$CNS.disease,labels = c("No","Yes"))target$FLT3_ITD=factor(target$FLT3_ITD,labels = c("No","Yes"))#构建多因素的cox回归模型fmla1=as.formula(Surv(target$Surviva_Time,target$Vital_Status) ~ target$Age_group + target$Gender + target$WBC_group + target$CNS.disease + target$FLT3_ITD)cox2=coxph(fmla1,data = target)summary(cox2)
注意:Vital_Status保留数字状态就行,将其也设置为因子状态会在后面的回归分析中报错,报错如下。
> cox2=coxph(fmla1,data = target)Error in coxph(fmla1, data = target) : an id statement is required for multi-state models
依据Pr(>|z|) 值挑选出有统计学差异的因素,但是由于选出的有统计学意义的太少,一般文章在后面的分析中将这几个因素都做。
lasso回归 防止过度拟合
#install.packages('glmnet')library(glmnet)library(survival)target=read.csv("target_dev.csv",header = T)target = na.omit(target)for (a in names(target)[c(5:13)]) { target[,a] = as.factor(target[,a])}v1 = data.matrix(target[,c(5,6,9,11,12)])v2 = data.matrix(Surv(target$Surviva_Time,target$Vital_Status))myfit = glmnet(v1, v2, family='cox')plot(myfit,xvar='lambda',label=T)myfit2 =cv.glmnet(v1, v2,family='cox')plot(myfit2)abline(v=log(c(myfit2$lambda.min,myfit2$lambda.1se)),lty='dashed')
查看重要因素
coe=coef(myfit,s=myfit2$lambda.min)act_index = which(coe!=0)act_coe=coe[act_index]row.names(coe)[act_index]
报错记录:
myfit2 =cv.glmnet(v1, v2,family='cox')Error in h(simpleError(msg, call)) : 在为'as.matrix'函数选择方法时评估'x'参数出了错: invalid class 'NA' to dup_mMatrix_as_dgeMatrix
解决方法:将as.matrix改成data.matrix
画Nomogram
Nomogram,中文常称为诺莫图或者列线图,是将Logistic回归或Cox回归的结果进行可视化呈现。它根据所有自变量回归系数的大小来制定评分标准,给每个自变量的每种取值水平一个评分,对每个患者,就可计算得到一个总分,再通过得分与结局发生概率之间的转换函数来计算每个患者的结局时间发生的概率。
#加载函数包library(rms)target = na.omit(target)#打包数据dd = datadist(target)options(datadist='dd')#构建COX回归模型cox=cph(Surv(target$Surviva_Time,target$Vital_Status) ~ Age_group + Gender + WBC_group + CNS.disease + FLT3_ITD,data = target,x=T, y=T, surv=T)surv=Survival(cox)#构建nomogramsur_3_year=function(x)surv(1*365*3,lp=x) #3年生存sur_5_year=function(x)surv(1*365*5,lp=x) #5年生存nom_sur = nomogram(cox, fun = list(sur_3_year,sur_5_year),lp=F,funlabel = c('3-Year survival','5-Year survival'), fun.at = c('0.9','0.8','0.7','0.6','0.5','0.4','0.3','0.2','0.1'))#绘制nomogram图plot(nom_sur)
之前没有加上打包数据这一步报错如下:
Error in if (!length(fname) || !any(fname == zname)) { : 需要TRUE/FALSE值的地方不可以用缺少值
加上打包数据的步骤后后面直接用Age_group替换target$Age_group就不会报错啦!
nomogram预测效果的评价
#计算C-index 采用Hmisc包中的rcorrcens函数来计算library(Hmisc)rcorrcens(Surv(target$Surviva_Time,target$Vital_Status) ~ predict(cox))#绘制标准曲线cox=cph(Surv(target$Surviva_Time,target$Vital_Status) ~ Age_group + Gender + WBC_group + CNS.disease + FLT3_ITD,data = target,x=T, y=T, surv=T,, time.inc = 365*3 )cal= calibrate(cox, cmethod = 'KM',method = 'boot' , u = 365*3 , m = 144, B = 1000)#m要根据样本量来确定,由于标准曲线一般将所有样本分为3组(在图中显示3个点),而m代表每组的样本量数,因此m*3应该等于或近似等于样本量;par(mar=c(8,5,3,2),cex = 1.0)plot(cal,lwd=2,lty=1,errbar.col=c(rgb(0,118,192,maxColorValue=255)),xlim=c(0,1),ylim=c(0,1), xlab="Nomogram-Predicted Probabilityof 3-Year DFS",ylab="Actual 3-Year DFS (proportion)", col=c(rgb(192,98,83,maxColorValue=255)))lines(cal[,c("mean.predicted","KM")],type="b",lwd=2,col=c(rgb(192,98,83,maxColorValue=255)),pch=16)abline(0,1,lty=3,lwd=2,col=c(rgb(0,118,192,maxColorValue=255)))
绘制ROC曲线
#计算风险得分library(survival)target=read.csv("/Users/huangyue/Desktop/target_dev.csv",header = T)fmla1=as.formula(Surv(target$Surviva_Time,target$Vital_Status) ~ target$Age_group + target$Gender + target$WBC_group + target$CNS.disease + target$FLT3_ITD)cox_m = coxph(fmla1,data = target)cox_m1 = step(cox_m,direction = 'both')risk_score = predict(cox_m1,type = "risk",newdata = target)risk_level = as.vector(ifelse(risk_score > median(risk_score),'High','Low'))write.table(cbind(id=rownames(cbind(target[,1:2],risk_score,risk_level)), cbind(target[,3:4],risk_score,risk_level)), "risk_score.txt",sep='\t',quote=F,row.names=F)
#绘制ROCinstall.packages("timeROC")library(survival)library(timeROC)target=read.table("risk_score.txt",header = T)predict_1_year = 1*365 #month改为1*12predict_3_year = 3*365ROC =timeROC(T=target$Surviva_Time,delta = target$Vital_Status,marker = target$risk_score,cause=1, weighting = 'marginal',times = c(predict_1_year,predict_3_year),ROC = T)plot(ROC,time = predict_1_year,title = F,lwd=3)plot(ROC,time = predict_3_year,title = F,lwd=3,col = 'blue',add = T)legend('bottomright',c(paste('AUC of 1 year survival:',round(ROC$AUC[1],3)), paste('AUC of 3 year survival:',round(ROC$AUC[2],3))), col = c("red","blue"),lwd = 3)
sensitivity代表真阳性;specificity代表假阳性
AUC 0.5则没有预测能力;0.51-0.7为低准确度;0.71-0.9为中等准确度;>0.9则为高准确度
模型验证
用ver数据再计算一遍C和画一遍ROC和calibrate
KM_生存分析
library(survival)#by risktarget=read.table("risk_score.txt",header = T)kms = survfit(Surv(target$Surviva_Time,target$Vital_Status)~target$risk_level,data = target)kmdffexp=survdiff(Surv(target$Surviva_Time,target$Vital_Status)~target$risk_level,data = target)pvalue=round(1-pchisq(kmdffexp$chisq,df=1),4)plot(kms,lty=1,col = c("red","green"), xlab = "survival time in days", ylab = "survival probabilities", main=paste("survival curve of risk score(P=",pvalue,")",sep = ""))legend("bottomright",c("high risk","low risk"),lty=1,col = c('red','green'))#by agetarget=read.csv("target_dev.csv",header = T)kms = survfit(Surv(target$Surviva_Time,target$Vital_Status)~target$Age_group,data = target)kmdffexp=survdiff(Surv(target$Surviva_Time,target$Vital_Status)~target$Age_group,data = target)pvalue=round(1-pchisq(kmdffexp$chisq,df=1),4)plot(kms,lty=1,col = c("red","green"), xlab = "survival time in days", ylab = "survival probabilities", main=paste("survival curve of age group(P=",pvalue,")",sep = ""))legend("bottomright",c("< 18",">= 18"),lty=1,col = c('red','green'))