### Herold dataset ### # strategies for the validation of the added predictive value (De Bin et al., BMC Medical Research Methodology, 2014) library(survival) library(pec) load("../../dataset/ccl/data.maerz.Rdata") # load training data (molecular) load("../../dataset/ccl/pheno.maerz.Rdata") # load training data (clinical) dataTraining.mol<-data dataTraining.clin<-Table load("../../dataset/ccl/testdaten.new.Rdata") # load validation set dataValid.mol<-testdata dataValid.clin<-pheno.test rm(data,Table,testdata,pheno.test) data.tm<-dataTraining.mol[c('37004_at','226039_at','205255_x_at','221286_s_at','226247_at','212522_at','243010_at','202600_s_at'),] data.vm<-dataValid.mol[c('SFTPB','MGAT4A','TCF7','MGC29506','PLEKHA1','PDE8A','MSI2','NRIP1'),] comp.ps.8.herold<-function(sample) c(0.160,-0.151,-0.096,0.089,-0.110,-0.108,0.081,-0.208)%*%sample ps.8.T<-apply(data.tm,2,comp.ps.8.herold) ps.8.V<-apply(data.vm,2,comp.ps.8.herold) data.tc<-cbind(dataTraining.clin[,c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT')],ps.8.T) colnames(data.tc)<-c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT','ps.8') data.tc[41,6]<-NA data.vc<-na.omit(cbind(dataValid.clin[,c('Overall.survival..Tage.ab.Analyse.','L_STAT.ja.1.nein.0..Tod.aus.irgendeinem.Grund.','X5.Dimensionen.FISH.0.13q.solo.1.11q.2.17p.3.T12.4.ohne','Alter.bei.Messung..Jahre.','SEX','VH.Status')],ps.8.V)) colnames(data.vc)<-c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT','ps.8') rm(data.tm,data.vm,dataValid.clin,dataValid.mol,dataTraining.clin,dataTraining.mol,ps.8.T,ps.8.V) # Kaplan-Meier curves pdf('CLL_KM.pdf') par(mfrow=c(4,2),mar=c(3,3,3,3),mgp=c(2,1,0)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~I(AGE<60),data=data.tc),col=c(2,3),main='Training set - AGE',ylab='overall survival',xlab='day') legend('bottomleft',c('AGE<60','AGE>=60'),lty=1, col=c(3,2)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~I(AGE<60),data=data.vc),col=c(2,3),main='Validation set - AGE',ylab='overall survival',xlab='day') legend('bottomleft',c('AGE<60','AGE>=60'),lty=1,,col=c(3,2)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~SEX,data=data.tc),col=c(2,3),main='Training set - SEX',ylab='overall survival',xlab='day') legend('bottomleft',c('male','female'),lty=1, col=c(2,3)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~SEX,data=data.vc),col=c(2,3),main='Validation set - SEX',ylab='overall survival',xlab='day') legend('bottomleft',c('male','female'),lty=1,,col=c(2,3)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~MUT_STAT,data=data.tc),col=c(2,3,4,5,6),main='Training set - FISH',ylab='overall survival',xlab='day') legend('bottomleft',c('0','1','2','3','4'),lty=1, col=c(2:6),ncol=2,cex=0.6) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~MUT_STAT,data=data.vc),col=c(2,3,4,5,6),main='Validation set - FISH',ylab='overall survival',xlab='day') legend('bottomleft',c('0','1','2','3','4'),lty=1, col=c(2:6),ncol=2,cex=0.6) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~VH_STAT,data=data.tc),col=c(3,2),main='Training set - IGVH',ylab='overall survival',xlab='day') legend('bottomleft',c('not mutated','mutated'),lty=1, col=c(3,2)) plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~VH_STAT,data=data.vc),col=c(3,2),main='Validation set - IGVH',ylab='overall survival',xlab='day') legend('bottomleft',c('not mutated','mutated'),lty=1, col=c(3,2)) dev.off() par(mfrow=c(1,1)) # comparison between sets pdf('CCL_KaplanMeier_sets.pdf') plot(survfit(Surv(as.numeric(data.tc$OS),as.numeric(data.tc$L_STAT))~1),col=2,lwd=c(3,1,1),xlab='days',ylab='overall survival') lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1),col=3,lwd=3) lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$time,survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$upper,col=3,lty=2) lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$time,survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$lower,col=3,lty=2) legend('bottomleft',c('training set','validation set'),col=c(2,3),lwd=1) dev.off() ############## # Strategy A # ############## # can not be used, due different kind of variable (microarray and LDA data) ############## # STRATEGY B # ############## # fit prediction model based on validation data modBoth.v<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT) + as.numeric(AGE) + as.factor(SEX) + as.factor(VH_STAT) + ps.8, data=data.vc) summary(modBoth.v) # test the null hypothesis (with Wald test or likelihood ratio <-- asymptotically equal) 2*(min(pnorm(modBoth.v$coeff[8]/sqrt(modBoth.v$var[8,8])),1-pnorm(modBoth.v$coeff[8]/sqrt(modBoth.v$var[8,8])))) ############## # STRATEGY C # ############## modClin.v<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT)+as.numeric(AGE)+as.factor(SEX)+as.factor(VH_STAT),data=data.vc) # ---------------------------------- # # 10-fold cross-validation procedure # # ---------------------------------- # set.seed(123) test.cv<-pec(list(modClin.v,modBoth.v),formula=modBoth.v$formula,data=data.vc,cens.model='marginal',splitMethod='cv10',B=1e2) # compute the integrated Brier score up to 1500 (value obtained looking at the Kaplan-Maier curves) ibs(test.cv,times=1500)[,2] # plot the prediction error curves pdf('CLL_approachSD.pdf') par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0)) plot(test.cv,legend=F,xlim=c(1,1500)) legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3) dev.off() # -------------------------------------------- # # random split in learning and validation sets # # -------------------------------------------- # strategyCwithRandomSplit<-function(data,niter,prop,maxtime,split) { inbs<-prec.null<-prec.clin<-prec.both<-NULL n<-dim(data)[1] for (i in 1:niter) { label<-sample(c(rep(1,trunc(n*prop)),rep(2,n-trunc(n*prop)))) modClin.cv<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT)+as.numeric(AGE)+as.factor(SEX)+as.factor(VH_STAT),data=data[label==1,]) modBoth.cv<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT) + as.numeric(AGE) + as.factor(SEX) + as.factor(VH_STAT) + ps.8, data=data[label==1,]) curves<-pec(list(modClin.v,modBoth.v),formula=modBoth.v$formula,data=data[label==2,],cens.model='marginal',times=seq(0,maxtime,le=split),exact=F) inbs<-cbind(inbs,ibs(curves,times=maxtime)) prec.null<-rbind(prec.null,curves$AppErr$Reference) prec.clin<-rbind(prec.clin,curves$AppErr$coxph) prec.both<-rbind(prec.both,curves$AppErr$coxph.1) } if(niter>1) { inbs<-apply(inbs,1,mean) prec.null<-apply(prec.null,2,mean) prec.clin<-apply(prec.clin,2,mean) prec.both<-apply(prec.both,2,mean) } list(ibs=inbs, pec.null=prec.null, pec.clin=prec.clin, pec.both=prec.both) } time<-2e2 maxtime<-15e2 set.seed(123) res<-strategyCwithRandomSplit(data=data.vc,niter=100,prop=0.75,maxtime=maxtime,split=time) res$ibs pdf('CLL_approachSD_withSubsampling.pdf') par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0)) plot(seq(0,maxtime,le=time),res$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n') axis(1,at=seq(0,maxtime,le=4),labels=seq(0,maxtime,le=4)) lines(seq(0,maxtime,le=time),res$pec.clin,ty='s',col=2,lwd=2) lines(seq(0,maxtime,le=time),res$pec.both,ty='s',col=3,lwd=2) legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3) dev.off() ##################### # SUBGROUP ANALYSIS # ##################### # can not be performed, due to small sample size