### Metzeler dataset ### # strategies for validation of added predictive value (De Bin et al., BMC Medical Research Methodology, 2014) # library(survival) library(pec) library(CPE) # Metzeler et al (2008) data library(GEOquery) GSE12417<-getGEO('GSE12417') validData.gene<-t(exprs(GSE12417[[1]])) trainData.gene<-cbind(t(exprs(GSE12417[[2]])),t(exprs(GSE12417[[3]]))) # ISSUE, clinical data are not available online trainData.clin<-NULL validData.clin<-NULL # read the information in order to compute the molecular score tmp<-read.table('raw.txt') # data provided in the online supplementary material of Metzeler et al (2008) toCompScore<-data.frame() for(i in 1:86) for (j in 1:6) toCompScore[i,j]<-tmp[((i-1)*6+j),1] colnames(toCompScore)<-c('number','probe_set','gene_symbol','cox-score','weight','sd') toCompScore[,2]<-as.character(toCompScore[,2]) toCompScore[,5]<-as.numeric(as.character(toCompScore[,5])) toCompScore[,6]<-as.numeric(as.character(toCompScore[,6])) trainData.selection<-trainData.genes[,as.character(toCompScore[,2])] validData.selection<-validData.genes[,as.character(toCompScore[,2])] # compute the molecular score means<-colMeans(trainData.selection) stud.train<-trainData.selection stud.valid<-validData.selection for (i in 1:86) { stud.train[,i]<-(trainData.selection[,i]-means[i])/toCompScore[i,6] stud.valid[,i]<-(validData.selection[,i]-means[i])/toCompScore[i,6] } score.train<-stud.train%*%toCompScore[,5] score.valid<-stud.valid%*%toCompScore[,5] trainData.both<-cbind(trainData.clin,score.train) validData.both<-cbind(validData.clin,score.valid) colnames(trainData.both)<-colnames(validData.both)<-c(colnames(trainData.clin),'score') ######################## # preliminary analysis # ######################## pdf('AML_KM.pdf') par(mfrow=c(4,2),mar=c(3,3,3,3),mgp=c(2,1,0)) plot(survfit(Surv(time,status)~I(AGE<60),data=trainData.clin),col=c(2,3),main='Training set - AGE',ylab='overall survival',xlab='day') legend('topright',c('AGE<60','AGE>=60'),lty=1, col=c(3,2)) plot(survfit(Surv(time,status)~I(AGE<60),data=validData.clin),col=c(2,3),main='Validation set - AGE',ylab='overall survival',xlab='day') legend('topright',c('AGE<60','AGE>=60'),lty=1,,col=c(3,2)) plot(survfit(Surv(time,status)~SEX,data=trainData.clin),col=c(3,2),main='Training set - SEX',ylab='overall survival',xlab='day') legend('topright',c('female','male'),lty=1, col=c(3,2)) plot(survfit(Surv(time,status)~SEX,data=validData.clin),col=c(3,2),main='Validation set - SEX',ylab='overall survival',xlab='day') legend('topright',c('female','male'),lty=1,,col=c(3,2)) plot(survfit(Surv(time,status)~FLT3,data=trainData.clin),col=c(3,2),main='Training set - FLT3',ylab='overall survival',xlab='day') legend('topright',c('NON-FLT3-ITD','FLT3-ITD'),lty=1, col=c(3,2)) plot(survfit(Surv(time,status)~FLT3,data=validData.clin),col=c(3,2),main='Validation set - FLT3',ylab='overall survival',xlab='day') legend('topright',c('NON-FLT3-ITD','FLT3-ITD'),lty=1,,col=c(3,2)) plot(survfit(Surv(time,status)~NPM,data=trainData.clin),col=c(2,3),main='Training set - NPM1',ylab='overall survival',xlab='day') legend('topright',c('NPM1 wt','NPM1 mut'),lty=1, col=c(2,3)) plot(survfit(Surv(time,status)~NPM,data=validData.clin),col=c(2,3),main='Validation set - NPM1',ylab='overall survival',xlab='day') legend('topright',c('NPM1 wt','NPM1 mut'),lty=1,,col=c(2,3)) dev.off() # comparison between sets pdf('AML_KaplanMeier_sets_1200.pdf') par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0)) plot(survfit(Surv(trainData.clin$time,trainData.clin$status)~1),col=2, xlim=c(0,1200),lwd=c(3,1,1),xlab='days',ylab='overall survival') lines(survfit(Surv(validData.clin$time,validData.clin$status)~1),col=3,lwd=3) lines(survfit(Surv(validData.clin$time,validData.clin$status)~1)$time,survfit(Surv(validData.clin$time,validData.clin$status)~1)$upper,col=3,lty=2) lines(survfit(Surv(validData.clin$time,validData.clin$status)~1)$time,survfit(Surv(validData.clin$time,validData.clin$status)~1)$lower,col=3,lty=2) legend('bottomleft',c('training set','validation set'),col=c(2,3),lwd=1) dev.off() maxTime<-365*1.5 ############## # Strategy A # ############## # fit prediction models based on training data modClin<-coxph(Surv(time,status)~SEX+AGE+FLT3+NPM,data=trainData.clin) modBoth<-coxph(Surv(time,status)~SEX+AGE+FLT3+NPM+score,data=trainData.both) predEC<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal') # compute the integrated Brier score ibs(predEC,times=maxTime) # plot the prediction error curves pdf('AML_approachA.pdf') plot(predEC,legend=F,col=c(1,2,3),xlab='Time in days',xlim=c(0,maxTime)) legend('topleft',c('Kaplan-Meier','only clinical','with omics score'),col=1:3,lty=1) dev.off() # check the association between the scores and the validation set outcome score.clin<-predict(modClin,newdata=validData.clin,type='lp') score.both<-predict(modBoth,newdata=validData.both,type='lp') # split the observations in low- and high-risk groups based on the median of the scores scoreC<-NULL scoreC[score.clin < median(score.clin)]<-0 scoreC[score.clin >= median(score.clin)]<-1 scoreB<-NULL scoreB[score.both < median(score.both)]<-0 scoreB[score.both >= median(score.both)]<-1 # plot Kaplan-Meier curves for low and high-risk groups pdf('AML_approachB.pdf') par(mfrow=c(1,1)) plot(survfit(Surv(validData.clin$time,validData.clin$status)~scoreC),col=2,ylab='overall survival',xlab='day') lines(survfit(Surv(validData.clin$time,validData.clin$status)~scoreB),col=3) legend('bottomleft',c('without omics score','with omics score'),lty=1,col=c(2,3)) dev.off() # compute the c-index (using the version of Gerds et al. (2013)) cindex(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal') # compute the statistic K phcpe(modClin) phcpe(modBoth) # calibration slope summary(coxph(Surv(validData.clin$time,validData.clin$status)~score.clin)) summary(coxph(Surv(validData.clin$time,validData.clin$status)~score.both)) # function to interpolate the base hazard interpol_baseHaz<-function(base.clin,newtime) { S_0<-data.frame(rep(0,length(newtime)),newtime) colnames(S_0)<-c('hazard','time') for (i in 1:length(S_0$time)) { cntr<-F j<-1 while (!cntr) { if (S_0$time[i]==base.clin$time[j]) { S_0$hazard[i]<-base.clin$hazard[j] cntr<-T } if (S_0$time[i] < base.clin$time[j]&cntr==F) { if(j!=1) S_0$hazard[i]<-base.clin$hazard[j-1]+ (S_0$time[i]-base.clin$time[j-1])*(base.clin$hazard[j]-base.clin$hazard[j-1])/(base.clin$time[j]-base.clin$time[j-1]) else S_0$hazard[i]<-S_0$time[i]*(base.clin$hazard[j])/(base.clin$time[j]) cntr<-T } ifelse(j < length(base.clin$time),j<-j+1,cntr<-T) } } S_0 } # compare Kaplan-Meier curve with the average predicted curves modClin.v<-coxph(Surv(time,status)~.,data=validData.clin) modBoth.v<-coxph(Surv(time,status)~.,data=validData.both) base.clin<-basehaz(modClin,centered=F) base.both<-basehaz(modBoth,centered=F) base.clin.v<-basehaz(modClin.v,centered=F) base.both.v<-basehaz(modBoth.v,centered=F) lambda_clin<-interpol_baseHaz(base.clin,base.clin.v$time) lambda_both<-interpol_baseHaz(base.both,base.both.v$time) score.clin.v<-as.matrix(validData.clin[,-c(1,2)])%*%modClin$coefficients score.both.v<-as.matrix(validData.both[,-c(1,2)])%*%modBoth$coefficients pdf('AML_approachB_KMcalib_ext.pdf') surv.clin<-sapply(exp(-lambda_clin$hazard),function(x,score) x^exp(score), score=score.clin.v) surv.both<-sapply(exp(-lambda_both$hazard),function(x,score) x^exp(score), score=score.both.v) plot(survfit(Surv(time,status)~1,data=validData.clin),ylab='overall survival',xlab='days',lwd=c(3,1,1)) lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lwd=3) lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lwd=3) legend('bottomleft',c('observed (Kaplan-Meier)','predicted without omics score','predicted with omics score'),lty=1,col=c(1,2,3)) surv.clin<-sapply(exp(-base.clin.v$hazard),function(x,score) x^exp(score), score=score.clin.v) surv.both<-sapply(exp(-base.both.v$hazard),function(x,score) x^exp(score), score=score.both.v) lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lty=2,lwd=3) lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lty=2,lwd=3) score.clin.b<-as.matrix(validData.clin[,-c(1,2)])%*%(0.92*modClin$coefficients) score.both.b<-as.matrix(validData.both[,-c(1,2)])%*%c(modBoth$coefficients[1:4],0.50*modBoth.v$coefficients[5]) surv.clin<-sapply(exp(-base.clin.v$hazard),function(x,score) x^exp(score), score=score.clin.b) surv.both<-sapply(exp(-base.both.v$hazard),function(x,score) x^exp(score), score=score.both.b) lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lty=3,lwd=3) lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lty=3,lwd=3) dev.off() ############## # STRATEGY B # ############## # test the null hypothesis (Wald statistics, asymptotically equal to the likelihood ratio test) 2*(min(pnorm(modBoth.v$coeff[5]/sqrt(modBoth.v$var[5,5])),1-pnorm(modBoth.v$coeff[5]/sqrt(modBoth.v$var[5,5])))) # --------------------- # # check for overfitting # # --------------------- # print(cbind(summary(modBoth)$coeff[,c(1,3)],summary(modBoth.v)$coeff[,c(1,3)]),digits=2) # ----------------------------------------- # # check the iteration between sex and score # # ----------------------------------------- # summary(modBoth.v) modBoth.iter<-coxph(Surv(time,status)~SEX*score+AGE+FLT3+NPM,data=validData.both) summary(modBoth.iter) ############## # STRATEGY C # ############## # ---------------------------------- # # 10-fold cross-validation procedure # # ---------------------------------- # set.seed(123) test.cv<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv10',B=1e2) # compute the integrated Brier score print(ibs(test.cv,times=maxTime)[,2],digits=3) # plot the prediction error curves pdf('AML_approachD.pdf') par(mfrow=c(1,1)) plot(test.cv,legend=F,xlim=c(0,547.5)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) dev.off() # --------------------------------- # # 3-fold cross-validation procedure # # --------------------------------- # set.seed(123) test.cv3<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv3',B=1e2) # compute the integrated Brier score print(ibs(test.cv3,times=maxTime)[,2],digits=3) # plot the prediction error curves par(mfrow=c(1,1)) plot(test.cv3,legend=F,xlim=c(0,547.5)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) # --------------------------------- # # 4-fold cross-validation procedure # # --------------------------------- # set.seed(123) test.cv4<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv4',B=1e2) # compute the integrated Brier score print(ibs(test.cv4,times=maxTime)[,2],digits=3) # plot the prediction error curves par(mfrow=c(1,1)) plot(test.cv4,legend=F,xlim=c(0,547.5)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) # ------------------------------------ # # bootstrap cross-validation procedure # # ------------------------------------ # set.seed(123) test.boot<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='Boot632plus',B=1e3) # compute the integrated Brier score print(ibs(test.boot,times=maxTime)[,2],digits=3) # plot the prediction error curves plot(test.boot,legend=F,xlim=c(0,547.5)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) # -------------------------------------------- # # random split in learning and validation sets # # -------------------------------------------- # strategyCwithRandomSplit<-function(data,niter,prop,maxtime,split) { inbs<-prec.null<-prec.clin<-prec.both<-NULL n<-dim(data)[1] for (i in 1:niter) { label<-sample(c(rep(1,trunc(n*prop)),rep(2,n-trunc(n*prop)))) modClin.cv<-coxph(Surv(time,status)~.-score,data=data[label==1,]) modBoth.cv<-coxph(Surv(time,status)~.,data=data[label==1,]) curves<-pec(list(modClin.cv,modBoth.cv),formula=modBoth.cv$formula,data=data[label==2,],cens.model='marginal',times=seq(0,maxtime,le=split),exact=F) inbs<-cbind(inbs,ibs(curves,times=min(maxtime,curves$minmaxtime))) if (length(curves$AppErr$Reference)!=split) { curves$AppErr$Reference<-c(curves$AppErr$Reference,rep(NA,split-length(curves$AppErr$Reference))) curves$AppErr$coxph<-c(curves$AppErr$coxph,rep(NA,split-length(curves$AppErr$coxph))) curves$AppErr$coxph.1<-c(curves$AppErr$coxph.1,rep(NA,split-length(curves$AppErr$coxph.1))) } prec.null<-rbind(prec.null,curves$AppErr$Reference) prec.clin<-rbind(prec.clin,curves$AppErr$coxph) prec.both<-rbind(prec.both,curves$AppErr$coxph.1) } if(niter>1) { inbs<-apply(inbs,1,mean) prec.null<-apply(prec.null,2,mean,na.rm=T) prec.clin<-apply(prec.clin,2,mean,na.rm=T) prec.both<-apply(prec.both,2,mean,na.rm=T) } list(ibs=inbs, pec.null=prec.null, pec.clin=prec.clin, pec.both=prec.both) } time<-2e2 set.seed(123) res<-strategyCwithRandomSplit(data=validData.both,niter=100,prop=2/3,maxtime=maxTime,split=time) print(res$ibs,digits=3) pdf('CLL_approachSD_withSubsampling.pdf') par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0)) plot(seq(0,maxTime,le=time),res$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n') axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500)) lines(seq(0,maxTime,le=time),res$pec.clin,ty='s',col=2,lwd=2) lines(seq(0,maxTime,le=time),res$pec.both,ty='s',col=3,lwd=2) legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3) dev.off() ############################# # +++++++++++++++++++++++++ # # + ##################### + # # + # SUBGROUP ANALYSIS # + # # + ##################### + # # +++++++++++++++++++++++++ # ############################# ################### # subgroup by SEX # ################### ### STRATEGY SA ### # apply strategy A to different subgroup subgroup.A<-function(label,data,data.t) { modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,]) modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,]) pec<-pec(list(modClin,modBoth),formula=modBoth$formula,data[data$SEX==label,],cens.model='marginal') cindex<-cindex(list(modClin,modBoth),formula=modBoth$formula,data=data[data$SEX==label,],cens.model='marginal')$AppCindex Kstat<-rbind(phcpe(modClin),phcpe(modBoth)) ibs<-ibs(pec,times=547.5) list(pec=pec,cindex=cindex,Kstat=Kstat,ibs=ibs) } res<-lapply(c('0','1'),subgroup.A,data=validData.both,data.t=trainData.both) # integrated Brier score print(cbind(res[[1]]$ibs,res[[2]]$ibs),digits=3) # C-index print(cbind(res[[1]]$cindex,res[[2]]$cindex),digits=3) # statistic K print(cbind(res[[1]]$Kstat,res[[2]]$Kstat),digits=3) # plot prediction error curves pdf('AML_approachSA.pdf') par(mfrow=c(1,2)) plot(res[[1]]$pec,legend=F,xlim=c(0,547.5),ylim=c(0,0.4)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) mtext('female',side=3) plot(res[[2]]$pec,legend=F,,xlim=c(0,547.5),ylim=c(0,0.4)) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) mtext('male',side=3) dev.off() par(mfrow=c(1,1)) # calibration slope subgroup.B<-function(label,data,data.t) { modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,]) modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,]) #2: check the association between the scores and the validation set outcome score.clin<-predict(modClin,newdata=data[data$SEX==label,],type='lp') score.both<-predict(modBoth,newdata=data[data$SEX==label,],type='lp') list(clin=coxph(Surv(data$time[data$SEX==label],data$status[data$SEX==label])~score.clin), comb=coxph(Surv(data$time[data$SEX==label],data$status[data$SEX==label])~score.both)) } resCS<-lapply(c('0','1'),subgroup.B,data=validData.both,data.t=trainData.both) print(rbind(cbind(resCS[[1]]$clin$coeff,resCS[[2]]$clin$coeff), cbind(resCS[[1]]$comb$coeff,resCS[[2]]$comb$coeff)),digits=3) ### STRATEGY SB ### subgroup.C<-function(label,data) { modSC<-coxph(Surv(time,status)~score+AGE+FLT3+NPM,data=data[data$SEX==label,]) 2*(min(pnorm(modSC$coeff[1]/sqrt(modSC$var[1,1])),1-pnorm(modSC$coeff[1]/sqrt(modSC$var[1,1])))) } sapply(c('0','1'),subgroup.C,data=validData.both) ### STRATEGY SD ### # 10-fold cross-validation subgroup.D<-function(label,data,data.t) { modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,]) modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,]) pec(list(modClin,modBoth),formula=modBoth$formula,data=data[data$SEX==label,],cens.model='marginal',splitMethod='cv10',B=1e2) } set.seed(123) res<-lapply(c('0','1'),subgroup.D,data=validData.both,data.t=trainData.both) # lapply(res,ibs,times=maxTime) # pdf('AML_approachSD_cv.pdf') par(mfrow=c(1,2)) plot(res[[1]],legend=F, main='female',xlim=c(0,547.5)) mtext('female',side=3,font=2) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) plot(res[[2]],legend=F, main='male',xlim=c(0,547.5)) mtext('male',side=3,font=2) legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3) dev.off() # subsampling set.seed(12) res0<-strategyCwithRandomSplit(data=validData.both[validData.both$SEX=='0',-3],niter=100,prop=2/3,maxtime=maxTime,split=time) set.seed(12) res1<-strategyCwithRandomSplit(data=validData.both[validData.both$SEX=='1',-3],niter=100,prop=2/3,maxtime=maxTime,split=time) print(res0$ibs,digits=3) print(res1$ibs,digits=3) pdf('CLL_approachSD_withSubsampling.pdf') par(mfrow=c(1,2),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0)) plot(seq(0,maxTime,le=time),res0$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n',main='female') axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500)) lines(seq(0,maxTime,le=time),res0$pec.clin,ty='s',col=2,lwd=2) lines(seq(0,maxTime,le=time),res0$pec.both,ty='s',col=3,lwd=2) legend('bottomright',c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3) plot(seq(0,maxTime,le=time),res1$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.40),xlab='Time',bty='n',main='male') axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500)) lines(seq(0,maxTime,le=time),res1$pec.clin,ty='s',col=2,lwd=2) lines(seq(0,maxTime,le=time),res1$pec.both,ty='s',col=3,lwd=2) legend('bottomright',c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3) dev.off()