### Metzeler dataset ###
# strategies for validation of added predictive value (De Bin et al., BMC Medical Research Methodology, 2014) #

library(survival)
library(pec)
library(CPE)
# Metzeler et al (2008) data
library(GEOquery)
GSE12417<-getGEO('GSE12417')
validData.gene<-t(exprs(GSE12417[[1]]))
trainData.gene<-cbind(t(exprs(GSE12417[[2]])),t(exprs(GSE12417[[3]])))

# ISSUE, clinical data are not available online
trainData.clin<-NULL
validData.clin<-NULL

# read the information in order to compute the molecular score
tmp<-read.table('raw.txt') # data provided in the online supplementary material of Metzeler et al (2008)
toCompScore<-data.frame()
for(i in 1:86)
  for (j in 1:6)
    toCompScore[i,j]<-tmp[((i-1)*6+j),1]
colnames(toCompScore)<-c('number','probe_set','gene_symbol','cox-score','weight','sd')
toCompScore[,2]<-as.character(toCompScore[,2])
toCompScore[,5]<-as.numeric(as.character(toCompScore[,5]))
toCompScore[,6]<-as.numeric(as.character(toCompScore[,6]))
trainData.selection<-trainData.genes[,as.character(toCompScore[,2])]
validData.selection<-validData.genes[,as.character(toCompScore[,2])]

# compute the molecular score
means<-colMeans(trainData.selection)
stud.train<-trainData.selection
stud.valid<-validData.selection
for (i in 1:86)
{
  stud.train[,i]<-(trainData.selection[,i]-means[i])/toCompScore[i,6]
  stud.valid[,i]<-(validData.selection[,i]-means[i])/toCompScore[i,6]
}  
score.train<-stud.train%*%toCompScore[,5]
score.valid<-stud.valid%*%toCompScore[,5]
trainData.both<-cbind(trainData.clin,score.train)
validData.both<-cbind(validData.clin,score.valid)
colnames(trainData.both)<-colnames(validData.both)<-c(colnames(trainData.clin),'score')

########################
# preliminary analysis #
########################
pdf('AML_KM.pdf')
  par(mfrow=c(4,2),mar=c(3,3,3,3),mgp=c(2,1,0))

  plot(survfit(Surv(time,status)~I(AGE<60),data=trainData.clin),col=c(2,3),main='Training set - AGE',ylab='overall survival',xlab='day')
  legend('topright',c('AGE<60','AGE>=60'),lty=1, col=c(3,2))
  plot(survfit(Surv(time,status)~I(AGE<60),data=validData.clin),col=c(2,3),main='Validation set - AGE',ylab='overall survival',xlab='day')
  legend('topright',c('AGE<60','AGE>=60'),lty=1,,col=c(3,2))
  
  plot(survfit(Surv(time,status)~SEX,data=trainData.clin),col=c(3,2),main='Training set - SEX',ylab='overall survival',xlab='day')
  legend('topright',c('female','male'),lty=1, col=c(3,2))
  plot(survfit(Surv(time,status)~SEX,data=validData.clin),col=c(3,2),main='Validation set - SEX',ylab='overall survival',xlab='day')
  legend('topright',c('female','male'),lty=1,,col=c(3,2))
  
  plot(survfit(Surv(time,status)~FLT3,data=trainData.clin),col=c(3,2),main='Training set - FLT3',ylab='overall survival',xlab='day')
  legend('topright',c('NON-FLT3-ITD','FLT3-ITD'),lty=1, col=c(3,2))
  plot(survfit(Surv(time,status)~FLT3,data=validData.clin),col=c(3,2),main='Validation set - FLT3',ylab='overall survival',xlab='day')
  legend('topright',c('NON-FLT3-ITD','FLT3-ITD'),lty=1,,col=c(3,2))
  
  plot(survfit(Surv(time,status)~NPM,data=trainData.clin),col=c(2,3),main='Training set - NPM1',ylab='overall survival',xlab='day')
  legend('topright',c('NPM1 wt','NPM1 mut'),lty=1, col=c(2,3))
  plot(survfit(Surv(time,status)~NPM,data=validData.clin),col=c(2,3),main='Validation set - NPM1',ylab='overall survival',xlab='day')
  legend('topright',c('NPM1 wt','NPM1 mut'),lty=1,,col=c(2,3))
dev.off()

# comparison between sets
pdf('AML_KaplanMeier_sets_1200.pdf')
  par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0))
  plot(survfit(Surv(trainData.clin$time,trainData.clin$status)~1),col=2, xlim=c(0,1200),lwd=c(3,1,1),xlab='days',ylab='overall survival')
  lines(survfit(Surv(validData.clin$time,validData.clin$status)~1),col=3,lwd=3)
  lines(survfit(Surv(validData.clin$time,validData.clin$status)~1)$time,survfit(Surv(validData.clin$time,validData.clin$status)~1)$upper,col=3,lty=2)
  lines(survfit(Surv(validData.clin$time,validData.clin$status)~1)$time,survfit(Surv(validData.clin$time,validData.clin$status)~1)$lower,col=3,lty=2)
  legend('bottomleft',c('training set','validation set'),col=c(2,3),lwd=1)
dev.off()

maxTime<-365*1.5
##############
# Strategy A #
##############
# fit prediction models based on training data
modClin<-coxph(Surv(time,status)~SEX+AGE+FLT3+NPM,data=trainData.clin)
modBoth<-coxph(Surv(time,status)~SEX+AGE+FLT3+NPM+score,data=trainData.both)
predEC<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal')
# compute the integrated Brier score
ibs(predEC,times=maxTime)
# plot the prediction error curves
pdf('AML_approachA.pdf')
  plot(predEC,legend=F,col=c(1,2,3),xlab='Time in days',xlim=c(0,maxTime))
  legend('topleft',c('Kaplan-Meier','only clinical','with omics score'),col=1:3,lty=1)
dev.off()

# check the association between the scores and the validation set outcome
score.clin<-predict(modClin,newdata=validData.clin,type='lp')
score.both<-predict(modBoth,newdata=validData.both,type='lp')
# split the observations in low- and high-risk groups based on the median of the scores
scoreC<-NULL
scoreC[score.clin < median(score.clin)]<-0
scoreC[score.clin >= median(score.clin)]<-1
scoreB<-NULL
scoreB[score.both < median(score.both)]<-0
scoreB[score.both >= median(score.both)]<-1
# plot Kaplan-Meier curves for low and high-risk groups
pdf('AML_approachB.pdf')
  par(mfrow=c(1,1))
  plot(survfit(Surv(validData.clin$time,validData.clin$status)~scoreC),col=2,ylab='overall survival',xlab='day')
  lines(survfit(Surv(validData.clin$time,validData.clin$status)~scoreB),col=3)
  legend('bottomleft',c('without omics score','with omics score'),lty=1,col=c(2,3))
dev.off()

# compute the c-index (using the version of Gerds et al. (2013))
cindex(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal')

# compute the statistic K
phcpe(modClin)
phcpe(modBoth)

# calibration slope
summary(coxph(Surv(validData.clin$time,validData.clin$status)~score.clin))
summary(coxph(Surv(validData.clin$time,validData.clin$status)~score.both))

# function to interpolate the base hazard 
interpol_baseHaz<-function(base.clin,newtime)
{
  S_0<-data.frame(rep(0,length(newtime)),newtime)
  colnames(S_0)<-c('hazard','time')
  for (i in 1:length(S_0$time))
  {
    cntr<-F
    j<-1
    while (!cntr)
    {
      if (S_0$time[i]==base.clin$time[j])
      {
        S_0$hazard[i]<-base.clin$hazard[j]
        cntr<-T
      }
      if (S_0$time[i] < base.clin$time[j]&cntr==F)
      {
        if(j!=1) S_0$hazard[i]<-base.clin$hazard[j-1]+
          (S_0$time[i]-base.clin$time[j-1])*(base.clin$hazard[j]-base.clin$hazard[j-1])/(base.clin$time[j]-base.clin$time[j-1])
        else S_0$hazard[i]<-S_0$time[i]*(base.clin$hazard[j])/(base.clin$time[j])
        cntr<-T
      }
      ifelse(j < length(base.clin$time),j<-j+1,cntr<-T)
    }
  }
  S_0
}

# compare Kaplan-Meier curve with the average predicted curves
modClin.v<-coxph(Surv(time,status)~.,data=validData.clin)
modBoth.v<-coxph(Surv(time,status)~.,data=validData.both)
base.clin<-basehaz(modClin,centered=F)
base.both<-basehaz(modBoth,centered=F)
base.clin.v<-basehaz(modClin.v,centered=F)
base.both.v<-basehaz(modBoth.v,centered=F)

lambda_clin<-interpol_baseHaz(base.clin,base.clin.v$time)
lambda_both<-interpol_baseHaz(base.both,base.both.v$time)
score.clin.v<-as.matrix(validData.clin[,-c(1,2)])%*%modClin$coefficients
score.both.v<-as.matrix(validData.both[,-c(1,2)])%*%modBoth$coefficients

pdf('AML_approachB_KMcalib_ext.pdf')
  surv.clin<-sapply(exp(-lambda_clin$hazard),function(x,score) x^exp(score), score=score.clin.v)
  surv.both<-sapply(exp(-lambda_both$hazard),function(x,score) x^exp(score), score=score.both.v)
  plot(survfit(Surv(time,status)~1,data=validData.clin),ylab='overall survival',xlab='days',lwd=c(3,1,1))
  lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lwd=3)
  lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lwd=3)
  legend('bottomleft',c('observed (Kaplan-Meier)','predicted without omics score','predicted with omics score'),lty=1,col=c(1,2,3))
  
  surv.clin<-sapply(exp(-base.clin.v$hazard),function(x,score) x^exp(score), score=score.clin.v)
  surv.both<-sapply(exp(-base.both.v$hazard),function(x,score) x^exp(score), score=score.both.v)
  lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lty=2,lwd=3)
  lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lty=2,lwd=3)
  
  score.clin.b<-as.matrix(validData.clin[,-c(1,2)])%*%(0.92*modClin$coefficients)
  score.both.b<-as.matrix(validData.both[,-c(1,2)])%*%c(modBoth$coefficients[1:4],0.50*modBoth.v$coefficients[5])
  surv.clin<-sapply(exp(-base.clin.v$hazard),function(x,score) x^exp(score), score=score.clin.b)
  surv.both<-sapply(exp(-base.both.v$hazard),function(x,score) x^exp(score), score=score.both.b)
  lines(lambda_clin$time,apply(surv.clin,2,mean),ty='s',ylim=c(0,1),col=2,lty=3,lwd=3)
  lines(lambda_both$time,apply(surv.both,2,mean),ty='s',ylim=c(0,1),col=3,lty=3,lwd=3)
dev.off()


##############
# STRATEGY B #
##############
# test the null hypothesis (Wald statistics, asymptotically equal to the likelihood ratio test)
2*(min(pnorm(modBoth.v$coeff[5]/sqrt(modBoth.v$var[5,5])),1-pnorm(modBoth.v$coeff[5]/sqrt(modBoth.v$var[5,5]))))

# --------------------- #
# check for overfitting #
# --------------------- #
print(cbind(summary(modBoth)$coeff[,c(1,3)],summary(modBoth.v)$coeff[,c(1,3)]),digits=2)

# ----------------------------------------- #
# check the iteration between sex and score #
# ----------------------------------------- #
summary(modBoth.v)
modBoth.iter<-coxph(Surv(time,status)~SEX*score+AGE+FLT3+NPM,data=validData.both)
summary(modBoth.iter)


##############
# STRATEGY C #
##############
# ---------------------------------- #
# 10-fold cross-validation procedure # 
# ---------------------------------- #
set.seed(123)
test.cv<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv10',B=1e2)
# compute the integrated Brier score
print(ibs(test.cv,times=maxTime)[,2],digits=3)
# plot the prediction error curves
pdf('AML_approachD.pdf')
  par(mfrow=c(1,1))
  plot(test.cv,legend=F,xlim=c(0,547.5))
  legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)
dev.off()

# --------------------------------- #
# 3-fold cross-validation procedure # 
# --------------------------------- #
set.seed(123)
test.cv3<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv3',B=1e2)
# compute the integrated Brier score
print(ibs(test.cv3,times=maxTime)[,2],digits=3)
# plot the prediction error curves
par(mfrow=c(1,1))
plot(test.cv3,legend=F,xlim=c(0,547.5))
legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)

# --------------------------------- #
# 4-fold cross-validation procedure # 
# --------------------------------- #
set.seed(123)
test.cv4<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='cv4',B=1e2)
# compute the integrated Brier score
print(ibs(test.cv4,times=maxTime)[,2],digits=3)
# plot the prediction error curves
par(mfrow=c(1,1))
plot(test.cv4,legend=F,xlim=c(0,547.5))
legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)

# ------------------------------------ #
# bootstrap cross-validation procedure # 
# ------------------------------------ #
set.seed(123)
test.boot<-pec(list(modClin,modBoth),formula=modBoth$formula,data=validData.both,cens.model='marginal',splitMethod='Boot632plus',B=1e3)
# compute the integrated Brier score
print(ibs(test.boot,times=maxTime)[,2],digits=3)
# plot the prediction error curves
plot(test.boot,legend=F,xlim=c(0,547.5))
legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)

# -------------------------------------------- #
# random split in learning and validation sets # 
# -------------------------------------------- #
strategyCwithRandomSplit<-function(data,niter,prop,maxtime,split)
{
  inbs<-prec.null<-prec.clin<-prec.both<-NULL
  n<-dim(data)[1]
  for (i in 1:niter)
  {
    label<-sample(c(rep(1,trunc(n*prop)),rep(2,n-trunc(n*prop))))
    modClin.cv<-coxph(Surv(time,status)~.-score,data=data[label==1,])
    modBoth.cv<-coxph(Surv(time,status)~.,data=data[label==1,])
    curves<-pec(list(modClin.cv,modBoth.cv),formula=modBoth.cv$formula,data=data[label==2,],cens.model='marginal',times=seq(0,maxtime,le=split),exact=F)
    inbs<-cbind(inbs,ibs(curves,times=min(maxtime,curves$minmaxtime)))
    if (length(curves$AppErr$Reference)!=split)
    {
      curves$AppErr$Reference<-c(curves$AppErr$Reference,rep(NA,split-length(curves$AppErr$Reference)))
      curves$AppErr$coxph<-c(curves$AppErr$coxph,rep(NA,split-length(curves$AppErr$coxph)))
      curves$AppErr$coxph.1<-c(curves$AppErr$coxph.1,rep(NA,split-length(curves$AppErr$coxph.1)))
    }
    prec.null<-rbind(prec.null,curves$AppErr$Reference)
    prec.clin<-rbind(prec.clin,curves$AppErr$coxph)
    prec.both<-rbind(prec.both,curves$AppErr$coxph.1)
  }
  if(niter>1)
  {
    inbs<-apply(inbs,1,mean)
    prec.null<-apply(prec.null,2,mean,na.rm=T)
    prec.clin<-apply(prec.clin,2,mean,na.rm=T)
    prec.both<-apply(prec.both,2,mean,na.rm=T)
  }
  list(ibs=inbs, pec.null=prec.null, pec.clin=prec.clin, pec.both=prec.both)
}
time<-2e2
set.seed(123)
res<-strategyCwithRandomSplit(data=validData.both,niter=100,prop=2/3,maxtime=maxTime,split=time)
print(res$ibs,digits=3)

pdf('CLL_approachSD_withSubsampling.pdf')
  par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0))
  plot(seq(0,maxTime,le=time),res$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n')
  axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500))
  lines(seq(0,maxTime,le=time),res$pec.clin,ty='s',col=2,lwd=2)
  lines(seq(0,maxTime,le=time),res$pec.both,ty='s',col=3,lwd=2)
  legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3)
dev.off()


#############################
# +++++++++++++++++++++++++ #
# + ##################### + #
# + # SUBGROUP ANALYSIS # + #
# + ##################### + #
# +++++++++++++++++++++++++ #
#############################

###################
# subgroup by SEX #
###################

### STRATEGY SA ###
# apply strategy A to different subgroup
subgroup.A<-function(label,data,data.t)
{
  modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,])
  modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,])
  pec<-pec(list(modClin,modBoth),formula=modBoth$formula,data[data$SEX==label,],cens.model='marginal')
  cindex<-cindex(list(modClin,modBoth),formula=modBoth$formula,data=data[data$SEX==label,],cens.model='marginal')$AppCindex
  Kstat<-rbind(phcpe(modClin),phcpe(modBoth))
  ibs<-ibs(pec,times=547.5)
  list(pec=pec,cindex=cindex,Kstat=Kstat,ibs=ibs)
} 
res<-lapply(c('0','1'),subgroup.A,data=validData.both,data.t=trainData.both)
# integrated Brier score
print(cbind(res[[1]]$ibs,res[[2]]$ibs),digits=3)
# C-index
print(cbind(res[[1]]$cindex,res[[2]]$cindex),digits=3)
# statistic K
print(cbind(res[[1]]$Kstat,res[[2]]$Kstat),digits=3)
# plot prediction error curves
pdf('AML_approachSA.pdf')
  par(mfrow=c(1,2))
  plot(res[[1]]$pec,legend=F,xlim=c(0,547.5),ylim=c(0,0.4))
  legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)
  mtext('female',side=3)
  plot(res[[2]]$pec,legend=F,,xlim=c(0,547.5),ylim=c(0,0.4))
  legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)
  mtext('male',side=3)
dev.off()
par(mfrow=c(1,1))

# calibration slope
subgroup.B<-function(label,data,data.t)
{
  modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,])
  modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,])
  #2: check the association between the scores and the validation set outcome
  score.clin<-predict(modClin,newdata=data[data$SEX==label,],type='lp')
  score.both<-predict(modBoth,newdata=data[data$SEX==label,],type='lp')

  list(clin=coxph(Surv(data$time[data$SEX==label],data$status[data$SEX==label])~score.clin),
comb=coxph(Surv(data$time[data$SEX==label],data$status[data$SEX==label])~score.both))
}
resCS<-lapply(c('0','1'),subgroup.B,data=validData.both,data.t=trainData.both)
print(rbind(cbind(resCS[[1]]$clin$coeff,resCS[[2]]$clin$coeff),
            cbind(resCS[[1]]$comb$coeff,resCS[[2]]$comb$coeff)),digits=3)


### STRATEGY SB ###
subgroup.C<-function(label,data)
{
  modSC<-coxph(Surv(time,status)~score+AGE+FLT3+NPM,data=data[data$SEX==label,])
  2*(min(pnorm(modSC$coeff[1]/sqrt(modSC$var[1,1])),1-pnorm(modSC$coeff[1]/sqrt(modSC$var[1,1]))))
}  
sapply(c('0','1'),subgroup.C,data=validData.both)


### STRATEGY SD ###
# 10-fold cross-validation
subgroup.D<-function(label,data,data.t)
{
  modClin<-coxph(Surv(time,status)~AGE+FLT3+NPM,data.t[data.t$SEX==label,])
  modBoth<-coxph(Surv(time,status)~AGE+FLT3+NPM+score,data.t[data.t$SEX==label,])
  pec(list(modClin,modBoth),formula=modBoth$formula,data=data[data$SEX==label,],cens.model='marginal',splitMethod='cv10',B=1e2)
}
set.seed(123)
res<-lapply(c('0','1'),subgroup.D,data=validData.both,data.t=trainData.both)
#
lapply(res,ibs,times=maxTime)
#
pdf('AML_approachSD_cv.pdf')
  par(mfrow=c(1,2))
  plot(res[[1]],legend=F, main='female',xlim=c(0,547.5))
  mtext('female',side=3,font=2)
  legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)
  plot(res[[2]],legend=F, main='male',xlim=c(0,547.5))
  mtext('male',side=3,font=2)
  legend('bottomright',c('null model','clinical model','combined model'),lty=1,col=1:3)
dev.off()

# subsampling
set.seed(12)
res0<-strategyCwithRandomSplit(data=validData.both[validData.both$SEX=='0',-3],niter=100,prop=2/3,maxtime=maxTime,split=time)
set.seed(12)
res1<-strategyCwithRandomSplit(data=validData.both[validData.both$SEX=='1',-3],niter=100,prop=2/3,maxtime=maxTime,split=time)

print(res0$ibs,digits=3)
print(res1$ibs,digits=3)

pdf('CLL_approachSD_withSubsampling.pdf')
  par(mfrow=c(1,2),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0))
  plot(seq(0,maxTime,le=time),res0$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n',main='female')
  axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500))
  lines(seq(0,maxTime,le=time),res0$pec.clin,ty='s',col=2,lwd=2)
  lines(seq(0,maxTime,le=time),res0$pec.both,ty='s',col=3,lwd=2)
  legend('bottomright',c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3)
  plot(seq(0,maxTime,le=time),res1$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.40),xlab='Time',bty='n',main='male')
  axis(1,at=c(0,100,200,300,400,500),labels=c(0,100,200,300,400,500))
  lines(seq(0,maxTime,le=time),res1$pec.clin,ty='s',col=2,lwd=2)
  lines(seq(0,maxTime,le=time),res1$pec.both,ty='s',col=3,lwd=2)
  legend('bottomright',c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3)
dev.off()