### Herold dataset ###
# strategies for the validation of the added predictive value (De Bin et al., BMC Medical Research Methodology, 2014)

library(survival)
library(pec)

load("../../dataset/ccl/data.maerz.Rdata") # load training data (molecular)
load("../../dataset/ccl/pheno.maerz.Rdata") # load training data (clinical)
dataTraining.mol<-data
dataTraining.clin<-Table

load("../../dataset/ccl/testdaten.new.Rdata") # load validation set
dataValid.mol<-testdata
dataValid.clin<-pheno.test
rm(data,Table,testdata,pheno.test)

data.tm<-dataTraining.mol[c('37004_at','226039_at','205255_x_at','221286_s_at','226247_at','212522_at','243010_at','202600_s_at'),]
data.vm<-dataValid.mol[c('SFTPB','MGAT4A','TCF7','MGC29506','PLEKHA1','PDE8A','MSI2','NRIP1'),]
comp.ps.8.herold<-function(sample) c(0.160,-0.151,-0.096,0.089,-0.110,-0.108,0.081,-0.208)%*%sample

ps.8.T<-apply(data.tm,2,comp.ps.8.herold)
ps.8.V<-apply(data.vm,2,comp.ps.8.herold)

data.tc<-cbind(dataTraining.clin[,c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT')],ps.8.T)
colnames(data.tc)<-c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT','ps.8')
data.tc[41,6]<-NA

data.vc<-na.omit(cbind(dataValid.clin[,c('Overall.survival..Tage.ab.Analyse.','L_STAT.ja.1.nein.0..Tod.aus.irgendeinem.Grund.','X5.Dimensionen.FISH.0.13q.solo.1.11q.2.17p.3.T12.4.ohne','Alter.bei.Messung..Jahre.','SEX','VH.Status')],ps.8.V))
colnames(data.vc)<-c('OS','L_STAT','MUT_STAT','AGE','SEX','VH_STAT','ps.8')

rm(data.tm,data.vm,dataValid.clin,dataValid.mol,dataTraining.clin,dataTraining.mol,ps.8.T,ps.8.V)

# Kaplan-Meier curves
pdf('CLL_KM.pdf')
  par(mfrow=c(4,2),mar=c(3,3,3,3),mgp=c(2,1,0))

  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~I(AGE<60),data=data.tc),col=c(2,3),main='Training set - AGE',ylab='overall survival',xlab='day')
  legend('bottomleft',c('AGE<60','AGE>=60'),lty=1, col=c(3,2))
  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~I(AGE<60),data=data.vc),col=c(2,3),main='Validation set - AGE',ylab='overall survival',xlab='day')
  legend('bottomleft',c('AGE<60','AGE>=60'),lty=1,,col=c(3,2))

  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~SEX,data=data.tc),col=c(2,3),main='Training set - SEX',ylab='overall survival',xlab='day')
  legend('bottomleft',c('male','female'),lty=1, col=c(2,3))
  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~SEX,data=data.vc),col=c(2,3),main='Validation set - SEX',ylab='overall survival',xlab='day')
  legend('bottomleft',c('male','female'),lty=1,,col=c(2,3))

  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~MUT_STAT,data=data.tc),col=c(2,3,4,5,6),main='Training set - FISH',ylab='overall survival',xlab='day')
  legend('bottomleft',c('0','1','2','3','4'),lty=1, col=c(2:6),ncol=2,cex=0.6)
  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~MUT_STAT,data=data.vc),col=c(2,3,4,5,6),main='Validation set - FISH',ylab='overall survival',xlab='day')
  legend('bottomleft',c('0','1','2','3','4'),lty=1, col=c(2:6),ncol=2,cex=0.6)

  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~VH_STAT,data=data.tc),col=c(3,2),main='Training set - IGVH',ylab='overall survival',xlab='day')
  legend('bottomleft',c('not mutated','mutated'),lty=1, col=c(3,2))
  plot(survfit(Surv(as.numeric(OS),as.numeric(L_STAT))~VH_STAT,data=data.vc),col=c(3,2),main='Validation set - IGVH',ylab='overall survival',xlab='day')
  legend('bottomleft',c('not mutated','mutated'),lty=1, col=c(3,2))
dev.off()
par(mfrow=c(1,1))

# comparison between sets
pdf('CCL_KaplanMeier_sets.pdf')
plot(survfit(Surv(as.numeric(data.tc$OS),as.numeric(data.tc$L_STAT))~1),col=2,lwd=c(3,1,1),xlab='days',ylab='overall survival')
lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1),col=3,lwd=3)
lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$time,survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$upper,col=3,lty=2)
lines(survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$time,survfit(Surv(as.numeric(data.vc$OS),as.numeric(data.vc$L_STAT))~1)$lower,col=3,lty=2)
legend('bottomleft',c('training set','validation set'),col=c(2,3),lwd=1)
dev.off()

##############
# Strategy A #
##############
# can not be used, due different kind of variable (microarray and LDA data)


##############
# STRATEGY B #
##############
# fit prediction model based on validation data
modBoth.v<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT) + as.numeric(AGE) + as.factor(SEX) + as.factor(VH_STAT) + ps.8, data=data.vc)
summary(modBoth.v)
# test the null hypothesis (with Wald test or likelihood ratio <-- asymptotically equal)
2*(min(pnorm(modBoth.v$coeff[8]/sqrt(modBoth.v$var[8,8])),1-pnorm(modBoth.v$coeff[8]/sqrt(modBoth.v$var[8,8]))))


##############
# STRATEGY C #
##############
modClin.v<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT)+as.numeric(AGE)+as.factor(SEX)+as.factor(VH_STAT),data=data.vc)

# ---------------------------------- #
# 10-fold cross-validation procedure # 
# ---------------------------------- #
set.seed(123)
test.cv<-pec(list(modClin.v,modBoth.v),formula=modBoth.v$formula,data=data.vc,cens.model='marginal',splitMethod='cv10',B=1e2)
# compute the integrated Brier score up to 1500 (value obtained looking at the Kaplan-Maier curves)
ibs(test.cv,times=1500)[,2]

# plot the prediction error curves
pdf('CLL_approachSD.pdf')
  par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0))
  plot(test.cv,legend=F,xlim=c(1,1500))
  legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3)
dev.off()

# -------------------------------------------- #
# random split in learning and validation sets # 
# -------------------------------------------- #
strategyCwithRandomSplit<-function(data,niter,prop,maxtime,split)
{
  inbs<-prec.null<-prec.clin<-prec.both<-NULL
  n<-dim(data)[1]
  for (i in 1:niter)
  {
    label<-sample(c(rep(1,trunc(n*prop)),rep(2,n-trunc(n*prop))))
    modClin.cv<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT)+as.numeric(AGE)+as.factor(SEX)+as.factor(VH_STAT),data=data[label==1,])
    modBoth.cv<-coxph(Surv(as.numeric(OS),as.numeric(L_STAT))~as.factor(MUT_STAT) + as.numeric(AGE) + as.factor(SEX) + as.factor(VH_STAT) + ps.8, data=data[label==1,])
    curves<-pec(list(modClin.v,modBoth.v),formula=modBoth.v$formula,data=data[label==2,],cens.model='marginal',times=seq(0,maxtime,le=split),exact=F)
    inbs<-cbind(inbs,ibs(curves,times=maxtime))
    prec.null<-rbind(prec.null,curves$AppErr$Reference)
    prec.clin<-rbind(prec.clin,curves$AppErr$coxph)
    prec.both<-rbind(prec.both,curves$AppErr$coxph.1)
  }
  if(niter>1)
  {
    inbs<-apply(inbs,1,mean)
    prec.null<-apply(prec.null,2,mean)
    prec.clin<-apply(prec.clin,2,mean)
    prec.both<-apply(prec.both,2,mean)
  }
  list(ibs=inbs, pec.null=prec.null, pec.clin=prec.clin, pec.both=prec.both)
}
time<-2e2
maxtime<-15e2
set.seed(123)
res<-strategyCwithRandomSplit(data=data.vc,niter=100,prop=0.75,maxtime=maxtime,split=time)
res$ibs

pdf('CLL_approachSD_withSubsampling.pdf')
  par(mfrow=c(1,1),mar=c(5,4,4,2)+0.1,mgp=c(3,1,0))
  plot(seq(0,maxtime,le=time),res$pec.null,ty='s',col=1,lwd=2,ylab='Prediction error',xaxt='n',ylim=c(0,0.30),xlab='Time',bty='n')
  axis(1,at=seq(0,maxtime,le=4),labels=seq(0,maxtime,le=4))
  lines(seq(0,maxtime,le=time),res$pec.clin,ty='s',col=2,lwd=2)
  lines(seq(0,maxtime,le=time),res$pec.both,ty='s',col=3,lwd=2)
  legend(0,0.3,c('Kaplan-Meier','clinical model','combined model'),lty=1,col=1:3)
dev.off()


#####################
# SUBGROUP ANALYSIS #
#####################
# can not be performed, due to small sample size