###############################################################################
#                                                                             #
#   Reproducible files to the paper 'Pitfalls of hypothesis tests             #
#   and model selection on bootstrap samples: causes and consequences in      #
#   biometrical applications'  (2014)                                         #
#   by Janitza, Binder and Boulesteix                                         #
#                                                                             #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                     #
#                                                                             #
#   File for reproducing Figure 13 (Part 1)                                   #
#                                                                             #
###############################################################################

# NOTE: running this code requires installation of R packages snowfall and mboost

# uncomment the following line and set the working directory where the objects shall be stored
# setwd("...") 

require(snowfall)

sfInit(parallel = TRUE, cpus = 10, type = "MPI") # if you won't do parallel computing set parallel = FALSE



# function to investigate tuning parameter selection using AIC on bootstrap samples

# input parameters: 
#   - seed: a seed number for reproducibility purposes (permits that the same results are obtained when running in sequential mode)
#   - p: number of predictor variables
#   - effect: effect size of influential predictors

# output: (niter x 3) matrix
#         1st column contains mstop values for original samples, 
#         2nd column contains mstop values for bootstrap samples, 
#         3rd column contains mstop values for subsamples

perform.study.AIC <- function(seed, p = 200, effect = 1){
  
  require(mboost)
  
  # set up effect sizes
  beta <- rep(0, p)
  beta[((1:p) * 200/p) %in% c(1, 3, 5, 7, 9)]  <- effect
  beta[((1:p) * 200/p) %in% c(2, 4, 6, 8, 10)] <- -effect
  
  set.seed(seed)
  
  # predictors drawn from N(0,1)
  X <- matrix(rnorm(n = 100 * p), ncol = p)
  
  # response
  eta  <- X %*% beta
  prob <- exp(eta)/(1 + exp(eta))
  y    <- apply(prob, 1, function(z) rbinom(prob = z, n = 1, size = 1))
  
  # original sample
  data <- data.frame(id = 1:100, y = y, X = X)
  
  # bootstrap sample
  index     <- sort(sample(1:100, replace = TRUE, size = 100))
  boot.data <- data[index,]
  
  # subsample
  index.sub <- sort(sample(1:100, replace = FALSE, size = 63))
  sub.data  <- data[index.sub,]
  
  # fit models (on original sample, bootstrap sample and subsample) 
  model      <- glmboost(as.factor(y) ~ . - id, data = data,      control = boost_control(mstop = 1000), family = Binomial())
  model.boot <- glmboost(as.factor(y) ~ . - id, data = boot.data, control = boost_control(mstop = 1000), family = Binomial())
  model.sub  <- glmboost(as.factor(y) ~ . - id, data = sub.data,  control = boost_control(mstop = 1000), family = Binomial())
    
  # obtain optimal mstop using AIC
  boost.steps.orig  <- mstop(AIC(model,      "classical"))
  boost.steps.boot  <- mstop(AIC(model.boot, "classical"))
  boost.steps.sub   <- mstop(AIC(model.sub,  "classical"))
  
  return(as.matrix(data.frame(original   = boost.steps.orig, 
                              bootstrap  = boost.steps.boot, 
                              subsample  = boost.steps.sub)))
}


sfExport("perform.study.AIC")

# settings with weak effects

AIC_setting_uncor_weak_p200 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 200, effect = 1)))
save(AIC_setting_uncor_weak_p200, file = "AIC_setting_uncor_weak_p200.Rda")
AIC_setting_uncor_weak_p1000 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 1000, effect = 1)))
save(AIC_setting_uncor_weak_p1000, file = "AIC_setting_uncor_weak_p1000.Rda")
AIC_setting_uncor_weak_p5000 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 5000, effect = 1)))
save(AIC_setting_uncor_weak_p5000, file = "AIC_setting_uncor_weak_p5000.Rda")  


# settings with medium effects

AIC_setting_uncor_medium_p200 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 200, effect = 2)))
save(AIC_setting_uncor_medium_p200, file = "AIC_setting_uncor_medium_p200.Rda")
AIC_setting_uncor_medium_p1000 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 1000, effect = 2)))
save(AIC_setting_uncor_medium_p1000, file = "AIC_setting_uncor_medium_p1000.Rda")
AIC_setting_uncor_medium_p5000 <- t(sfSapply(1:1000, function(z) perform.study.AIC(seed = z, p = 5000, effect = 2)))
save(AIC_setting_uncor_medium_p5000, file = "AIC_setting_uncor_medium_p5000.Rda")  


sfStop()
