###############################################################################
#                                                                             #
#   Reproducible files to the paper 'Pitfalls of hypothesis tests             #
#   and model selection on bootstrap samples: causes and consequences in      #
#   biometrical applications'  (2014)                                         #
#   by Janitza, Binder and Boulesteix                                         #
#                                                                             #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                     #
#                                                                             #
#   File for reproducing Figures 11 and 12 (Part 1)                           #
#                                                                             #
###############################################################################

# NOTE: running this code requires installation of R package mboost
#       running this code requires sourcing the NHANES data 

# uncomment the following line and set the working directory where the objects shall be stored
# setwd("...") 

# load the NHANES data here
#load("NHANES_data.Rda")


# function to investigate tuning parameter selection using AIC on bootstrap samples

# input parameters: 
#   - data: NHANES data comprising the variables described in Table A.1 and the variable CRP (C-reactive protein)
#   - index: sample index, determining which observations are used for model fitting and which are used for model evaluation 


# output: list with 3 entries
#         - number of boosting steps chosen via AIC
#         - model accuracy (measured in terms of MSE using all observations that are not part of the index vector)
#         - names of selected parameter coefficients in the resulting boosting model

perform.study.AIC <- function(data, index){
  
  require(mboost)
  
  traindata <- data[index,]   # used for model fitting
  testdata  <- data[-index,]  # used for model evaluation
    
  # fit model
  model      <- glmboost(CRP ~ ., data = traindata, control = boost_control(mstop = 1000))
    
  # obtain optimal mstop using AIC
  boost.steps  <- mstop(AIC(model))
  
  # refit model with selected number of boosting steps
  model     <- glmboost(CRP ~ ., data = traindata, control = boost_control(mstop = boost.steps))
  
  # extract selected parameter names in a model
  sel_coefs <- names(coef(model))
  
  # evaluate accuracy of the model (using MSE)
  accuracy  <- mean(predict(model, newdata = testdata) - testdata$CRP)^2
  
  # return selected number of boosting steps, accuracy and the parameter names in a model
  return(list(boost.steps = boost.steps, 
              accuracy    = accuracy, 
              sel_coefs   = sel_coefs))
}


# apply function to original NHANES data (NOTE: accuracy cannot be estimated in this case)

NHANES_original_AIC <- perform.study.AIC(data = data, index = 1:nrow(data))

save(NHANES_original_AIC, file = "NHANES_original_AIC.Rda")


# apply function to B = 1000 bootstrap samples 

NHANES_bootstrap_AIC <- lapply(1:1000, function(z){
  set.seed(z)
  bootindex <- sample(1:nrow(data), size = nrow(data), replace = TRUE) 
  perform.study.AIC(data = data, index = bootindex)
})

save(NHANES_bootstrap_AIC, file = "NHANES_bootstrap_AIC.Rda")


# apply function to B = 1000 subsamples 

NHANES_subsample_AIC <- lapply(1:1000, function(z){
  set.seed(z)
  bootindex <- sample(1:nrow(data), size = floor(0.632 * nrow(data)), replace = FALSE)
  perform.study.AIC(data = data, index = bootindex)
})

save(NHANES_subsample_AIC, file = "NHANES_subsample_AIC.Rda")


