###############################################################################
#                                                                             #
#   Reproducible files to the paper 'Pitfalls of hypothesis tests             #
#   and model selection on bootstrap samples: causes and consequences in      #
#   biometrical applications'  (2014)                                         #
#   by Janitza, Binder and Boulesteix                                         #
#                                                                             #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                     #
#                                                                             #
#   File for reproducing Figures 11 and 12 (Part 2)                           #
#                                                                             #
###############################################################################


# uncomment the following line and set the working directory where the R objects are stored
# setwd("...") 


# load relevant R objects
load("NHANES_original_AIC.Rda")
load("NHANES_bootstrap_AIC.Rda")
load("NHANES_subsample_AIC.Rda")

#########################################
#
#  NUMBERS PRESENTED IN THE PAPER AND 
#  ADDITIONAL CALCULATIONS 
#
#########################################

# MODEL COMPLEXITY
# ----------------

#   original sample
#   ---------------

# chosen number of boosting steps and variables in original sample
NHANES_original_AIC$boost.steps # = 309
length(NHANES_original_AIC$sel_coefs) - 1 # = 42 (-1 because intercept shall not be counted)

#   bootstrap samples
#   -----------------

# mean chosen number of boosting steps in bootstrap samples
mean(sapply(NHANES_bootstrap_AIC, function(z) z$boost.steps)) # = 467.773

# number of bootstrap samples in which a higher number of boosting steps than 309 was chosen
sum(sapply(NHANES_bootstrap_AIC, function(z) z$boost.steps > 309)) # = 978

# mean number of parameters included in a model
mean(sapply(NHANES_bootstrap_AIC, function(z) length(z$sel_coefs) - 1)) # = 44.333

# amount of bootstrap samples in which the model includes more than 42 parameters
mean(sapply(NHANES_bootstrap_AIC, function(z) (length(z$sel_coefs) - 1) > 42)) # = 0.683

# amount of bootstrap samples in which the model includes less than 42 parameters
mean(sapply(NHANES_bootstrap_AIC, function(z) (length(z$sel_coefs) - 1) < 42)) # = 0.247

# amount of bootstrap samples in which the model includes exactly 42 parameters
mean(sapply(NHANES_bootstrap_AIC, function(z) (length(z$sel_coefs) - 1) == 42)) # = 0.07


#   subsamples
#   ----------

# mean chosen number of boosting steps in subsamples
mean(sapply(NHANES_subsample_AIC, function(z) z$boost.steps)) # = 254.176

# number of subsamples in which a higher number of boosting steps than 309 was chosen
sum(sapply(NHANES_subsample_AIC, function(z) z$boost.steps > 309)) # = 91

# mean number of parameters included in a model
mean(sapply(NHANES_subsample_AIC, function(z) length(z$sel_coefs) - 1)) # = 34.721

# amount of subsamples in which the model includes more than 42 parameters
mean(sapply(NHANES_subsample_AIC, function(z) (length(z$sel_coefs) - 1) > 42)) # = 0.012

# amount of subsamples in which the model includes less than 42 parameters
mean(sapply(NHANES_subsample_AIC, function(z) (length(z$sel_coefs) - 1) < 42)) # = 0.97

# amount of subsamples in which the model includes exactly 42 parameters
mean(sapply(NHANES_subsample_AIC, function(z) (length(z$sel_coefs) - 1) == 42)) # = 0.018



# MODEL ACCURACY
# --------------

# accuracy for models fit on bootstrap samples (in terms of MSE)
mean(sapply(NHANES_bootstrap_AIC, function(z) z$accuracy)) # = 0.0008505651

# accuracy for models fit on subsamples (in terms of MSE)
mean(sapply(NHANES_subsample_AIC, function(z) z$accuracy)) # = 0.000754256



#########################################
#
#  FIGURES 11 and 12
#
#########################################

# create Figure 11: distribution of number of boosting steps in the bootstrap models and subsample models (uncomment lines to save the figure in the current working directory)

#graphics.off()
#pdf(file = "NHANES_AIC_boosting_steps.pdf", height = 4, width = 3.2)

lablist <- c("bootstrap", "subsample")
par(mar = c(5, 4, 1, 1))

boxplot(sapply(NHANES_bootstrap_AIC, function(z) z$boost.steps), sapply(NHANES_subsample_AIC, function(z) z$boost.steps), ylab = "Optimal number of boosting steps")
abline(h = NHANES_original_AIC$boost.steps, col = "gray", lty = 2)
text(seq(0.9, 1.9, by = 1), -10, labels = lablist, srt = 45, pos = 1, xpd = TRUE, cex = 0.9)

#graphics.off()



# create Figure 12: distribution of number of parameters in the bootstrap models and subsample models (uncomment lines to save the figure in the current working directory)

#graphics.off()
#pdf(file = "NHANES_AIC_model_complexity.pdf", height = 4.5, width = 10)

par(mfrow = c(1,2))

barplot(table(sapply(NHANES_bootstrap_AIC, function(z) length(z$sel_coefs)-1))/1000, ylim = c(0, 0.12), 
        ylab = "Relative frequency", xlab = "Number of parameters in the boosting model", main = "Bootstrap", col = rep(c("gray90", "darkgray", "gray90"), c(9, 1, 15)))
barplot(table(sapply(NHANES_subsample_AIC, function(z) length(z$sel_coefs)-1))/1000, ylim = c(0, 0.12), 
        ylab = "Relative frequency", xlab = "Number of parameters in the boosting model", main = "Subsample", col = rep(c("gray90", "darkgray", "gray90"), c(19, 1, 3)))

#graphics.off()

