###############################################################################
#                                                                             #
#   Reproducible files to the paper 'Pitfalls of hypothesis tests             #
#   and model selection on bootstrap samples: causes and consequences in      #
#   biometrical applications'  (2014)                                         #
#   by Janitza, Binder and Boulesteix                                         #
#                                                                             #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                     #
#                                                                             #
#   File for reproducing Figures 9, 10, A2 and Table A3 (Part 2)              #
#                                                                             #
###############################################################################


# uncomment the following line and set the working directory where the R objects are stored
# setwd("...") 


# load relevant R objects
load("NHANES_AIC.Rda")


# inspect the covariate names and attribute corresponding number of parameters that need to be estimated
names(NHANES_AIC[[1]])
varscale <- c("k = 1", "k = 4", "k = 3", "k = 4", "k = 5", "k = 1", "k = 1", "k = 1", "k = 4", "k = 1", "k = 3", "k = 4", "k = 4", "k = 4", "k = 1", "k = 1", "k = 1", "k = 1", "k = 1", "k = 1", "k = 1", "k = 1", "k = 4", "k = 1", "k = 1", "k = 1", "k = 11", "k = 1")


#########################################
#
#  FIGURE 9
#
#########################################


# create Figure 9: AIC values (in ascending order from top to bottom) obtained for the 28 models (uncomment lines to save the figure in the current working directory)

#graphics.off()
#pdf(file = "NHANES_AIC_ranking.pdf", height = 6, width = 12)

par(mfrow = c(1, 2), mar = c(5, 12, 0.5, 0.2))

plot(c(min(NHANES_AIC$orig_AIC), max(NHANES_AIC$orig_AIC)), c(1, length(NHANES_AIC$orig_AIC)), type="n", xlab="AIC (original sample)", yaxt="n", ylab = "")
grid(ny = c(length(NHANES_AIC$orig_AIC)+1), nx = 0)
points(sort(NHANES_AIC$orig_AIC), c(length(NHANES_AIC$orig_AIC):1), cex = 0.5, col = "black", pch = 16)
axis(2, labels = paste(names(sort(NHANES_AIC$orig_AIC)), " (", varscale[order(NHANES_AIC$orig_AIC)], ")", sep = ""), at = length(NHANES_AIC$orig_AIC):1, las = 2)

plot(c(min(rowMeans(NHANES_AIC$bootstrapped_AIC)), max(rowMeans(NHANES_AIC$bootstrapped_AIC))), c(1, length(rowMeans(NHANES_AIC$bootstrapped_AIC))), 
     type = "n", xlab = "Bootstrapped AIC (averaged value)", yaxt = "n", ylab = "")
grid(ny = c(length(NHANES_AIC$orig_AIC)+1), nx = 0)
points(sort(rowMeans(NHANES_AIC$bootstrapped_AIC)), c(length(rowMeans(NHANES_AIC$bootstrapped_AIC)):1), cex = 0.5, col = "black", pch = 16)
axis(2, labels = paste(names(sort(rowMeans(NHANES_AIC$bootstrapped_AIC))), " (", varscale[order(rowMeans(NHANES_AIC$bootstrapped_AIC))], ")", sep = ""), at = length(rowMeans(NHANES_AIC$bootstrapped_AIC)):1, las = 2)

#graphics.off()

#graphics.off()
#pdf(file = "NHANES_AIC_ranking_subsample.pdf", height = 6, width = 6)

par(mfrow = c(1, 1), mar = c(5, 12, 0.5, 0.2))

plot(c(min(rowMeans(NHANES_AIC$subsampled_AIC)), max(rowMeans(NHANES_AIC$subsampled_AIC))), c(1, length(rowMeans(NHANES_AIC$subsampled_AIC))), 
     type = "n", xlab = "Subsampled AIC (averaged value)", yaxt = "n", ylab = "")
grid(ny = c(length(NHANES_AIC$orig_AIC)+1), nx = 0)
points(sort(rowMeans(NHANES_AIC$subsampled_AIC)), c(length(rowMeans(NHANES_AIC$subsampled_AIC)):1), cex = 0.5, col = "black", pch = 16)
axis(2, labels = paste(names(sort(rowMeans(NHANES_AIC$subsampled_AIC))), " (", varscale[order(rowMeans(NHANES_AIC$subsampled_AIC))], ")", sep = ""), at = length(rowMeans(NHANES_AIC$subsampled_AIC)):1, las = 2)

#graphics.off()



#########################################
#
#  FIGURE 10
#
#########################################


# create Figure 10: Difference between the AIC values for bootstrap samples (uncomment lines to save the figure in the current working directory)

#graphics.off()
#pdf(file = "NHANES_AIC_Bias.pdf", height = 6, width = 8.5)

par(mfrow = c(1, 1), mar = c(5, 12, 0.5, 0.2))

plot(c(min(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$bootstrapped_AIC)), max(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$bootstrapped_AIC))), c(1, length(NHANES_AIC$orig_AIC)), 
     type = "n", xlab = "Difference in AIC (original sample) and averaged bootstrapped AIC", yaxt = "n", ylab = "")

for(i in 1:length(NHANES_AIC$orig_AIC)){
  points(c(0, sort(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$bootstrapped_AIC))[i]), c(i, i), type = "l")
}

axis(2, labels = paste(names(sort(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$bootstrapped_AIC))), " (", 
                       varscale[order(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$bootstrapped_AIC))], ")", sep = ""), at = 1:length(NHANES_AIC$orig_AIC), las = 2)

#graphics.off()



#########################################
#
#  FIGURE A2
#
#########################################


# create Figure A2: Difference between the AIC values for subsamples (uncomment lines to save the figure in the current working directory)

#graphics.off()
#pdf(file = "NHANES_AIC_Bias_subsample.pdf", height = 6, width = 8.5)

par(mfrow = c(1, 1), mar = c(5, 12, 0.5, 0.2))

plot(c(min(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$subsampled_AIC)), max(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$subsampled_AIC))), c(1, length(NHANES_AIC$orig_AIC)), 
     type = "n", xlab = "Difference in AIC (original sample) and averaged subsampled AIC", yaxt = "n", ylab = "")

for(i in 1:length(NHANES_AIC$orig_AIC)){
  points(c(0, sort(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$subsampled_AIC))[i]), c(i, i), type = "l")
}

axis(2, labels = paste(names(sort(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$subsampled_AIC))), " (", 
                       varscale[order(NHANES_AIC$orig_AIC-rowMeans(NHANES_AIC$subsampled_AIC))], ")", sep = ""), at = 1:length(NHANES_AIC$orig_AIC), las = 2)

#graphics.off()



#########################################
#
#  TABLE A3
#
#########################################

# compute numbers in Table A3: Model position according to the AIC

kat2 <- names(sort(NHANES_AIC$orig_AIC)[varscale[order(NHANES_AIC$orig_AIC)] == "k = 1"])
kat4 <- names(sort(NHANES_AIC$orig_AIC)[varscale[order(NHANES_AIC$orig_AIC)]  == "k = 3"])
kat5 <- names(sort(NHANES_AIC$orig_AIC)[varscale[order(NHANES_AIC$orig_AIC)]  == "k = 4"])
kat6 <- names(sort(NHANES_AIC$orig_AIC)[varscale[order(NHANES_AIC$orig_AIC)]  == "k = 5"])
kat12 <- names(sort(NHANES_AIC$orig_AIC)[varscale[order(NHANES_AIC$orig_AIC)] == "k = 11"])

# show part of table for metric and binary variables
(kat2 <- data.frame(
  Original       = sapply(kat2, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)]))),
  Bootstrap      = sapply(kat2, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean))))),
  Bootstrap_diff = (sapply(kat2, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat2, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean)))))),
  Subsample      = sapply(kat2, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean))))),
  Subsample_diff = (sapply(kat2, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat2, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean)))))))
)

# show part of table for 4-category variables
(kat4 <- data.frame(
  Original       = sapply(kat4, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)]))),
  Bootstrap      = sapply(kat4, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean))))),
  Bootstrap_diff = (sapply(kat4, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat4, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean)))))),
  Subsample      = sapply(kat4, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean))))),
  Subsample_diff = (sapply(kat4, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat4, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean)))))))
)

# show part of table for 4-category variables
(kat5 <- data.frame(
  Original       = sapply(kat5, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)]))),
  Bootstrap      = sapply(kat5, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean))))),
  Bootstrap_diff = (sapply(kat5, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat5, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean)))))),
  Subsample      = sapply(kat5, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean))))),
  Subsample_diff = (sapply(kat5, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat5, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean)))))))
)


# show part of table for 4-category variables
(kat6 <- data.frame(
  Original       = sapply(kat6, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)]))),
  Bootstrap      = sapply(kat6, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean))))),
  Bootstrap_diff = (sapply(kat6, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat6, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean)))))),
  Subsample      = sapply(kat6, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean))))),
  Subsample_diff = (sapply(kat6, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat6, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean)))))))
)


# show part of table for 4-category variables
(kat12 <- data.frame(
  Original       = sapply(kat12, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)]))),
  Bootstrap      = sapply(kat12, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean))))),
  Bootstrap_diff = (sapply(kat12, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat12, function(z) which(z == names(sort(apply(NHANES_AIC$bootstrapped_AIC, 1, mean)))))),
  Subsample      = sapply(kat12, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean))))),
  Subsample_diff = (sapply(kat12, function(z) which(z == names(NHANES_AIC$orig_AIC[order(NHANES_AIC$orig_AIC)])))) - (sapply(kat12, function(z) which(z == names(sort(apply(NHANES_AIC$subsampled_AIC, 1, mean)))))))
)

#library(xtable)
#xtable(rbind(kat2, kat4, kat5, kat6, kat12))

