###############################################################################
#                                                                             #
#   Reproducible files to the paper 'Pitfalls of hypothesis tests             #
#   and model selection on bootstrap samples: causes and consequences in      #
#   biometrical applications'  (2014)                                         #
#   by Janitza, Binder and Boulesteix                                         #
#                                                                             #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                     #
#                                                                             #
#   File for reproducing Figures 2 - 6, Tables 2, A2 (Part 1)                 #
#                                                                             #
###############################################################################

# NOTE: running this code requires installation of R package snowfall
#       running this code requires sourcing the NHANES data 

require(snowfall)

sfInit(parallel = TRUE, cpus = 10, type = "MPI") # if you won't do parallel computing set parallel = FALSE


# uncomment the following line and set the working directory where the objects shall be stored
# setwd("...") 

# load the NHANES data here
#load("NHANES_data.Rda")



# function to investigate bootstrapped p-values from LR test

# input parameters: 
#   - index: sample index, determining which observations are used for p-value computation 

# output: vector containing p-values for testing the univariate association between each of the covariates and the CRP level 

boot <- function(index, xdata = data){
  
  varnames <- names(xdata)[names(xdata) != "CRP"] # contain the names of all covariates
  
  pval <- sapply(varnames, function(z){
    
    model     <- lm(xdata$CRP[index] ~ xdata[index,z])
    nullmodel <- lm(xdata$CRP[index] ~ 1)
    anova(model, nullmodel)$Pr[2]
    
  })
  
  return(pval)
}

sfExport("boot")
sfExport("data")




# USING THE ORIGINAL NHANES DATA (is run in sequential mode)
# ------------------------------

# compute median p-values for the unmodified NHANES data (Figures 2 - 6, Table 2)


# p-value for original sample

orig_pval <- boot(1:nrow(data), xdata = data)


# p-values for bootstrap samples

bootstrapped_p_values <- sapply(1:10000, function(z){
  set.seed(z)
  ind <- sample(1:nrow(data), size = nrow(data), replace = TRUE)
  boot(index = ind, xdata = data)})


# p-values for subsamples

subsampled_p_values <- sapply(1:10000, function(z){
  set.seed(z)
  ind <- sample(1:nrow(data), size = floor(0.632 * nrow(data)), replace = FALSE)
  boot(index = ind, xdata = data)})


# store all objects in a list and save this list object

NHANES_unmodified_p_values <- list(orig_pval = orig_pval, bootstrapped_p_values = bootstrapped_p_values, subsampled_p_values = subsampled_p_values)
save(NHANES_unmodified_p_values, file = "NHANES_unmodified_p_values.Rda")




# USING THE NHANES DATA WITH PERMUTED RESPONSE (1000 permuted original datasets)
# ------------------------------------------------------------------------------


# randomly draw 1000 seeds
set.seed(12345)

seeds <- round(runif(min = 1000, max = 1000000, n = 1000))


# compute median p-values for the first 10 permuted datasets (Figures 3 and 5)

NHANES_1000modified_sig <- sfLapply(seeds, function(z){ # for each seed execute the following
    
  set.seed(z) # set seed for permuting response values (= modified NHANES data)
  data$CRP <- data$CRP[sample(1:nrow(data))] # permute response values
  
  
  # p-value for original sample
  
  orig_pval <- boot(1:nrow(data), xdata = data)
  orig_sig <- sum(orig_pval <= 0.05)
  
  
  # table with significant results for bootstrap samples
  
  bootstrapped_p_values <- sapply(1:10000, function(z){
    set.seed(z)
    ind <- sample(1:nrow(data), size = nrow(data), replace = TRUE)
    boot(index = ind, xdata = data)})
  
  bootstrapped_sig <- table(apply(bootstrapped_p_values, 2, function(z) sum(z <= 0.05))) # exclude samples in which problems occur, e.g. if all observations have value 1 for a binary covariate
  
  
  # table with significant results for subsamples
  
  subsampled_p_values <- sapply(1:10000, function(z){
    set.seed(z)
    ind <- sample(1:nrow(data), size = floor(0.632 * nrow(data)), replace = FALSE)
    boot(index = ind, xdata = data)})
    
  subsampled_sig <- table(apply(subsampled_p_values, 2, function(z) sum(z <= 0.05)))
  
  
  #return this list object which contains all relevant objects
  
  return(list(orig_sig = orig_sig, bootstrapped_sig = bootstrapped_sig, subsampled_sig = subsampled_sig))
    
})

save(NHANES_1000modified_sig, file = "NHANES_1000modified_sig.Rda")




# compute median p-values for the first 10 permuted datasets (Figures 2 and 4, Table A2)

first_10_seeds <- seeds[1:10]

NHANES_10modified_pvalues <- sfLapply(first_10_seeds, function(z){
  
  set.seed(z) # set seed for permuting response values (= modified NHANES data)
  data$CRP <- data$CRP[sample(1:nrow(data))] # permute response values
  
  
  # p-value for original sample
  
  orig_pval <- boot(1:nrow(data), xdata = data)
  
  
  # p-values for bootstrap samples
  
  bootstrapped_p_values <- sapply(1:10000, function(z){
    set.seed(z)
    ind <- sample(1:nrow(data), size = nrow(data), replace = TRUE)
    boot(index = ind, xdata = data)})

  bootstrapped_p_values <- apply(bootstrapped_p_values, 1, median)
  
  
  # p-values for subsamples
  
  subsampled_p_values <- sapply(1:10000, function(z){
    set.seed(z)
    ind <- sample(1:nrow(data), size = floor(0.632 * nrow(data)), replace = FALSE)
    boot(index = ind, xdata = data)})
  
  subsampled_p_values <- apply(subsampled_p_values, 1, median)
  
  
  # return this list object which contains all relevant objects
  
  return(data.frame(orig_pval = orig_pval, bootstrapped_p_values = bootstrapped_p_values, subsampled_p_values = subsampled_p_values))
  
})

save(NHANES_10modified_pvalues, file = "NHANES_10modified_pvalues.Rda")


sfStop() # stop parallel execution

