#################################################################################
#                                                                               #
#   Reproducible files to the paper 'A computationally fast variable            #
#   importance test for random forests for high-dimensional data' (2015)        #
#   by Janitza, Celik and Boulesteix                                            #
#                                                                               #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                       #
#                                                                               #
#   File containing all functions for performing studies (without plotting)     #
#                                                                               #
#################################################################################


# function 'apply_approaches' to compute variable importances or p-values according to one of three possible approaches

# input parameters: 
#   - seed: seed for reproducibility purposes
#   - x: design matrix
#   - y: response vector
#   - cv: only used if VIM = "CV". Specifies the number of folds. Chose k = 2 for our novel method (hold-out variable importance)
#   - VIM: either 'CV' (for our novel testing approach), or 'classical' (for the naive testing approach) or 'PIMP' (for Altmann's method, non-parametric and parametric)
#   - mtry: number of candidate predictors that are randomly drawn at each split
#   - nperm: only used if VIM = "PIMP". Number of permutations of the y labels in Altmann's nonparametric approach.

# output: 
#        * for the naive testing approach: vector with importance scores
#        * for the novel testing approach: object returned by function CVPVI
#        * for Altmann's approach: list with two elements:  
#                                           - first element is vector of p-values for the non-parametric approach
#                                           - second element is vector of p-values for the parametric approach

apply_approaches <- function(seed, x, y, cv, VIM, mtry, nperm){
  
  # compute variable importance (for standard approach and CV approach) or p-values (for PIMP approaches)
  
  if(VIM == "CV"){
    
    VI <- CVPVI(x, as.factor(y), k = cv, mtry = mtry, ntree = 5000, replace = FALSE, ncores = 1)
    return(VI)
    
  }else if (VIM == "classical"){
    
    VI <- compute_VI(x = x, y = as.factor(y), mtry = mtry)
    return(VI)  
    
  }else if (VIM == "PIMP"){
    
    # compute original VIs
    set.seed(seed)
    original_VI <- compute_VI(x = x, y = as.factor(y), mtry = mtry)
    
    # compute VIs for permuted response and derive p-value in a non-parametric way
    set.seed(seed)
    y_perm <- sapply(1:nperm, function(x) sample(y))
    
    # compute null importance values
    VI <- apply(y_perm, 2, function(z) compute_VI(x = x, y = as.factor(z), mtry = mtry))
    
    
    # 1. compute pvalues for non-parametric approach
    
    pval_nonparam <- sapply(1:ncol(x), function(s) mean(original_VI[s] <= VI[s,]))
    
    # 2. compute pvalues for parametric approach
    
    standard_dev <- apply(VI, 1, sd)
    mu <- apply(VI, 1, mean)
    
    # compute p-values for each predictor
    pval_param <- sapply(1:ncol(x), function(p){
      used_std <- max(mean(standard_dev), standard_dev[p])
      1 - pnorm(original_VI[p], mu[p], used_std)
    })
    
    # return both pvalues for non-parametric and parametric approaches
    return(list(nonparam = pval_nonparam, param = pval_param))
    
  }else{
    stop("VIM must be specified as either classical, CV, PIMP.")
  }
}


# function 'compute_VI' to compute classical permutation importance scores

# input parameters: 
#   - x: design matrix
#   - y: response vector
#   - mtry: number of candidate predictors that are randomly drawn at each split

# output: vector with elements:
#         * vector containing variable importance scores

compute_VI <- function(x, y, mtry){
  
  # construct random forest
  big_forest <- randomForest(x = x, y = as.factor(y), 
                             importance   = TRUE,
                             replace      = FALSE,
                             mtry         = mtry,
                             ntree        = 5000)
  
  # compute classical permutation importance scores
  imp_big_forest      <- importance(big_forest, scale = FALSE)[,3]
  
}


# function 'null_study' to simulate data and return results for Study I

# input parameters: 
#   - seed: seed for reproducibility purposes
#   - x: design matrix
#   - y: response vector
#   - cv: only used if VIM = "CV". Specifies the number of folds. Chose k = 2 for our novel method (hold-out variable importance)
#   - subset: shall a subset of predictors be used instead of the complete predictor space?
#   - subsetsize: only if subset = TRUE. Specify how many predictors shall be used.
#   - VIM: either 'CV' (for our novel testing approach), or 'classical' (for the naive testing approach) or 'PIMP' (for Altmann's method, non-parametric and parametric)
#   - mtry: number of candidate predictors that are randomly drawn at each split
#   - nperm: only used if VIM = "PIMP". Number of permutations of the y labels in Altmann's nonparametric approach.

# output: 
#        * for the naive testing approach: vector with importance scores
#        * for the novel testing approach: object returned by function CVPVI
#        * for Altmann's approach: list with two elements:  
#                                           - first element is vector of p-values for the non-parametric approach
#                                           - second element is vector of p-values for the parametric approach

null_study <- function(seed, x, y, cv = 2, subset = FALSE, subsetsize = 100, VIM = "CV", mtry, nperm = 500){
  
  # reduce design matrix if using only a subset
  if(subset){
    set.seed(seed)
    predindex <- sample(1:ncol(x), size = subsetsize)
    x <- x[,predindex]
  }
  
  # randomly permute response values
  set.seed(seed)
  y <- sample(y)
  
  # apply selected approach
  res <- apply_approaches(seed = seed, x = x, y = y, cv = cv, VIM = VIM, mtry = mtry, nperm = nperm)
  return(res)
  
}


# function 'power_study' to simulate data and return results for Studies II and III  

# input parameters: 
#   - seed: seed for reproducibility purposes
#   - x: design matrix
#   - cv: only used if VIM = "CV". Specifies the number of folds. Chose k = 2 for our novel method (hold-out variable importance)
#   - nsignal: number of variables with an 'own' effect
#   - subset: shall a subset of predictors be used instead of the complete predictor space?
#   - subsetsize: only if subset = TRUE. Specify how many predictors shall be used.
#   - VIM: either 'CV' (for our novel testing approach), or 'classical' (for the naive testing approach) or 'PIMP' (for Altmann's method, non-parametric and parametric)
#   - independence: for Study II, set independence to FALSE, for Study III set it to TRUE.
#   - mtry: number of candidate predictors that are randomly drawn at each split
#   - effectset: determine effect set for variables with effect
#   - nperm: only used if VIM = "PIMP". Number of permutations of the y labels in Altmann's nonparametric approach.

# output: 
#        * for the naive testing approach: vector with importance scores
#        * for the novel testing approach: object returned by function CVPVI
#        * for Altmann's approach: list with two elements:  
#                                           - first element is vector of p-values for the non-parametric approach
#                                           - second element is vector of p-values for the parametric approach

power_study <- function(seed, x, cv = 2, nsignal = 100, subset = FALSE, subsetsize = 100, VIM = "CV", independence, mtry, effectset = c(-0.5, -1, -2, -3, 0.5, 1, 2, 3), nperm = 500){
  
  # reduce design matrix if using only a subset
  if(subset){
    set.seed(seed)
    predindex <- sample(1:ncol(x), size = subsetsize)
    x <- x[,predindex]
  }
  
  if(independence == TRUE){ # only for Study III
    
    # randomly permute predictor variables to make them independent
    set.seed(seed)
    x <- apply(x, 2, sample)
  }
  
  # scale the design matrix to make effects comparable
  x <- scale(x)
  
  # determine beta coefficients
  beta <- rep(effectset, length = nsignal)[order(abs(rep(effectset, length = nsignal)), decreasing = T)]
  
  # randomly draw nsignal predictors which are defined as signal predictors and reorder data 
  set.seed(seed)
  predno <- sample(1:ncol(x), size = nsignal)
  x <- x[,c(predno, which(!(1:ncol(x)) %in% predno))] # reorder data matrix so that first nsignal predictors are the predictors with effect
  
  # generate response
  set.seed(seed)
  y <- as.factor(rbinom(n = nrow(x), size = 1, p = plogis(as.matrix(x[,1:nsignal]) %*% beta)))
  
  # check if there are enough observations in both classes
  while(min(table(y)) < 10){y <- as.factor(rbinom(n = nrow(x), size = 1, p = plogis(as.matrix(x[,1:nsignal]) %*% beta)))}
  
  # apply selected testing approach
  res <- apply_approaches(seed = seed, x = x, y = y, cv = cv, VIM = VIM, mtry = mtry, nperm = nperm)
  return(res)
}
