#################################################################################
#                                                                               #
#   Reproducible files to the paper 'A computationally fast variable            #
#   importance test for random forests for high-dimensional data' (2015)        #
#   by Janitza, Celik and Boulesteix                                            #
#                                                                               #
#   Contact: S. Janitza <janitza@ibe.med.uni-muenchen.de>                       #
#                                                                               #
#   File containing all functions for plotting results                          #
#                                                                               #
#################################################################################


# function 'compute_pvalue' for computing p-values using all observed non-positive importance scores as described in Janitza et al.

# input parameters: 
#   - obj: vector of observed importance scores
#   - info: shall the number of non-positive observed importance scores be returned?

# output: 
#        * if info = FALSE: vector of p-values
#        * if info = TRUE: number of non-positive importance scores

compute_pvalue <- function(obj, info = FALSE){
    tryCatch({
       
      # count the number of non-positive importance scores
      nnonpositive <- length(c(which(as.numeric(obj) <= 0)))
      
      # compute empirical cumulative distribution function
      Fn <- ecdf(c(as.numeric(obj)[c(which(as.numeric(obj) <= 0))], -as.numeric(obj)[c(which(as.numeric(obj) < 0))]))
      
      # return either number of non-positive importance scores or the p-values
      if(info) return(nnonpositive) else return(1 - Fn(obj))
    },
             error = function(e){
               warning(paste("Some problem occured. Maybe too less non-positive importance scores..."))
             })
}


# function 'plot_power' to detect significant variables among variables with specified absolute effect size

# input parameters: 
#   - holdout_pval: vector of p-values for the novel testing approach 
#   - naive_pval: vector of p-values for the naive testing approach 
#   - altmann_pval: list with two elements, the first containing p-values for the non-parametric method of Altmann et al., the second for the parametric method of Altmann et al.
#   - nsignal: number of variables with an 'own' effect
#   - main: heading of the figure
#   - effectset: determine the effect set used for variables with effect
#   - legend: shall legend be printed?
#   - ylab: shall y-axis be labelled?
#   - ymax: maximal value for y-axis

# output: 
#        plot with average fraction of variables with p-value less than 0.05 among variables with specified absolute effect size for 4 considered tests

plot_power <- function(holdout_pval, naive_pval, altmann_pval, nsignal = 100, main = "", effectset = c(-0.5, -1, -2, -3, 0.5, 1, 2, 3), legend = FALSE, ylab = FALSE, ymax = 0.4){
  
  # number of predictors
  p <- length(holdout_pval[[1]])
  
  # number of different absolute effect sizes
  abs_effects <- sort(unique(abs(effectset)), decreasing = FALSE)
  neffects <- length(abs_effects)
  
  # number many study repetitions
  nrep <- length(holdout_pval)
  
  # plot ylabel if specified
  if(!ylab){ylabel <- ""}else{ylabel <- "Proportion of rejections"}
  
  # compute mean number of significant variables for each absolute effect size
  
  # novel method:
  novel_mean_noise <- mean(sapply(1:nrep, function(s) mean(holdout_pval[[s]][(nsignal+1):p] <= 0.05)))
  novelmeans_signal <- rev(sapply(0:(neffects-1), function(z) mean(sapply(1:nrep, function(s) mean(holdout_pval[[s]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05)))))
  
  # naive method:
  naive_mean_noise <- mean(sapply(1:nrep, function(s) mean(naive_pval[[s]][(nsignal+1):p] <= 0.05)))
  naive_means_signal <- rev(sapply(0:(neffects-1), function(z) mean(sapply(1:nrep, function(s) mean(naive_pval[[s]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05)))))
  
  # Altman method (nonparametric):
  nonparam_mean_noise <- mean(sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[1]][(nsignal+1):p] <= 0.05)))
  nonparam_means_signal <- rev(sapply(0:(neffects-1), function(z) mean(sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[1]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05)))))
  
  # Altman method (parametric):
  param_mean_noise <- mean(sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[2]][(nsignal+1):p] <= 0.05)))
  param_means_signal <- rev(sapply(0:(neffects-1), function(z) mean(sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[2]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05)))))
  
  # plot mean selection frequencies
  plot(c(0:neffects), c(novel_mean_noise, novelmeans_signal), type = "o", ylab = ylabel, ylim = c(0, ymax), xaxt = "n", xlab = bquote(.("|") ~ beta[j] ~ .("|")), main = main, lty = 1, cex = 0.8, pch = 16)
  points(c(0:neffects), c(naive_mean_noise, naive_means_signal), type = "o", cex = 0.8, lty = 2, pch = 16)   
  points(c(0:neffects), c(nonparam_mean_noise, nonparam_means_signal), type = "o", cex = 0.8, lty = 1, pch = 16, col = "darkgray")
  points(c(0:neffects), c(param_mean_noise, param_means_signal), type = "o", cex = 0.8, lty = 2, pch = 16, col = "darkgray")
  
  axis(side = 1, at = 0:neffects, labels = c(0, abs_effects))
  lines(c(0:neffects), rep(0.05, (neffects + 1)), col = "red", lty = 1, lwd = 0.5)
  
  if(legend) legend("topright", legend = c("New approach", "Naive approach", "Altmann (non-param.)", "Altmann (param.)"), lty = c(1, 2, 1, 2), col = c("black", "black", "darkgray", "darkgray"), pch = 16, bty = "n")
  
}


# function 'plot_power' to detect significant variables among variables with specified absolute effect size

# input parameters: 
#   - holdout_pval: vector of p-values for the novel testing approach 
#   - naive_pval: vector of p-values for the naive testing approach 
#   - altmann_pval: list with two elements, the first containing p-values for the non-parametric method of Altmann et al., the second for the parametric method of Altmann et al.
#   - nsignal: number of variables with an 'own' effect
#   - effectset: determine the effect set used for variables with effect
#   - ylab: shall y-axis be labelled?
#   - mtext: heading of all figures
#   - maxsel: maximal number plotted on the x axis
#   - legend: shall a legend be plotted?

# output: 
#        plot with relative frequency of repetitions in which the specified number of variables with effect was selected (i.e., with p-value less than 0.05) for 4 considered tests

barplot_power <- function(holdout_pval, naive_pval, altmann_pval, nsignal = 100, effectset = c(-0.5, -1, -2, -3, 0.5, 1, 2, 3), ylab = FALSE, mtext = "", maxsel = 10, legend = TRUE){
  
  # number of predictors
  p <- length(holdout_pval[[1]])
  
  # number of different absolute effect sizes
  abs_effects <- sort(unique(abs(effectset)), decreasing = FALSE)
  neffects <- length(abs_effects)
  
  # number many study repetitions
  nrep <- length(holdout_pval)
  
  # plot ylabel if specified
  if(!ylab){ylabel <- ""}else{ylabel <- "Proportion of rejections"}
  
  # compute mean number of significant variables for each absolute effect size
  
  # novel method:
  novel_means_signal <- rev(lapply(0:(neffects-1), function(z) sapply(1:nrep, function(s) mean(holdout_pval[[s]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05))))
  
  # naive method:
  naive_means_signal <- rev(lapply(0:(neffects-1), function(z) sapply(1:nrep, function(s) mean(naive_pval[[s]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05))))
  
  # Altman method (nonparametric):
  nonparam_means_signal <- rev(lapply(0:(neffects-1), function(z) sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[1]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05))))
  
  # Altman method (parametric):
  param_means_signal <- rev(lapply(0:(neffects-1), function(z) sapply(1:length(altmann_pval), function(s) mean(altmann_pval[[s]][[2]][(z * nsignal/neffects + 1):(z * nsignal/neffects + nsignal/neffects)] <= 0.05))))
  

  ofeach <- nsignal/length(abs_effects)
  
  par(mfrow = c(4, 1), mar = c(2, 6, 2, 6), oma = c(0, 0, 5, 0), xpd = TRUE, cex.lab = 1.2)
  
  for(i in length(abs_effects):1){
    
    barplot(cbind(c(table(factor(novel_means_signal[[i]], levels = c(0:maxsel)/ofeach)))/500, 
                  c(table(factor(naive_means_signal[[i]], levels = c(0:maxsel)/ofeach)))/500, 
                  c(table(factor(nonparam_means_signal[[i]], levels = c(0:maxsel)/ofeach)))/200, 
                  c(table(factor(param_means_signal[[i]], levels = c(0:maxsel)/ofeach)))/200), 
            beside = TRUE, col = rep(c("gray90", "gray80", "gray70", "gray50"), each = (maxsel+1)), ylab = "Frequency of\n repetitions", 
            xaxt = "n", main = bquote(.("|") ~ beta[j] ~ .("| = ") ~ .(abs_effects[i])))
    axis(1, at = c(0:maxsel, (maxsel+2):(2*maxsel+2), (2*maxsel+4):(3*maxsel+4), (3*maxsel+6):(4*maxsel+6)) + 1.5, 
         labels = rep(0:maxsel, 4), lwd = 0, line = -0.4)  
    
  }
  if(legend == TRUE){
    legend("topright", legend = c("New approach", "Naive approach", "Altmann (non-param.)", "Altmann (param.)"), 
           col = c("gray90", "gray80", "gray70", "gray50"), pch = 16, bty = "n", inset = c(-0.14, 0))    
  }

  mtext(side = 3, text = mtext, outer = TRUE, line = 2, cex = 1.4)
}


# function 'plot_power' to detect significant variables among variables with specified absolute effect size

# input parameters: 
#   - obj_cv2: object returned by function CVPVI when using k = 2
#   - obj_cv3: object returned by function CVPVI when using k = 3
#   - obj_cv5: object returned by function CVPVI when using k = 5
#   - obj_cv10: object returned by function CVPVI when using k = 10
#   - obj_classical: vector with classical importance scores
#   - xlim: determine the plot region for the x-axis
#   - breaks: setting for histogram, see ?histogram
#   - ylabel: label of figure
#   - heading: shall heading of figure be printed?
#   - xaxt: set NULL if x-axis should be drawn?

# output: 
#        plot showing the distribution of variable importance scores computed from different variable importance measures

plot_cv_hist <- function(obj_cv2, obj_cv3, obj_cv5, obj_cv10, obj_classical, xlim = c(-0.005, 0.005), breaks = seq(from = -0.50025, to = 0.5, by = 0.0005), ylabel = "", heading = FALSE, xaxt = "n"){
  if(heading){
    main = c("Hold-out importance", "3-fold CV importance", "5-fold CV importance", "10-fold CV importance", "Classical importance")}else{
      main = c("", "", "", "", "")}
  
  hist(unlist(sapply(obj_cv2, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[1], col = "gray90", cex.main = 1.2, xaxt = xaxt, yaxt = "n")
  grid()
  hist(unlist(sapply(obj_cv2, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[1], col = "gray90", cex.main = 1.2, add = TRUE, xaxt = xaxt, yaxt = "n")
  mtext(side = 2, text = ylabel, line = 4.5, cex = 1)
  hist(unlist(sapply(obj_cv3, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[2], col = "gray90", cex.main = 1.2, xaxt = xaxt, yaxt = "n")
  grid()
  hist(unlist(sapply(obj_cv3, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[2], col = "gray90", cex.main = 1.2, add = TRUE, xaxt = xaxt, yaxt = "n")
  hist(unlist(sapply(obj_cv5, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[3], col = "gray90", cex.main = 1.2, xaxt = xaxt, yaxt = "n")
  grid()
  hist(unlist(sapply(obj_cv5, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[3], col = "gray90", cex.main = 1.2, add = TRUE, xaxt = xaxt, yaxt = "n")
  hist(unlist(sapply(obj_cv10, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[4], col = "gray90", cex.main = 1.2, xaxt = xaxt, yaxt = "n")
  grid()
  hist(unlist(sapply(obj_cv10, function(z) z$cv_varim)), xlim = xlim, breaks = breaks, main = main[4], col = "gray90", cex.main = 1.2, add = TRUE, xaxt = xaxt, yaxt = "n")
  hist(unlist(obj_classical), xlim = xlim, breaks = breaks, main = main[5], col = "gray90", xaxt = xaxt, yaxt = "n")
  grid()
  hist(unlist(obj_classical), xlim = xlim, breaks = breaks, main = main[5], col = "gray90", add = TRUE, xaxt = xaxt, yaxt = "n")
    
}
