######################################################################################
#                                                                                    #
#                         Get VIs for balanced dataset                               #
#                                                                                    #
######################################################################################


# IMPORTANT: Set working directory here
# (relative paths used by this script, e.g. the output file written by
# save(), are resolved against this directory)
setwd("AUC_VIM/Comparison_Studies/Real_Data/")

# library() stops right away if 'party' is not installed; require() would
# only return FALSE with a warning and let the script fail later on.
library("party")

######################################################################################
# Data preparation                                                                   #
######################################################################################

# Download the data:
# - entries equal to "X" are read as missing values (NA)
# - strings are read as factors; this was the read.table() default before
#   R 4.0.0 and is requested explicitly so newer R versions behave the same

arabidopsis_url <- "http://www.biomedcentral.com/content/supplementary/1471-2105-5-132-s4.txt"
arabidopsis     <- read.table(arabidopsis_url, header = TRUE, sep = " ",
                              na.strings = "X", stringsAsFactors = TRUE)

# Remove cases with missing values & variables not to be used in analysis:

arabidopsis <- subset(arabidopsis, complete.cases(arabidopsis))
arabidopsis <- arabidopsis[, !(names(arabidopsis) %in% c("X0", "loc"))]

# Initiate output matrices containing VIs:
# one row per original predictor (every column except the first one) plus one
# row per noise predictor, and 100 columns.
# NOTE(review): nothing in the visible part of this script writes to these
# matrices - confirm whether they are still needed.

n_predictors <- ncol(arabidopsis) - 1

varimp.AUC <- varimp.ER <- matrix(nrow = n_predictors * 2, ncol = 100)
rownames(varimp.AUC) <- rownames(varimp.ER) <- c(names(arabidopsis)[-1],
    paste0("noise", seq_len(n_predictors)))


######################################################################################
#         Loop for creating 100 extended versions of the C-to-U dataset              #                   
######################################################################################


# One slot per extended dataset
arabidopsis_extended <- vector("list", 100)

for (i in seq_len(100)) {

    # Seed with the dataset index so each extended dataset is reproducible
    set.seed(i)

    # Noise predictors: every original predictor column (all columns except
    # the first one) is permuted independently with sample().
    arabidopsis_noise <- data.frame(
        apply(arabidopsis[, -1], MARGIN = 2, FUN = sample),
        stringsAsFactors = TRUE
    )

    # fe and dfe are turned back into numeric columns via their character
    # representation; all other noise columns stay as created above.
    arabidopsis_noise$fe  <- as.numeric(as.character(arabidopsis_noise$fe))
    arabidopsis_noise$dfe <- as.numeric(as.character(arabidopsis_noise$dfe))

    # Name the noise predictors noise1, noise2, ...
    names(arabidopsis_noise) <- paste0("noise", seq_len(ncol(arabidopsis_noise)))

    # Dataset containing the original columns followed by the noise predictors
    arabidopsis_extended[[i]] <- cbind(arabidopsis, arabidopsis_noise)
}


######################################################################################
#                   Function for creating RF and extracting VIs                      #                   
######################################################################################


# Grow a random forest with cforest() on one dataset and compute both
# variable importance measures for it.
#
# Args:
#   data: data frame holding the response column `edit` and the predictors
#         (all remaining columns enter the model via `edit ~ .`).
#
# Returns:
#   A list with two elements:
#     varimp.AUC - result of varimpAUC() on the fitted forest
#     varimp.ER  - result of varimp() on the fitted forest
get_VIs <- function(data){

  # Forest settings: 500 trees, replace = FALSE, and mincriterion, minsplit
  # and minbucket all set to 0
  rf_settings <- cforest_control(teststat     = "quad",
                                 testtype     = "Univ",
                                 mincriterion = 0,
                                 minsplit     = 0,
                                 minbucket    = 0,
                                 ntree        = 500,
                                 replace      = FALSE)

  # Fit the forest with `edit` as response and every other column as predictor
  fit <- cforest(edit ~ ., data = data, controls = rf_settings)

  # Importance measures; the AUC-based one is computed first, then the
  # standard varimp() one (keep this order so a seeded run gives the same
  # results as before)
  auc_importance <- varimpAUC(fit)
  er_importance  <- varimp(fit)

  list(varimp.AUC = auc_importance, varimp.ER = er_importance)
}



##################################################################
# Call function 'get_VIs()' for all 100 extended datasets        #
##################################################################

# Grow one forest per extended dataset; each element of `res` is the list
# returned by get_VIs() (elements varimp.AUC and varimp.ER)
res <- lapply(arabidopsis_extended, get_VIs)

# Save VIs in data frames:
# data.frame(res) binds all importance vectors column-wise; the columns are
# then split by measure using the element name contained in the column names.
# The conversion is done once and reused for both measures.

vi_table <- data.frame(res)
vi_names <- colnames(vi_table)

AUC_VI <- vi_table[, grep("varimp.AUC", vi_names, fixed = TRUE), drop = FALSE]
ER_VI  <- vi_table[, grep("varimp.ER",  vi_names, fixed = TRUE), drop = FALSE]

# Build list object containing VI values

imb_50 <- list(ER_VI = ER_VI, AUC_VI = AUC_VI)

# Make sure the output directory exists before saving, so the results of the
# forest computations are not lost because of a missing folder

output_dir <- "R_Objects"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

save(imb_50, file = file.path(output_dir, "imb_50.Rda"))



