######################################################################################
#                                                                                    #
#           Explorative Data Analysis of the C-to-U Conversion Data                  #
#                                                                                    #
######################################################################################


# Download the data:

arabidopsis_url <- "http://www.biomedcentral.com/content/supplementary/1471-2105-5-132-s4.txt"

# Read the space-separated file; the string "X" codes a missing value.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.table() keeps
# strings as character by default, but the summaries below and the later
# forest fit rely on the categorical columns being factors.
arabidopsis <- read.table(arabidopsis_url,
                          header           = TRUE,
                          sep              = " ",
                          na.strings       = "X",
                          stringsAsFactors = TRUE)

dim(arabidopsis)      # 46 variables, 2694 observations
names(arabidopsis)

summary(arabidopsis)  # missing values present
                      # 1 variable X0 without variation
                      # total: 43 categorical + 3 metric
                      # no. edited: 1347, no. non edited: 1347

sum(!complete.cases(arabidopsis))  # 81 observations with missing values


# Remove cases with missing values and predictors without variation:

arabidopsis <- arabidopsis[complete.cases(arabidopsis), ]
arabidopsis <- arabidopsis[, !(names(arabidopsis) %in% c("X0", "loc"))]

dim(arabidopsis)      # 2613 obs. left, 44 variables
summary(arabidopsis)  # no. edited: 1306, no. non edited: 1307


# Plot distribution of nucleotide predictors
# (columns whose name starts with "X"; they are selected by name so that the
# plots do not depend on where these columns sit in the data frame)
par(mfrow = c(3, 3))
nucleotide_cols <- grep("^X", names(arabidopsis), value = TRUE)
for (predictor in nucleotide_cols) {
  barplot(table(arabidopsis[[predictor]]), main = predictor)
}

# Plot distribution of other predictors
par(mfrow = c(1, 3))
barplot(table(arabidopsis$cp), main = "cp")
boxplot(arabidopsis$fe,        main = "fe")
boxplot(arabidopsis$dfe,       main = "dfe")


######################################################################################
#                                                                                    #
# Variable importances of predictors                                                 #
#                                                                                    #
######################################################################################

require(party)

# Function for computing standard permutation VIs
#
# Fits a forest with party::cforest() to the global data frame 'arabidopsis'
# (response 'edit', all remaining columns as predictors) and returns the
# permutation variable importances of that forest.
#
# Args:
#   seed: value passed to set.seed() before the forest is grown, so that a
#         run can be reproduced.
#
# Returns:
#   The result of varimp(forest, nperm = 10).

get_VIs <- function(seed){

  # Fix the random number generator state for this run
  set.seed(seed)

  # Forest settings: 1000 trees, drawn without replacement (replace = FALSE);
  # mincriterion, minsplit and minbucket are all set to 0
  settings <- cforest_control(teststat     = "quad",
                              testtype     = "Univ",
                              mincriterion = 0,
                              minsplit     = 0,
                              minbucket    = 0,
                              ntree        = 1000,
                              replace      = FALSE)

  # Grow the forest on the full data set
  fitted_forest <- cforest(edit ~ ., data = arabidopsis, controls = settings)

  # Standard error-rate-based permutation VIM, each predictor permuted 10 times
  varimp(fitted_forest, nperm = 10)

}
  
# Call function 'get_VIs()' for different random seeds
# (the third seed used to be written as 0918304; R reads that literal as
# 918304, so the value is spelled without the leading zero here)

seeds <- c(8294359, 9879683, 918304, 1753869, 7728010,
           7889213, 4038444, 3446050, 8429405, 2156172)

# Store results in a list (one element per seed, in the order given above)

VIs_CU_dataset <- lapply(seeds, function(seed) get_VIs(seed = seed))

# Keep the single runs available under their previous names VIs.1 ... VIs.10

for (run in seq_along(VIs_CU_dataset)) {
  assign(paste0("VIs.", run), VIs_CU_dataset[[run]])
}


# IMPORTANT: To save the results set working directory where AUC_VIM folder is stored

save(VIs_CU_dataset, file = "AUC_VIM/Comparison_Studies/Real_Data/R_Objects/VIs_CU_dataset.Rda")


# See whether a VI is zero or negative
# (corresponding to no association between a predictor and the response)

lapply(VIs_CU_dataset, function(x) which(x <= 0))

# -> in all 10 iterations the VIs of all 43 predictors are positive



######################################################################################
#                                                                                    #
# Associations among the 40 nucleotide predictors                                    #
#                                                                                    #
######################################################################################

# Keep only the nucleotide predictors: drop the response and the remaining
# predictors ("loc" and "X0" are listed as well in case they are still present)
variables <- arabidopsis[, !(names(arabidopsis) %in%
                             c("edit", "cp", "fe", "dfe", "loc", "X0")),
                         drop = FALSE]

# Get an overview over the 40 nucleotide predictors
summary(variables)
# -> each predictor has 4 categories (A, C, G, T)
#    (cp, with categories P0, P1, P2, PX, is excluded from 'variables')


# Get chi-square test statistic for each pair of predictors to determine
# associations using Pearson's (corrected) contingency coefficient

n_vars <- ncol(variables)
n      <- nrow(variables)

# Symmetric matrices, filled pair by pair; the diagonal stays NA
test_stat       <- matrix(NA_real_, nrow = n_vars, ncol = n_vars)
correction_term <- matrix(NA_real_, nrow = n_vars, ncol = n_vars)

for (i in seq_len(n_vars)[-1]) {
  for (j in seq_len(i - 1)) {

    pair_table <- table(variables[, c(i, j)])

    # Drop categories that do not occur (e.g. unused factor levels); they
    # would give expected counts of 0 and hence a NaN test statistic
    pair_table <- pair_table[rowSums(pair_table) > 0,
                             colSums(pair_table) > 0,
                             drop = FALSE]

    # The contingency coefficient is bounded by sqrt((k - 1) / k), where k is
    # the smaller number of categories of the two variables
    k <- min(dim(pair_table))
    if (k < 2) {
      next   # no association measurable for a constant variable; leave NA
    }

    test_stat[i, j]       <- test_stat[j, i]       <- chisq.test(pair_table)$statistic
    correction_term[i, j] <- correction_term[j, i] <- sqrt(k / (k - 1))
  }
}

# Pearson's contingency coefficient and its corrected version, which rescales
# the coefficient to the range [0, 1]; with 4 categories per variable the
# correction term equals sqrt(4 / (4 - 1)) for every pair
cc  <- sqrt(test_stat / (n + test_stat))
ccc <- cc * correction_term

# The following plot shows the distribution of contingency coefficients for each variable
# (contingency coefficients of 1 correspond to perfect association between 2 variables,
# contingency coefficients of 0 to no association)

boxplot(ccc, main  = "Associations among predictors",
             names = names(variables),
             las   = 2,
             ylab  = "Corrected contingency coefficient")

# -> associations between nucleotide variables are present
