################################################################################
# R script for reproducing the results in ''Over-optimism in bioinformatics:   #
# an illustration''.                                                           #
################################################################################

# Clear every object from the current workspace, then run initialization.r
# (presumably defines the helper functions used below -- confirm).
rm(list = ls())
source("initialization.r")
##########
# 2.CLL  #
##########
library(Biobase)
library(tools)
library(AnnotationDbi)
library(DBI)
library(RSQLite)
library(gdata)
library(affy)
library(CMA)
library(corpcor)
library(class)
library(SHIP)
library(CLL)
# Load the sCLLex data set and print its summary.
data(sCLLex)
sCLLex
library(hgu95av2.db)

# Expression matrix as returned by exprs(); its dimensions are printed before
# X is replaced by the transposed matrix.
X <- exprs(sCLLex)
dim(X)
data_2 <- t(X)
X <- data_2

# Class labels: the Disease column of the phenotype data.
phenodata <- pData(sCLLex)
Y <- phenodata$Disease

# Annotation maps converted to plain lists.
l1 <- as.list(hgu95av2PATH2PROBE)
l2 <- as.list(hgu95av2PATH)

################################################################################
# Build the reduced inputs for the variants rlda.TG1 - rlda.TG7. Each variant
# gets a column subset of X (Xfish*) and a matching annotation list
# (l2.fish*); both are later passed to classification().

# rlda.TG1: selection returned by just.varingroup() on the full data.
varINgr  <- just.varingroup(l2, X)
Xfish1   <- X[, varINgr]
l2.fish1 <- l2[varINgr]

# rlda.TG2: selection returned by fish.proc.two().
f2       <- fish.proc.two(l2, X)
Xfish2   <- X[, f2]
dim(Xfish2)
l2.fish2 <- l2[f2]

# rlda.TG3: the result of fish.proc.three() is turned into a list and then
# inverted, so that the former values become the names and the former names
# become the values.
f3         <- fish.proc.three(l2, X, l1)
f3         <- as.list(f3)
list.names <- names(f3)
names(f3)  <- unlist(f3)
# seq_along() rather than 1:length(): an empty result must skip the loop
# instead of iterating over c(1, 0).
for (i in seq_along(f3)) {
  f3[[i]] <- list.names[i]
}

Xfish3     <- X[, names(f3)]
dim(Xfish3)
l2.fish3   <- f3

# rlda.TG4: same inversion as for TG3, applied to fish.proc.four().
f4         <- fish.proc.four(l2, X, l1)
f4         <- as.list(f4)
list.names <- names(f4)
names(f4)  <- unlist(f4)
for (i in seq_along(f4)) {
  f4[[i]] <- list.names[i]
}

Xfish4     <- X[, names(f4)]
dim(Xfish4)
l2.fish4   <- f4

# rlda.TG5: just.varingroup() applied to the TG2 data.
# NOTE(review): TG6/TG7 subset l2.fish3/l2.fish4, whereas TG5 subsets l2 and
# not l2.fish2. The two agree only if f5 holds names -- confirm.
f5       <- just.varingroup(l2.fish2, Xfish2)
Xfish5   <- X[, f5]
dim(Xfish5)
l2.fish5 <- l2[f5]
length(l2.fish5)

# rlda.TG6: just.varingroup() applied to the TG3 data.
f6       <- just.varingroup(l2.fish3, Xfish3)
Xfish6   <- X[, f6]
dim(Xfish6)
l2.fish6 <- l2.fish3[f6]
length(l2.fish6)

# rlda.TG7: just.varingroup() applied to the TG4 data.
f7       <- just.varingroup(l2.fish4, Xfish4)
Xfish7   <- X[, f7]
dim(Xfish7)
l2.fish7 <- l2.fish4[f7]
length(l2.fish7)
########################################################################################################################

# Fix the RNG seed, then draw the learning sets shared by every classifier
# below (method "CV", fold = 5, niter = 10, strat = TRUE).
set.seed(1234)
learnset <- GenerateLearningsets(
  y = Y, method = "CV", fold = 5, niter = 10, strat = TRUE
)

# Gene selection on the full data and on each reduced data set.
# The method may be chosen among "t.test", "wilcox.test" and "limma".
geneselect <- GeneSelection(
  X = X, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish1 <- GeneSelection(
  X = Xfish1, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish2 <- GeneSelection(
  X = Xfish2, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish3 <- GeneSelection(
  X = Xfish3, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish4 <- GeneSelection(
  X = Xfish4, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish5 <- GeneSelection(
  X = Xfish5, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish6 <- GeneSelection(
  X = Xfish6, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish7 <- GeneSelection(
  X = Xfish7, y = Y, learningsets = learnset, method = "t.test"
)

# Hyperparameter tuning for the scdaCMA (nsc) and svmCMA classifiers.
tune.scda <- tune(
  X = X, y = Y, learningsets = learnset,
  classifier = scdaCMA, grids = list()
)
tune.svm <- tune(
  X = X, y = Y, learningsets = learnset,
  classifier = svmCMA, grids = list(), kernel = "radial"
)

########################################################################################################################
# Reference classifiers on the full data, using the tuning results from above:
# scdaCMA and svmCMA with a radial kernel.
scda <- classification(
  X = X, y = Y, learningsets = learnset,
  classifier = scdaCMA, tuneres = tune.scda
)
# The comparison lists further down refer to the scdaCMA result as `nsc`,
# which was never assigned; without this alias they fail with
# "object 'nsc' not found". `scda` is kept so existing references still work.
nsc <- scda
svm <- classification(
  X = X, y = Y, learningsets = learnset,
  classifier = svmCMA, tuneres = tune.svm, kernel = "radial"
)

########################################################################################################################
# ---- nbgene = 100 ------------------------------------------------------------
# rldaCMA with the different `type` settings on the full data, plus dldaCMA.
cCLL.TD100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cCLL.TG100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cCLL.TF100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cdlda100 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 100, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each reduced data set (TG1 - TG7).
cCLL.fish1.100 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish2.100 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish3.100 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish4.100 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish5.100 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish6.100 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 100,
  classifier = rldaCMA
)
cCLL.fish7.100 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 100,
  classifier = rldaCMA
)

# Remaining `type` settings on the full data.
cCLL.TGstar100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cCLL.TGpos100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cCLL.TGcortest100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)

# Collect all results and print the comparison rounded to three digits.
list.CLL100 <- list(
  cCLL.TD100, cCLL.TF100, cdlda100, nsc, svm, cCLL.TG100,
  cCLL.fish1.100, cCLL.fish2.100, cCLL.fish3.100, cCLL.fish4.100,
  cCLL.fish5.100, cCLL.fish6.100, cCLL.fish7.100,
  cCLL.TGstar100, cCLL.TGpos100, cCLL.TGcortest100
)
# NOTE(review): "specifity" is kept exactly as written; confirm it matches a
# measure name accepted by compare().
comp.CLL100 <- compare(
  list.CLL100,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.CLL100, digits = 3)


##########################################################################################################################
# ---- nbgene = 200 ------------------------------------------------------------
# rldaCMA with the different `type` settings on the full data, plus dldaCMA.
cCLL.TD200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cCLL.TG200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cCLL.TF200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cdlda200 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 200, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each reduced data set (TG1 - TG7).
cCLL.fish1.200 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish2.200 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish3.200 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish4.200 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish5.200 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish6.200 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 200,
  classifier = rldaCMA
)
cCLL.fish7.200 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 200,
  classifier = rldaCMA
)

# Remaining `type` settings on the full data.
cCLL.TGstar200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cCLL.TGpos200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cCLL.TGcortest200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)

# Collect all results and print the comparison rounded to three digits.
list.CLL200 <- list(
  cCLL.TD200, cCLL.TF200, cdlda200, nsc, svm, cCLL.TG200,
  cCLL.fish1.200, cCLL.fish2.200, cCLL.fish3.200, cCLL.fish4.200,
  cCLL.fish5.200, cCLL.fish6.200, cCLL.fish7.200,
  cCLL.TGstar200, cCLL.TGpos200, cCLL.TGcortest200
)
# NOTE(review): "specifity" is kept exactly as written; confirm it matches a
# measure name accepted by compare().
comp.CLL200 <- compare(
  list.CLL200,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.CLL200, digits = 3)


#########################################################################################################################
# ---- nbgene = 500 ------------------------------------------------------------
# rldaCMA with the different `type` settings on the full data, plus dldaCMA.
cCLL.TD500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cCLL.TG500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cCLL.TF500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cdlda500 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 500, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each reduced data set (TG1 - TG7).
cCLL.fish1.500 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish2.500 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish3.500 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish4.500 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish5.500 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish6.500 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 500,
  classifier = rldaCMA
)
cCLL.fish7.500 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 500,
  classifier = rldaCMA
)

# Remaining `type` settings on the full data.
cCLL.TGstar500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cCLL.TGpos500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cCLL.TGcortest500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)

# Collect all results and print the comparison rounded to three digits.
list.CLL500 <- list(
  cCLL.TD500, cCLL.TF500, cdlda500, nsc, svm, cCLL.TG500,
  cCLL.fish1.500, cCLL.fish2.500, cCLL.fish3.500, cCLL.fish4.500,
  cCLL.fish5.500, cCLL.fish6.500, cCLL.fish7.500,
  cCLL.TGstar500, cCLL.TGpos500, cCLL.TGcortest500
)
# NOTE(review): "specifity" is kept exactly as written; confirm it matches a
# measure name accepted by compare().
comp.CLL500 <- compare(
  list.CLL500,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.CLL500, digits = 3)


###########################################################################################################################
# ---- nbgene = 1000 -----------------------------------------------------------
# rldaCMA with the different `type` settings on the full data, plus dldaCMA.
cCLL.TD1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cCLL.TG1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cCLL.TF1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cdlda1000 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 1000, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each reduced data set (TG1 - TG7).
cCLL.fish1.1000 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish2.1000 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish3.1000 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish4.1000 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish5.1000 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish6.1000 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 1000,
  classifier = rldaCMA
)
cCLL.fish7.1000 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 1000,
  classifier = rldaCMA
)

# Remaining `type` settings on the full data.
cCLL.TGstar1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cCLL.TGpos1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cCLL.TGcortest1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)

# Collect all results and print the comparison rounded to three digits.
list.CLL1000 <- list(
  cCLL.TD1000, cCLL.TF1000, cdlda1000, nsc, svm, cCLL.TG1000,
  cCLL.fish1.1000, cCLL.fish2.1000, cCLL.fish3.1000, cCLL.fish4.1000,
  cCLL.fish5.1000, cCLL.fish6.1000, cCLL.fish7.1000,
  cCLL.TGstar1000, cCLL.TGpos1000, cCLL.TGcortest1000
)
# NOTE(review): "specifity" is kept exactly as written; confirm it matches a
# measure name accepted by compare().
comp.CLL1000 <- compare(
  list.CLL1000,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.CLL1000, digits = 3)
##############################################################################################################################

