################################################################################
# R script for reproducing the results in ''Over-optimism in bioinformatics:   #
# an illustration''.                                                           #
################################################################################
# Clear every object from the calling environment so the run starts from a
# clean workspace, then evaluate the companion script.
# NOTE(review): initialization.r is not visible here; presumably it defines
# the helpers used below (just.varingroup, fish.proc.*, ...) -- confirm.
rm(list = ls())
source("initialization.r")

#################
# 1.Golub_Merge #
#################

library(Biobase)
library(tools)
library(AnnotationDbi)
library(DBI)
library(RSQLite)
library(gdata)
library(golubEsets)
library(CMA)
library(corpcor)
library(class)
library(SHIP)
library(golubEsets)
library(hu6800.db)
# Load the Golub_Merge data set and show it.
data(Golub_Merge)
Golub_Merge

# Pull out the expression matrix and report its dimensions before it is
# transposed.
X <- exprs(Golub_Merge)
dim(X)

# Keep the transposed matrix under both names: data_1 and (from here on) X.
data_1 <- t(X)
X      <- data_1

# Response: the ALL.AML column of the phenotype table.
phenodata <- pData(Golub_Merge)
Y         <- phenodata$ALL.AML

# Annotation maps from hu6800.db converted to plain lists.
l1 <- as.list(hu6800PATH2PROBE)
l2 <- as.list(hu6800PATH)

################################################################################
# Data preparation for the seven "fishing" procedures.
#
# Each procedure produces a reduced expression matrix XfishK and the matching
# annotation list l2.fishK. The selection helpers (just.varingroup,
# fish.proc.two/three/four) are defined outside this file; their return value
# is used as a column/element index below.
#
# All matrix subsets use drop = FALSE so that a selection of exactly one
# variable still yields a matrix rather than a bare vector.

# Now we prepare the data according to fishing procedure 1:
varINgr  <- just.varingroup(l2, X)
Xfish1   <- X[, varINgr, drop = FALSE]
l2.fish1 <- l2[varINgr]

# Now we prepare the data according to fishing procedure 2:
f2       <- fish.proc.two(l2, X)
Xfish2   <- X[, f2, drop = FALSE]
dim(Xfish2)
l2.fish2 <- l2[f2]

# Now we prepare the data according to fishing procedure 3:
# The helper's result is turned into a list and then inverted: the former
# values become the element names and the former names become the values.
# Done in one vectorised step (also well-defined for an empty result, where
# a `1:length(f3)` loop would iterate over c(1, 0)).
f3         <- as.list(fish.proc.three(l2, X, l1))
list.names <- names(f3)
f3         <- setNames(as.list(list.names), unlist(f3))

Xfish3     <- X[, names(f3), drop = FALSE]
dim(Xfish3)
l2.fish3   <- f3

# Now we prepare the data according to fishing procedure 4:
# Same inversion as for procedure 3.
f4         <- as.list(fish.proc.four(l2, X, l1))
list.names <- names(f4)
f4         <- setNames(as.list(list.names), unlist(f4))

Xfish4     <- X[, names(f4), drop = FALSE]
dim(Xfish4)
l2.fish4   <- f4

# Procedures 5-7 apply just.varingroup() to the output of procedures 2-4.
# The returned selection is applied to the very objects that were passed to
# the helper. If the helper returns variable names this is identical to
# indexing the full X / l2; if it returns positions or a logical mask, only
# this form selects the intended variables.
# NOTE(review): confirm what just.varingroup() returns.

# Now we prepare the data according to fishing procedure 5:
f5       <- just.varingroup(l2.fish2, Xfish2)
Xfish5   <- Xfish2[, f5, drop = FALSE]
dim(Xfish5)
l2.fish5 <- l2.fish2[f5]
length(l2.fish5)

# Now we prepare the data according to fishing procedure 6:
f6       <- just.varingroup(l2.fish3, Xfish3)
Xfish6   <- Xfish3[, f6, drop = FALSE]
dim(Xfish6)
l2.fish6 <- l2.fish3[f6]
length(l2.fish6)

# Now we prepare the data according to fishing procedure 7:
f7       <- just.varingroup(l2.fish4, Xfish4)
Xfish7   <- Xfish4[, f7, drop = FALSE]
dim(Xfish7)
l2.fish7 <- l2.fish4[f7]
length(l2.fish7)
################################################################################

# Fix the RNG state so the learning sets are reproducible.
set.seed(1234)
learnset <- GenerateLearningsets(y = Y, method = "CV", fold = 5, niter = 10, strat = TRUE)

# Gene selection on the full data and on each fishing subset.
# The `method` string may be, e.g., "t.test", "wilcox.test" or "limma".
# NOTE(review): the full data uses "wilcox.test" while all fishing subsets use
# "t.test" -- confirm that this asymmetry is intended.
geneselect       <- GeneSelection(X = X,      y = Y, learningsets = learnset, method = "wilcox.test")
geneselect.fish1 <- GeneSelection(X = Xfish1, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish2 <- GeneSelection(X = Xfish2, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish3 <- GeneSelection(X = Xfish3, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish4 <- GeneSelection(X = Xfish4, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish5 <- GeneSelection(X = Xfish5, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish6 <- GeneSelection(X = Xfish6, y = Y, learningsets = learnset, method = "t.test")
geneselect.fish7 <- GeneSelection(X = Xfish7, y = Y, learningsets = learnset, method = "t.test")

# For nsc and svm: hyper-parameter tuning on the full data.
tune.scda <- tune(X = X, y = Y, learningsets = learnset, classifier = scdaCMA, grids = list())
tune.svm  <- tune(X = X, y = Y, learningsets = learnset, classifier = svmCMA, grids = list(),
                  kernel = "radial")

########################################################################################################################
# The comparison lists built further down refer to this result as `nsc`, so it
# is stored under that name; `scda` is kept as an alias for any code that
# still uses the earlier name.
nsc  <- classification(X = X, y = Y, learningsets = learnset, classifier = scdaCMA,
                       tuneres = tune.scda)
scda <- nsc
svm  <- classification(X = X, y = Y, learningsets = learnset, classifier = svmCMA,
                       tuneres = tune.svm, kernel = "radial")

########################################################################################################################
# ---- nbgene = 100 -----------------------------------------------------------
# rldaCMA on the full data, one fit per `type` setting, plus a dldaCMA fit.
cGM.TD100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cGM.TG100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cGM.TF100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cdlda100 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 100, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each of the seven fishing subsets.
cGM.fish1.100 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 100, classifier = rldaCMA
)
cGM.fish2.100 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 100, classifier = rldaCMA
)
cGM.fish3.100 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 100, classifier = rldaCMA
)
cGM.fish4.100 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 100, classifier = rldaCMA
)
cGM.fish5.100 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 100, classifier = rldaCMA
)
cGM.fish6.100 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 100, classifier = rldaCMA
)
cGM.fish7.100 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 100, classifier = rldaCMA
)

# Remaining `type` variants on the full data.
cGM.TGstar100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cGM.TGpos100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)
cGM.TGcortest100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 100, classifier = rldaCMA
)

# Collect all fits and tabulate the performance measures.
# `nsc` and `svm` are not created in this section; they must already exist.
# The measure name "specifity" is passed through verbatim.
list.GM100 <- list(
  cGM.TD100, cGM.TF100, cdlda100, nsc, svm, cGM.TG100,
  cGM.fish1.100, cGM.fish2.100, cGM.fish3.100, cGM.fish4.100,
  cGM.fish5.100, cGM.fish6.100, cGM.fish7.100,
  cGM.TGstar100, cGM.TGpos100, cGM.TGcortest100
)
comp.GM100 <- compare(
  list.GM100,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.GM100, digits = 3)


##########################################################################################################################
# ---- nbgene = 200 -----------------------------------------------------------
# rldaCMA on the full data, one fit per `type` setting, plus a dldaCMA fit.
cGM.TD200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cGM.TG200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cGM.TF200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cdlda200 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 200, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each of the seven fishing subsets.
cGM.fish1.200 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 200, classifier = rldaCMA
)
cGM.fish2.200 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 200, classifier = rldaCMA
)
cGM.fish3.200 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 200, classifier = rldaCMA
)
cGM.fish4.200 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 200, classifier = rldaCMA
)
cGM.fish5.200 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 200, classifier = rldaCMA
)
cGM.fish6.200 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 200, classifier = rldaCMA
)
cGM.fish7.200 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 200, classifier = rldaCMA
)

# Remaining `type` variants on the full data.
cGM.TGstar200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cGM.TGpos200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)
cGM.TGcortest200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 200, classifier = rldaCMA
)

# Collect all fits and tabulate the performance measures.
# `nsc` and `svm` are not created in this section; they must already exist.
# The measure name "specifity" is passed through verbatim.
list.GM200 <- list(
  cGM.TD200, cGM.TF200, cdlda200, nsc, svm, cGM.TG200,
  cGM.fish1.200, cGM.fish2.200, cGM.fish3.200, cGM.fish4.200,
  cGM.fish5.200, cGM.fish6.200, cGM.fish7.200,
  cGM.TGstar200, cGM.TGpos200, cGM.TGcortest200
)
comp.GM200 <- compare(
  list.GM200,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.GM200, digits = 3)


#########################################################################################################################
# ---- nbgene = 500 -----------------------------------------------------------
# rldaCMA on the full data, one fit per `type` setting, plus a dldaCMA fit.
cGM.TD500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cGM.TG500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cGM.TF500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cdlda500 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 500, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each of the seven fishing subsets.
cGM.fish1.500 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 500, classifier = rldaCMA
)
cGM.fish2.500 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 500, classifier = rldaCMA
)
cGM.fish3.500 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 500, classifier = rldaCMA
)
cGM.fish4.500 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 500, classifier = rldaCMA
)
cGM.fish5.500 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 500, classifier = rldaCMA
)
cGM.fish6.500 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 500, classifier = rldaCMA
)
cGM.fish7.500 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 500, classifier = rldaCMA
)

# Remaining `type` variants on the full data.
cGM.TGstar500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cGM.TGpos500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)
cGM.TGcortest500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 500, classifier = rldaCMA
)

# Collect all fits and tabulate the performance measures.
# `nsc` and `svm` are not created in this section; they must already exist.
# The measure name "specifity" is passed through verbatim.
list.GM500 <- list(
  cGM.TD500, cGM.TF500, cdlda500, nsc, svm, cGM.TG500,
  cGM.fish1.500, cGM.fish2.500, cGM.fish3.500, cGM.fish4.500,
  cGM.fish5.500, cGM.fish6.500, cGM.fish7.500,
  cGM.TGstar500, cGM.TGpos500, cGM.TGcortest500
)
comp.GM500 <- compare(
  list.GM500,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.GM500, digits = 3)


###########################################################################################################################
# ---- nbgene = 1000 ----------------------------------------------------------
# rldaCMA on the full data, one fit per `type` setting, plus a dldaCMA fit.
cGM.TD1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cGM.TG1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cGM.TF1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cdlda1000 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 1000, classifier = dldaCMA
)

# rldaCMA with type "TargetG" on each of the seven fishing subsets.
cGM.fish1.1000 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1, nbgene = 1000, classifier = rldaCMA
)
cGM.fish2.1000 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2, nbgene = 1000, classifier = rldaCMA
)
cGM.fish3.1000 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3, nbgene = 1000, classifier = rldaCMA
)
cGM.fish4.1000 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4, nbgene = 1000, classifier = rldaCMA
)
cGM.fish5.1000 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5, nbgene = 1000, classifier = rldaCMA
)
cGM.fish6.1000 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6, nbgene = 1000, classifier = rldaCMA
)
cGM.fish7.1000 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7, nbgene = 1000, classifier = rldaCMA
)

# Remaining `type` variants on the full data.
cGM.TGstar1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cGM.TGpos1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)
cGM.TGcortest1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect, nbgene = 1000, classifier = rldaCMA
)

# Collect all fits and tabulate the performance measures.
# `nsc` and `svm` are not created in this section; they must already exist.
# The measure name "specifity" is passed through verbatim.
list.GM1000 <- list(
  cGM.TD1000, cGM.TF1000, cdlda1000, nsc, svm, cGM.TG1000,
  cGM.fish1.1000, cGM.fish2.1000, cGM.fish3.1000, cGM.fish4.1000,
  cGM.fish5.1000, cGM.fish6.1000, cGM.fish7.1000,
  cGM.TGstar1000, cGM.TGpos1000, cGM.TGcortest1000
)
comp.GM1000 <- compare(
  list.GM1000,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.GM1000, digits = 3)
##########################################################################################################################

