################################################################################
# R script for reproducing the results in ''Over-optimism in bioinformatics:   #
# an illustration''.                                                           #
################################################################################

# Start from an empty global environment so that objects left over from an
# earlier session cannot leak into the analysis.
# NOTE(review): this also wipes the workspace of anyone who source()s the
# script interactively -- run it in a fresh R session.
rm(list=ls())
# Run the project's initialization script. Presumably this is where the
# helpers used further down (just.varingroup, fish.proc.two, ...) are
# defined -- TODO confirm.
source("initialization.r")

################
# 3. Wang data #
################

# Packages are attached in exactly this order so that masking between them
# stays as it was.
library(Biobase)
library(tools)
library(AnnotationDbi)
library(DBI)
library(RSQLite)
library(gdata)
library(class)
library(CMA)
library(SHIP)
library(corpcor)

# Expression data: load the object and show the annotation it declares
# before attaching the annotation package.
load("WangX.RData")
WangX@annotation
library(hgu133a.db)

# Response data.
load("WangY.RData")

# Extract the expression matrix, print its dimensions, and continue with the
# transposed matrix (also kept under the name data_3).
X <- exprs(WangX)
dim(X)
data_3 <- t(X)
X <- data_3

# Response vector used as y below, with its frequency table.
Y <- WangY
table(Y)

# Annotation maps from hgu133a.db, converted to plain lists.
l2 <- as.list(hgu133aPATH)
l1 <- as.list(hgu133aPATH2PROBE)


################################################################################
# Data for the variant rlda.TG1.
# just.varingroup() is defined outside this file; its result is used both as a
# column selector for X and as an index into the list l2 (presumably the
# variables that belong to at least one group -- TODO confirm).
varINgr  <- just.varingroup(l2, X)
# drop = FALSE keeps Xfish1 a matrix even when a single column is selected.
Xfish1   <- X[, varINgr, drop = FALSE]
l2.fish1 <- l2[varINgr]

# Data for the variant rlda.TG2.
# fish.proc.two() is defined outside this file; its result selects columns of
# X and the matching entries of l2.
f2       <- fish.proc.two(l2, X)
# drop = FALSE keeps Xfish2 a matrix even when a single column is selected,
# so the dim() call below always reports two dimensions.
Xfish2   <- X[, f2, drop = FALSE]
dim(Xfish2)
l2.fish2 <- l2[f2]

# Data for the variant rlda.TG3.
# fish.proc.three() is defined outside this file. Its result is turned into a
# list whose names and values are then swapped: the former values become the
# element names (used below as column names of X) and each element holds its
# former name.
# NOTE(review): the swap assumes every element of f3 has length one, so that
# unlist(f3) has exactly one entry per element -- confirm.
f3         <- fish.proc.three(l2, X, l1)
f3         <- as.list(f3)
list.names <- names(f3)
names(f3)  <- unlist(f3)
# seq_along() instead of 1:length(): an empty f3 must not run the loop body
# with the indices 1 and 0.
for (i in seq_along(f3)) {
  f3[[i]] <- list.names[i]
}

# drop = FALSE keeps Xfish3 a matrix even when a single column is selected.
Xfish3     <- X[, names(f3), drop = FALSE]
dim(Xfish3)
l2.fish3   <- f3

# Data for the variant rlda.TG4.
# fish.proc.four() is defined outside this file. Its result is turned into a
# list whose names and values are then swapped: the former values become the
# element names (used below as column names of X) and each element holds its
# former name.
# NOTE(review): the swap assumes every element of f4 has length one, so that
# unlist(f4) has exactly one entry per element -- confirm.
f4         <- fish.proc.four(l2, X, l1)
f4         <- as.list(f4)
list.names <- names(f4)
names(f4)  <- unlist(f4)
# seq_along() instead of 1:length(): an empty f4 must not run the loop body
# with the indices 1 and 0.
for (i in seq_along(f4)) {
  f4[[i]] <- list.names[i]
}

# drop = FALSE keeps Xfish4 a matrix even when a single column is selected.
Xfish4     <- X[, names(f4), drop = FALSE]
dim(Xfish4)
l2.fish4   <- f4


# Data for the variant rlda.TG5: just.varingroup() applied to the TG2 data.
f5       <- just.varingroup(l2.fish2, Xfish2)
# The selection is taken from the full matrix X, so f5 must identify columns
# in a way that is valid for X as well (e.g. by column name) -- TODO confirm.
# drop = FALSE keeps Xfish5 a matrix even when a single column is selected.
Xfish5   <- X[, f5, drop = FALSE]
dim(Xfish5)
# NOTE(review): this indexes l2, whereas the TG6/TG7 steps index l2.fish3 and
# l2.fish4; the two forms agree only if f5 holds names -- confirm.
l2.fish5 <- l2[f5]
length(l2.fish5)

# Data for the variant rlda.TG6: just.varingroup() applied to the TG3 data.
f6       <- just.varingroup(l2.fish3, Xfish3)
# The selection is taken from the full matrix X, so f6 must identify columns
# in a way that is valid for X as well (e.g. by column name) -- TODO confirm.
# drop = FALSE keeps Xfish6 a matrix even when a single column is selected.
Xfish6   <- X[, f6, drop = FALSE]
dim(Xfish6)
l2.fish6 <- l2.fish3[f6]
length(l2.fish6)

# Data for the variant rlda.TG7: just.varingroup() applied to the TG4 data.
f7       <- just.varingroup(l2.fish4, Xfish4)
# The selection is taken from the full matrix X, so f7 must identify columns
# in a way that is valid for X as well (e.g. by column name) -- TODO confirm.
# drop = FALSE keeps Xfish7 a matrix even when a single column is selected.
Xfish7   <- X[, f7, drop = FALSE]
dim(Xfish7)
l2.fish7 <- l2.fish4[f7]
length(l2.fish7)
########################################################################################################################

# Fix the RNG state before the learning sets are drawn.
set.seed(1234)

# Learning sets: stratified 5-fold cross-validation, 10 iterations.
learnset <- GenerateLearningsets(
  y = Y, method = "CV", fold = 5, niter = 10, strat = TRUE
)

# Gene selection on the full data and on each of the seven reduced data sets,
# always over the same learning sets. The method may be switched to
# "wilcox.test" or "limma" instead of "t.test".
geneselect <- GeneSelection(
  X = X, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish1 <- GeneSelection(
  X = Xfish1, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish2 <- GeneSelection(
  X = Xfish2, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish3 <- GeneSelection(
  X = Xfish3, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish4 <- GeneSelection(
  X = Xfish4, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish5 <- GeneSelection(
  X = Xfish5, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish6 <- GeneSelection(
  X = Xfish6, y = Y, learningsets = learnset, method = "t.test"
)
geneselect.fish7 <- GeneSelection(
  X = Xfish7, y = Y, learningsets = learnset, method = "t.test"
)

# Tuning for the scdaCMA and svmCMA classifiers on the full data. An empty
# grids list is passed (presumably selecting the default grids -- TODO
# confirm).
tune.scda <- tune(
  X = X, y = Y, learningsets = learnset, classifier = scdaCMA,
  grids = list()
)
tune.svm <- tune(
  X = X, y = Y, learningsets = learnset, classifier = svmCMA,
  grids = list(), kernel = "radial"
)

########################################################################################################################
# Reference classifiers on the full data, fitted with the tuning results
# computed above. They take no nbgene argument, so the same two fits are
# reused in every comparison further down.
scda <- classification(
  X = X, y = Y, learningsets = learnset, classifier = scdaCMA,
  tuneres = tune.scda
)
svm <- classification(
  X = X, y = Y, learningsets = learnset, classifier = svmCMA,
  tuneres = tune.svm, kernel = "radial"
)

########################################################################################################################

# ---- nbgene = 100 ----------------------------------------------------------
# rldaCMA with the different target types on the full data, dldaCMA as a
# reference, and rldaCMA with type "TargetG" on each reduced data set.

# Full data: targets D, G and F.
cWA.TD100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cWA.TG100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cWA.TF100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)

# Diagonal LDA on the full data.
cdlda100 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 100, classifier = dldaCMA
)

# Reduced data sets 1-7, each with its own pathway list and gene selection.
cWA.fish1.100 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish2.100 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish3.100 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish4.100 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish5.100 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish6.100 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 100, classifier = rldaCMA
)
cWA.fish7.100 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 100, classifier = rldaCMA
)

# Full data: remaining target types.
cWA.TGstar100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cWA.TGpos100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cWA.TGcortest100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)

# Collect all fits (including the scda and svm fits from earlier in the
# script) and tabulate the performance measures, rounded to three digits.
list.WA100 <- list(
  cWA.TD100, cWA.TF100, cdlda100, scda, svm, cWA.TG100,
  cWA.fish1.100, cWA.fish2.100, cWA.fish3.100, cWA.fish4.100,
  cWA.fish5.100, cWA.fish6.100, cWA.fish7.100,
  cWA.TGstar100, cWA.TGpos100, cWA.TGcortest100
)
# NOTE(review): "specifity" is spelled exactly as in the original call; check
# which spelling the compare() in use accepts before changing it.
comp.WA100 <- compare(
  list.WA100,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.WA100, digits = 3)


##########################################################################################################################
# ---- nbgene = 200 ----------------------------------------------------------
# rldaCMA with the different target types on the full data, dldaCMA as a
# reference, and rldaCMA with type "TargetG" on each reduced data set.

# Full data: targets D, G and F.
cWA.TD200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cWA.TG200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cWA.TF200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)

# Diagonal LDA on the full data.
cdlda200 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 200, classifier = dldaCMA
)

# Reduced data sets 1-7, each with its own pathway list and gene selection.
cWA.fish1.200 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish2.200 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish3.200 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish4.200 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish5.200 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish6.200 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 200, classifier = rldaCMA
)
cWA.fish7.200 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 200, classifier = rldaCMA
)

# Full data: remaining target types.
cWA.TGstar200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cWA.TGpos200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cWA.TGcortest200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)

# Collect all fits (including the scda and svm fits from earlier in the
# script) and tabulate the performance measures, rounded to three digits.
list.WA200 <- list(
  cWA.TD200, cWA.TF200, cdlda200, scda, svm, cWA.TG200,
  cWA.fish1.200, cWA.fish2.200, cWA.fish3.200, cWA.fish4.200,
  cWA.fish5.200, cWA.fish6.200, cWA.fish7.200,
  cWA.TGstar200, cWA.TGpos200, cWA.TGcortest200
)
# NOTE(review): "specifity" is spelled exactly as in the original call; check
# which spelling the compare() in use accepts before changing it.
comp.WA200 <- compare(
  list.WA200,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.WA200, digits = 3)


#########################################################################################################################
# ---- nbgene = 500 ----------------------------------------------------------
# rldaCMA with the different target types on the full data, dldaCMA as a
# reference, and rldaCMA with type "TargetG" on each reduced data set.

# Full data: targets D, G and F.
cWA.TD500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cWA.TG500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cWA.TF500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)

# Diagonal LDA on the full data.
cdlda500 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 500, classifier = dldaCMA
)

# Reduced data sets 1-7, each with its own pathway list and gene selection.
cWA.fish1.500 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish2.500 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish3.500 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish4.500 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish5.500 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish6.500 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 500, classifier = rldaCMA
)
cWA.fish7.500 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 500, classifier = rldaCMA
)

# Full data: remaining target types.
cWA.TGstar500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cWA.TGpos500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cWA.TGcortest500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)

# Collect all fits (including the scda and svm fits from earlier in the
# script) and tabulate the performance measures, rounded to three digits.
list.WA500 <- list(
  cWA.TD500, cWA.TF500, cdlda500, scda, svm, cWA.TG500,
  cWA.fish1.500, cWA.fish2.500, cWA.fish3.500, cWA.fish4.500,
  cWA.fish5.500, cWA.fish6.500, cWA.fish7.500,
  cWA.TGstar500, cWA.TGpos500, cWA.TGcortest500
)
# NOTE(review): "specifity" is spelled exactly as in the original call; check
# which spelling the compare() in use accepts before changing it.
comp.WA500 <- compare(
  list.WA500,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.WA500, digits = 3)


###########################################################################################################################
# ---- nbgene = 1000 ---------------------------------------------------------
# rldaCMA with the different target types on the full data, dldaCMA as a
# reference, and rldaCMA with type "TargetG" on each reduced data set.

# Full data: targets D, G and F.
cWA.TD1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cWA.TG1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cWA.TF1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)

# Diagonal LDA on the full data.
cdlda1000 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 1000, classifier = dldaCMA
)

# Reduced data sets 1-7, each with its own pathway list and gene selection.
cWA.fish1.1000 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish2.1000 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish3.1000 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish4.1000 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish5.1000 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish6.1000 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 1000, classifier = rldaCMA
)
cWA.fish7.1000 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 1000, classifier = rldaCMA
)

# Full data: remaining target types.
cWA.TGstar1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cWA.TGpos1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cWA.TGcortest1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)

# Collect all fits (including the scda and svm fits from earlier in the
# script) and tabulate the performance measures, rounded to three digits.
list.WA1000 <- list(
  cWA.TD1000, cWA.TF1000, cdlda1000, scda, svm, cWA.TG1000,
  cWA.fish1.1000, cWA.fish2.1000, cWA.fish3.1000, cWA.fish4.1000,
  cWA.fish5.1000, cWA.fish6.1000, cWA.fish7.1000,
  cWA.TGstar1000, cWA.TGpos1000, cWA.TGcortest1000
)
# NOTE(review): "specifity" is spelled exactly as in the original call; check
# which spelling the compare() in use accepts before changing it.
comp.WA1000 <- compare(
  list.WA1000,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.WA1000, digits = 3)
##########################################################################################################################

