################################################################################
# R script for reproducing the results in ''Over-optimism in bioinformatics:   #
# an illustration''.                                                           #
################################################################################

# Session setup.
# rm() drops every object that ls() reports in the current environment, so the
# analysis below starts from a clean workspace.
# NOTE(review): this also wipes any objects the caller had in that environment.
rm(list = ls())
# initialization.r is not part of this file; presumably it provides the helper
# functions used further down (e.g. just.varingroup(), fish.proc.two()) --
# TODO confirm.
source("initialization.r")

#################
# 4. Singh data #
#################

library(Biobase)
library(tools)
library(AnnotationDbi)
library(DBI)
library(RSQLite)
library(gdata)
library(class)
library(SHIP)
library(CMA)
library(corpcor)
# Load the expression object and show which annotation package it refers to;
# the matching annotation database (hgu95av2.db) is attached right afterwards.
load("SinghX.RData")
SinghX@annotation
library(hgu95av2.db)
load("SinghY.RData")

# exprs() returns the expression matrix of SinghX; print its dimensions, then
# keep the transposed matrix as the working data set.
dim(exprs(SinghX))
data_4 <- t(exprs(SinghX))

# Objects used by the remainder of the script:
#   X  - transposed expression matrix
#   Y  - the object loaded from SinghY.RData (passed as y = Y below)
#   l1 - hgu95av2PATH2PROBE as a plain list
#   l2 - hgu95av2PATH as a plain list
X  <- data_4
Y  <- SinghY
l1 <- as.list(hgu95av2PATH2PROBE)
l2 <- as.list(hgu95av2PATH)


################################################################################
# Data for the variant rlda.TG1.
# just.varingroup() is defined outside this file; its result is used both as a
# column selector for X and as an element selector for l2.
varINgr  <- just.varingroup(l2, X)
l2.fish1 <- l2[varINgr]
Xfish1   <- X[, varINgr]

# Data for the variant rlda.TG2.
# fish.proc.two() is defined outside this file; its result selects the columns
# of X and the matching elements of l2.
f2       <- fish.proc.two(l2, X)
l2.fish2 <- l2[f2]
Xfish2   <- X[, f2]
dim(Xfish2)

# Now we prepare the data for the variant rlda.TG3:
# fish.proc.three() is defined outside this file. The code below relies on it
# returning a named object with one value per element, because
# names(f3) <- unlist(f3) needs exactly as many values as there are elements.
f3         <- fish.proc.three(l2, X, l1)
f3         <- as.list(f3)

# Swap names and values: the former values become the element names and the
# former names become the element values.
list.names <- names(f3)
names(f3)  <- unlist(f3)
# seq_along() instead of 1:length(): the latter yields c(1, 0) for an empty
# list and would index elements that do not exist.
for (i in seq_along(f3)) {
  f3[[i]] <- list.names[i]
}

# The new names select the columns of X; the swapped list is kept as the
# group list that accompanies Xfish3.
Xfish3     <- X[, names(f3)]
dim(Xfish3)
l2.fish3   <- f3

# Now we prepare the data for the variant rlda.TG4:
# fish.proc.four() is defined outside this file. The code below relies on it
# returning a named object with one value per element, because
# names(f4) <- unlist(f4) needs exactly as many values as there are elements.
f4         <- fish.proc.four(l2, X, l1)
f4         <- as.list(f4)

# Swap names and values: the former values become the element names and the
# former names become the element values.
list.names <- names(f4)
names(f4)  <- unlist(f4)
# seq_along() instead of 1:length(): the latter yields c(1, 0) for an empty
# list and would index elements that do not exist.
for (i in seq_along(f4)) {
  f4[[i]] <- list.names[i]
}

# The new names select the columns of X; the swapped list is kept as the
# group list that accompanies Xfish4.
Xfish4     <- X[, names(f4)]
dim(Xfish4)
l2.fish4   <- f4


# Data for the variant rlda.TG5.
# The selector is computed on the fish2 data (l2.fish2 / Xfish2) and then
# applied to the full X and l2.
# NOTE(review): this only lines up if just.varingroup() returns names rather
# than positions within Xfish2 -- confirm against initialization.r.
f5       <- just.varingroup(l2.fish2, Xfish2)
Xfish5   <- X[, f5]
l2.fish5 <- l2[f5]
dim(Xfish5)
length(l2.fish5)

# Data for the variant rlda.TG6.
# The selector is computed on the fish3 data (l2.fish3 / Xfish3); columns are
# taken from the full X, group entries from l2.fish3.
# NOTE(review): this only lines up if just.varingroup() returns names rather
# than positions within Xfish3 -- confirm against initialization.r.
f6       <- just.varingroup(l2.fish3, Xfish3)
Xfish6   <- X[, f6]
l2.fish6 <- l2.fish3[f6]
dim(Xfish6)
length(l2.fish6)

# Now we prepare the data for the variant rlda.TG7:
# The selector is computed on the fish4 data (l2.fish4 / Xfish4); columns are
# taken from the full X, group entries from l2.fish4.
# NOTE(review): this only lines up if just.varingroup() returns names rather
# than positions within Xfish4 -- confirm against initialization.r.
f7       <- just.varingroup(l2.fish4, Xfish4)
Xfish7   <- X[, f7]
l2.fish7 <- l2.fish4[f7]
dim(Xfish7)
length(l2.fish7)
#################################################################################################

# Fix the RNG state so that the learning sets are reproducible, then draw the
# stratified 5-fold cross-validation splits (10 iterations) shared by every
# gene selection, tuning and classification call below.
set.seed(1234)
learnset <- GenerateLearningsets(y = Y, method = "CV", fold = 5, niter = 10,
                                 strat = TRUE)

# Gene-ranking method. Set it once here so that the full data set and all
# seven variable subsets are ranked with the same statistic; previously the
# full data used "wilcox.test" while the subsets used "t.test", which made the
# compared variants differ in more than their covariance target.
# Choose e.g. between "t.test", "wilcox.test" and "limma" (method strings as
# accepted by GeneSelection()).
genesel.method <- "t.test"

geneselect       <- GeneSelection(X = X, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish1 <- GeneSelection(X = Xfish1, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish2 <- GeneSelection(X = Xfish2, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish3 <- GeneSelection(X = Xfish3, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish4 <- GeneSelection(X = Xfish4, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish5 <- GeneSelection(X = Xfish5, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish6 <- GeneSelection(X = Xfish6, y = Y, learningsets = learnset,
                                  method = genesel.method)
geneselect.fish7 <- GeneSelection(X = Xfish7, y = Y, learningsets = learnset,
                                  method = genesel.method)
# Tuning for the two reference classifiers that need it (scdaCMA and svmCMA
# with a radial kernel). grids = list() passes an empty grid specification.
tune.scda <- tune(
  X = X, y = Y, learningsets = learnset,
  classifier = scdaCMA, grids = list()
)
tune.svm <- tune(
  X = X, y = Y, learningsets = learnset,
  classifier = svmCMA, grids = list(), kernel = "radial"
)

# ------------------------------------------------------------------------------
# Fit both reference classifiers on the full data with their tuning results.
# They do not depend on nbgene and are reused in every comparison list below.
scda <- classification(
  X = X, y = Y, learningsets = learnset,
  classifier = scdaCMA, tuneres = tune.scda
)
svm <- classification(
  X = X, y = Y, learningsets = learnset,
  classifier = svmCMA, tuneres = tune.svm, kernel = "radial"
)

########################################################################################################################

# ---- Top 100 genes ----------------------------------------------------------
# All fits share Y and learnset. `type` and `genesINpaths` are handed on to
# rldaCMA, which is defined outside this file (their exact meaning is not
# visible here).

# rlda on the full data with the generic gene ranking.
cSG.TD100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cSG.TG100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cSG.TF100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)

# dlda reference on the same selected genes.
cdlda100 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 100, classifier = dldaCMA
)

# rlda with type "TargetG" on each of the seven prepared subsets; every subset
# brings its own group list and its own gene ranking.
cSG.fish1.100 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish2.100 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish3.100 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish4.100 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish5.100 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish6.100 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 100, classifier = rldaCMA
)
cSG.fish7.100 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 100, classifier = rldaCMA
)

# Remaining rlda types on the full data.
cSG.TGstar100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cSG.TGpos100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)
cSG.TGcortest100 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 100, classifier = rldaCMA
)

# Collect all fits (plus the nbgene-independent scda and svm) and tabulate the
# performance measures, rounded to three digits.
# NOTE(review): "specifity" is kept exactly as in the original call; confirm it
# is the spelling the installed compare() accepts.
list.SG100 <- list(
  cSG.TD100, cSG.TF100, cdlda100, scda, svm, cSG.TG100,
  cSG.fish1.100, cSG.fish2.100, cSG.fish3.100, cSG.fish4.100,
  cSG.fish5.100, cSG.fish6.100, cSG.fish7.100,
  cSG.TGstar100, cSG.TGpos100, cSG.TGcortest100
)
comp.SG100 <- compare(
  list.SG100,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.SG100, digits = 3)


##########################################################################################################################
# ---- Top 200 genes ----------------------------------------------------------
# All fits share Y and learnset. `type` and `genesINpaths` are handed on to
# rldaCMA, which is defined outside this file (their exact meaning is not
# visible here).

# rlda on the full data with the generic gene ranking.
cSG.TD200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cSG.TG200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cSG.TF200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)

# dlda reference on the same selected genes.
cdlda200 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 200, classifier = dldaCMA
)

# rlda with type "TargetG" on each of the seven prepared subsets; every subset
# brings its own group list and its own gene ranking.
cSG.fish1.200 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish2.200 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish3.200 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish4.200 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish5.200 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish6.200 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 200, classifier = rldaCMA
)
cSG.fish7.200 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 200, classifier = rldaCMA
)

# Remaining rlda types on the full data.
cSG.TGstar200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cSG.TGpos200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)
cSG.TGcortest200 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 200, classifier = rldaCMA
)

# Collect all fits (plus the nbgene-independent scda and svm) and tabulate the
# performance measures, rounded to three digits.
# NOTE(review): "specifity" is kept exactly as in the original call; confirm it
# is the spelling the installed compare() accepts.
list.SG200 <- list(
  cSG.TD200, cSG.TF200, cdlda200, scda, svm, cSG.TG200,
  cSG.fish1.200, cSG.fish2.200, cSG.fish3.200, cSG.fish4.200,
  cSG.fish5.200, cSG.fish6.200, cSG.fish7.200,
  cSG.TGstar200, cSG.TGpos200, cSG.TGcortest200
)
comp.SG200 <- compare(
  list.SG200,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.SG200, digits = 3)


#########################################################################################################################
# ---- Top 500 genes ----------------------------------------------------------
# All fits share Y and learnset. `type` and `genesINpaths` are handed on to
# rldaCMA, which is defined outside this file (their exact meaning is not
# visible here).

# rlda on the full data with the generic gene ranking.
cSG.TD500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cSG.TG500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cSG.TF500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)

# dlda reference on the same selected genes.
cdlda500 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 500, classifier = dldaCMA
)

# rlda with type "TargetG" on each of the seven prepared subsets; every subset
# brings its own group list and its own gene ranking.
cSG.fish1.500 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish2.500 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish3.500 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish4.500 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish5.500 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish6.500 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 500, classifier = rldaCMA
)
cSG.fish7.500 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 500, classifier = rldaCMA
)

# Remaining rlda types on the full data.
cSG.TGstar500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cSG.TGpos500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)
cSG.TGcortest500 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 500, classifier = rldaCMA
)

# Collect all fits (plus the nbgene-independent scda and svm) and tabulate the
# performance measures, rounded to three digits.
# NOTE(review): "specifity" is kept exactly as in the original call; confirm it
# is the spelling the installed compare() accepts.
list.SG500 <- list(
  cSG.TD500, cSG.TF500, cdlda500, scda, svm, cSG.TG500,
  cSG.fish1.500, cSG.fish2.500, cSG.fish3.500, cSG.fish4.500,
  cSG.fish5.500, cSG.fish6.500, cSG.fish7.500,
  cSG.TGstar500, cSG.TGpos500, cSG.TGcortest500
)
comp.SG500 <- compare(
  list.SG500,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.SG500, digits = 3)


###########################################################################################################################
# ---- Top 1000 genes ---------------------------------------------------------
# All fits share Y and learnset. `type` and `genesINpaths` are handed on to
# rldaCMA, which is defined outside this file (their exact meaning is not
# visible here).

# rlda on the full data with the generic gene ranking.
cSG.TD1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetD",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cSG.TG1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cSG.TF1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetF",
  genesINpaths = NA, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)

# dlda reference on the same selected genes.
cdlda1000 <- classification(
  X = X, y = Y, learningsets = learnset,
  genesel = geneselect, nbgene = 1000, classifier = dldaCMA
)

# rlda with type "TargetG" on each of the seven prepared subsets; every subset
# brings its own group list and its own gene ranking.
cSG.fish1.1000 <- classification(
  X = Xfish1, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish1, genesel = geneselect.fish1,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish2.1000 <- classification(
  X = Xfish2, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish2, genesel = geneselect.fish2,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish3.1000 <- classification(
  X = Xfish3, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish3, genesel = geneselect.fish3,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish4.1000 <- classification(
  X = Xfish4, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish4, genesel = geneselect.fish4,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish5.1000 <- classification(
  X = Xfish5, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish5, genesel = geneselect.fish5,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish6.1000 <- classification(
  X = Xfish6, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish6, genesel = geneselect.fish6,
  nbgene = 1000, classifier = rldaCMA
)
cSG.fish7.1000 <- classification(
  X = Xfish7, y = Y, learningsets = learnset, type = "TargetG",
  genesINpaths = l2.fish7, genesel = geneselect.fish7,
  nbgene = 1000, classifier = rldaCMA
)

# Remaining rlda types on the full data.
cSG.TGstar1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetG*",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cSG.TGpos1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetGpos",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)
cSG.TGcortest1000 <- classification(
  X = X, y = Y, learningsets = learnset, type = "TargetCor",
  genesINpaths = l2, genesel = geneselect,
  nbgene = 1000, classifier = rldaCMA
)

# Collect all fits (plus the nbgene-independent scda and svm) and tabulate the
# performance measures, rounded to three digits.
# NOTE(review): "specifity" is kept exactly as in the original call; confirm it
# is the spelling the installed compare() accepts.
list.SG1000 <- list(
  cSG.TD1000, cSG.TF1000, cdlda1000, scda, svm, cSG.TG1000,
  cSG.fish1.1000, cSG.fish2.1000, cSG.fish3.1000, cSG.fish4.1000,
  cSG.fish5.1000, cSG.fish6.1000, cSG.fish7.1000,
  cSG.TGstar1000, cSG.TGpos1000, cSG.TGcortest1000
)
comp.SG1000 <- compare(
  list.SG1000,
  measure = c("misclassification", "sensitivity", "specifity"),
  plot = FALSE
)
round(comp.SG1000, digits = 3)
##########################################################################################################################

