########## R-File to reproduce the calculation of the misclassification rates with the package CMA ##########



## 1 ##

## Data import
# Names of all data sets to import; each name <n> refers to the raw file
# "data_txt/dataset_<n>.txt". One line (or group of lines) per tumour family,
# in the same order as before.
datasetnames <- c(
  "adrenal_dahia",
  "bladder_blaveri", "bladder_dyrskjot", "bladder_sanchez-carbayo",
  "breast_desmedt", "breast_farmer", "breast_gruvberger", "breast_kreike",
  "breast_ma_2", "breast_minn", "breast_perou", "breast_sharma",
  "breast_sotiriou", "breast_veer", "breast_wang", "breast_west",
  "cervical_wong",
  "cns_pomeroy_2",
  "colon_alon", "colon_laiho", "colon_lin_1", "colon_watanabe",
  "gastric_hippo",
  "glioma_freije", "glioma_nutt", "glioma_phillips", "glioma_rickman",
  "head_neck_chung", "headneck_pyeon_2",
  "leukemia_armstrong", "leukemia_bullinger_2", "leukemia_golub",
  "leukemia_gutierrez", "leukemia_haslinger", "leukemia_wei", "leukemia_yagi",
  "liver_chen", "liver_iizuka", "liver_ye",
  "lung_barret", "lung_beer", "lung_bhattacharjee_2", "lung_bild",
  "lung_wigle",
  "lymphoma_alizadeh", "lymphoma_booman", "lymphoma_rosenwald",
  "lymphoma_shipp",
  "medulloblastoma_macdonald",
  "melanoma_talantov",
  "mixed_chowdary", "mixed_ramaswamy",
  "myeloma_tian",
  "oral_odonnell",
  "ovarian_gilks", "ovarian_jazaeri_3", "ovarian_li_and_campbell",
  "ovarian_schwartz",
  "pancreas_ishikawa",
  "prostate_singh", "prostate_tomlins", "prostate_true_2",
  "renal_williams",
  "sarcoma_detwiller",
  "srbct_khan"
)

# Convert each raw text file "data_txt/dataset_<name>.txt" into
# "data_R/<name>.RData", which stores a list called `dataset` with the
# components X and Y.
# save() does not create missing directories, so make sure the target exists.
dir.create("data_R", showWarnings = FALSE)
for (i in seq_along(datasetnames)) # seq_along() is safe for an empty vector
{
  print(i) # progress indicator
  datasetname <- datasetnames[i]
  # skip = 1 drops the first line of the file; the remaining lines are read
  # without a header.
  dataset <- read.table(file = paste0("data_txt/dataset_", datasetname, ".txt"),
                        skip = 1, header = FALSE)
  # NOTE(review): t() converts the data frame to a matrix; if any column read
  # above is non-numeric the whole matrix becomes character - confirm against
  # the raw files.
  dataset <- t(dataset)
  # After transposing, column 1 is used as the class label (Y) and all
  # remaining columns as the predictor matrix (X).
  dataset <- list(X = dataset[, -1], Y = as.factor(dataset[, 1]))
  # The variable name `dataset` is the name under which the object is stored
  # in the .RData file, so it must not be renamed.
  save(dataset, file = paste0("data_R/", datasetname, ".RData"))
}


## 2 ##

## Function to calculate the misclassification rates via MCCV for the 4 DLDA classifiers ##

library(CMA)


#' Estimate the error of four DLDA variants by Monte Carlo cross-validation.
#'
#' For every entry of `datasetnames` the file "data_R/<name>.RData" is loaded;
#' it must contain a list called `dataset` with the components `X` and `Y`.
#' Data sets whose `Y` does not have exactly two levels are skipped and keep
#' `NA` in the result. For the remaining data sets, learning sets are drawn
#' with `GenerateLearningsets(method = "MCCV")`, genes are ranked with
#' `GeneSelection(method = "t.test")` on those learning sets, and `dldaCMA` is
#' run with all genes and with the top 500, 20 and 10 genes. The value stored
#' per variant is the mean of the `score` slot returned by `evaluation()`
#' (called with its default settings).
#'
#' Side effects: writes "MCCV_matrix.RData" (object `MCCV`, a
#' data sets x methods matrix) and "MCCV_data.RData" (object `MCCV_data`, the
#' long-format data frame) into the working directory, prints the index of the
#' data set being processed, and resets the RNG seed to 1011 for every
#' two-class data set.
#'
#' @param ratio Fraction of the observations used for training in each split
#'   (`ntrain = round(length(Y) * ratio)`).
#' @param niter Number of MCCV iterations.
#' @param datasetnames Character vector of data set names.
#' @param methodnames Character vector of length 4 labelling the variants in
#'   the order: all genes, 500 genes, 20 genes, 10 genes.
#' @return Invisibly, the long-format data frame with the columns `data`,
#'   `mc` and `algo`.
MCCV<-function(ratio,niter,datasetnames,methodnames)
{
  # Exactly four scores are produced per data set. Check this before the
  # expensive computations: a wrong length would otherwise either fail late or
  # be recycled silently into mislabelled columns.
  if (length(methodnames) != 4L)
  {
    stop("'methodnames' must have length 4 (all genes, 500, 20 and 10 genes)",
         call. = FALSE)
  }

  # The result matrix has to live in a variable called `MCCV`, because save()
  # stores objects under their variable name.
  MCCV <- matrix(NA, length(datasetnames), length(methodnames))
  for (i in seq_along(datasetnames)) # safe for an empty vector
  {
    print(i) # progress indicator
    datasetname <- datasetnames[i]

    # Load into a private environment so that objects stored in the file
    # cannot overwrite local variables, and so that a `dataset` left over from
    # a previous iteration (or from the global environment) is never reused.
    loaded <- new.env()
    load(paste0("data_R/", datasetname, ".RData"), envir = loaded)
    if (!exists("dataset", envir = loaded, inherits = FALSE))
    {
      stop("file 'data_R/", datasetname,
           ".RData' does not contain an object named 'dataset'",
           call. = FALSE)
    }
    X <- loaded$dataset$X
    Y <- loaded$dataset$Y

    if (nlevels(Y) == 2) # we just consider two class problems here
    {
      set.seed(1011) # ! reproducibility
      learn <- GenerateLearningsets(y = Y, method = "MCCV", niter = niter,
                                    ntrain = round(length(Y) * ratio))
      varsel <- GeneSelection(X = X, y = Y, learningsets = learn,
                              method = "t.test")

      # The order of the four calls is kept as-is so that the sequence of
      # computations after set.seed() does not change.
      dlda <- evaluation(classification(X = X, y = Y, learningsets = learn,
                                        classifier = dldaCMA))
      dldanbgene500 <- evaluation(classification(X = X, y = Y,
                                                 learningsets = learn,
                                                 genesel = varsel,
                                                 nbgene = 500,
                                                 classifier = dldaCMA))
      dldanbgene20 <- evaluation(classification(X = X, y = Y,
                                                learningsets = learn,
                                                genesel = varsel,
                                                nbgene = 20,
                                                classifier = dldaCMA))
      dldanbgene10 <- evaluation(classification(X = X, y = Y,
                                                learningsets = learn,
                                                genesel = varsel,
                                                nbgene = 10,
                                                classifier = dldaCMA))

      MCCV[i, ] <- c(mean(dlda@score), mean(dldanbgene500@score),
                     mean(dldanbgene20@score), mean(dldanbgene10@score))
    }
  }

  MCCV.matrix <- data.frame(MCCV)
  row.names(MCCV.matrix) <- datasetnames
  colnames(MCCV.matrix) <- methodnames
  # stack() puts the method columns underneath each other (values + method
  # label); the data set names are repeated once per method to match.
  MCCV_data <- stack(MCCV.matrix)
  MCCV_data <- data.frame(rep(datasetnames, times = length(methodnames)),
                          MCCV_data)
  colnames(MCCV_data) <- c("data", "mc", "algo")
  save(MCCV, file = "MCCV_matrix.RData") # saves the MC-matrix
  save(MCCV_data, file = "MCCV_data.RData") # saves the MC as dataframe
  invisible(MCCV_data)
}



# Data sets to evaluate (same list as used for the data import); each name <n>
# refers to the file "data_R/<n>.RData".
datasetnames <- c(
  "adrenal_dahia", "bladder_blaveri", "bladder_dyrskjot",
  "bladder_sanchez-carbayo", "breast_desmedt", "breast_farmer",
  "breast_gruvberger", "breast_kreike", "breast_ma_2",
  "breast_minn", "breast_perou", "breast_sharma",
  "breast_sotiriou", "breast_veer", "breast_wang",
  "breast_west", "cervical_wong", "cns_pomeroy_2",
  "colon_alon", "colon_laiho", "colon_lin_1",
  "colon_watanabe", "gastric_hippo", "glioma_freije",
  "glioma_nutt", "glioma_phillips", "glioma_rickman",
  "head_neck_chung", "headneck_pyeon_2", "leukemia_armstrong",
  "leukemia_bullinger_2", "leukemia_golub", "leukemia_gutierrez",
  "leukemia_haslinger", "leukemia_wei", "leukemia_yagi",
  "liver_chen", "liver_iizuka", "liver_ye",
  "lung_barret", "lung_beer", "lung_bhattacharjee_2",
  "lung_bild", "lung_wigle", "lymphoma_alizadeh",
  "lymphoma_booman", "lymphoma_rosenwald", "lymphoma_shipp",
  "medulloblastoma_macdonald", "melanoma_talantov", "mixed_chowdary",
  "mixed_ramaswamy", "myeloma_tian", "oral_odonnell",
  "ovarian_gilks", "ovarian_jazaeri_3", "ovarian_li_and_campbell",
  "ovarian_schwartz", "pancreas_ishikawa", "prostate_singh",
  "prostate_tomlins", "prostate_true_2", "renal_williams",
  "sarcoma_detwiller", "srbct_khan"
)

# Column labels for the four DLDA variants, in the order in which MCCV()
# fills them: all genes, top 500, top 20 and top 10 genes.
methodnames <- c(
  "dlda",
  "dldanbgene500",
  "dldanbgene20",
  "dldanbgene10"
)


## 3 ## 

## Computation of the misclassification rates using MCCV with niter = 300 iterations; ratio = 4/5 is the fraction of samples drawn into each training set (the remaining 1/5 form the test set)

# Run the evaluation: 300 MCCV iterations, 4/5 of the samples used for
# training in each split. All arguments are named, in signature order.
MCCV(
  ratio = 4/5,
  niter = 300,
  datasetnames = datasetnames,
  methodnames = methodnames
)







