# Code by Anne-Laure Boulesteix with contributions from Myriam Hatz to reproduce the analyses presented in the paper
# Benchmarking for clustering methods based on real data: a statistical view
# by Anne-Laure Boulesteix and Myriam Hatz
# December 10th 2015

###################
# Import the data #
###################

# The datasets must be saved as R-objects in the directory data_R.
# To download the raw data (.txt files) and obtain the R-objects, please follow the instructions at
# http://www.ibe.med.uni-muenchen.de/organisation/mitarbeiter/020_professuren/boulesteix/compstud2013/


# Names of all candidate datasets. Each name is later turned into the file
# path data_R/<name>.RData. Entries are laid out one name-prefix group at a
# time; the order of the vector is significant and must not be changed.
datasetnames <- c(
  "adrenal_dahia",
  "bladder_blaveri", "bladder_dyrskjot", "bladder_sanchez-carbayo",
  "breast_desmedt", "breast_farmer", "breast_gruvberger", "breast_kreike",
  "breast_ma_2", "breast_minn", "breast_perou", "breast_sharma",
  "breast_sotiriou", "breast_veer", "breast_wang", "breast_west",
  "cervical_wong",
  "cns_pomeroy_2",
  "colon_alon", "colon_laiho", "colon_lin_1", "colon_watanabe",
  "gastric_hippo",
  "glioma_freije", "glioma_nutt", "glioma_phillips", "glioma_rickman",
  "head_neck_chung", "headneck_pyeon_2",
  "leukemia_armstrong", "leukemia_bullinger_2", "leukemia_golub",
  "leukemia_gutierrez", "leukemia_haslinger", "leukemia_wei", "leukemia_yagi",
  "liver_chen", "liver_iizuka", "liver_ye",
  "lung_barret", "lung_beer", "lung_bhattacharjee_2", "lung_bild",
  "lung_wigle",
  "lymphoma_alizadeh", "lymphoma_booman", "lymphoma_rosenwald",
  "lymphoma_shipp",
  "medulloblastoma_macdonald",
  "melanoma_talantov",
  "mixed_chowdary", "mixed_ramaswamy",
  "myeloma_tian",
  "oral_odonnell",
  "ovarian_gilks", "ovarian_jazaeri_3", "ovarian_li_and_campbell",
  "ovarian_schwartz",
  "pancreas_ishikawa",
  "prostate_singh", "prostate_tomlins", "prostate_true_2",
  "renal_williams",
  "sarcoma_detwiller",
  "srbct_khan"
)


# Select the datasets with binary response variable

# Returns TRUE when the object `dataset` stored in data_R/<name>.RData has
# exactly two non-empty classes in its response `dataset$Y`.
has_binary_response <- function(name) {
  # Load into a private environment: nothing is left behind in (or overwritten
  # in) the global workspace, and a file that lacks `dataset` cannot silently
  # reuse the object loaded for the previous name.
  env <- new.env()
  load(paste0("data_R/", name, ".RData"), envir = env)
  stopifnot(exists("dataset", envir = env, inherits = FALSE))
  # table() may report empty factor levels; only count classes that occur.
  sum(table(env$dataset$Y) != 0) == 2
}

# Keep the original order of datasetnames; vapply guarantees a logical mask
# even when datasetnames is empty.
datasetnames2 <- datasetnames[
  vapply(datasetnames, has_binary_response, logical(1), USE.NAMES = FALSE)
]



######################
# Perform clustering #
######################

# Load the packages cluster and flexclust
library(cluster)
library(flexclust)

# Matrix adjrand holds the ARI values: one row per selected (binary-response)
# dataset, column 1 for PAM and column 2 for complete-linkage hierarchical
# clustering.
adjrand <- matrix(NA_real_, length(datasetnames2), 2)
# A matrix needs colnames(); names() would only label individual cells.
colnames(adjrand) <- c("pam", "complete")

# Perform clustering and compute ARI for all selected datasets
for (i in seq_along(datasetnames2)) {
  print(i)
  set.seed(i)
  # Load the i-th *selected* dataset. Indexing the full list `datasetnames`
  # here would cluster datasets that were never selected.
  load(paste0("data_R/", datasetnames2[i], ".RData"))
  # Perform clustering.
  # NOTE(review): PAM is run on dataset$X as stored, whereas the hierarchical
  # clustering is run on the column-standardised data; kept as in the original.
  cluster_pam <- pam(dataset$X, k = 2)$cluster
  cluster_complete <- cutree(
    hclust(
      dist(scale(dataset$X, center = TRUE, scale = TRUE)),
      method = "complete"
    ),
    k = 2
  )
  # Compute ARI between each 2-cluster partition and the true classes
  adjrand[i, 1] <- randIndex(table(cluster_pam, dataset$Y), correct = TRUE)
  adjrand[i, 2] <- randIndex(table(cluster_complete, dataset$Y), correct = TRUE)
}

####################################
# Perform t-test and Wilcoxon test #
####################################

# Both tests are applied to the per-dataset ARI differences: column 1 of
# adjrand (pam) minus column 2 (complete linkage).
# The expressions are passed inline on purpose: t.test() and wilcox.test()
# echo the argument expression in the "data:" line of their printed output.

# One-sample t-test of the differences (default null hypothesis: mean 0).
t.test(adjrand[,1]-adjrand[,2])
# One-sample Wilcoxon signed-rank test of the differences (default null
# hypothesis: location 0).
wilcox.test(adjrand[,1]-adjrand[,2])


#####################
# Generate Figure 1 #
#####################

# Figure 1: boxplots of the ARI of both methods and of their difference,
# with one line per dataset joining its two ARI values and dashed lines
# marking a bootstrap confidence interval for the median difference.
pdf("Figure1.pdf", height = 5, width = 5)
par(mfrow = c(1, 1))
boxplot(
  adjrand[, 1], adjrand[, 2], adjrand[, 1] - adjrand[, 2],
  names = c("pam", "hccomp", "pam-hccomp"),
  ylab = "ARI or ARI difference"
)
# Join the two ARI values of every dataset. The row count comes from adjrand
# itself: a hard-coded 50 fails with fewer rows and drops lines with more.
for (i in seq_len(nrow(adjrand))) {
  lines(x = c(1, 2), y = c(adjrand[i, 1], adjrand[i, 2]))
}
library(boot)
# Statistic for boot() with stype = "i": median of x over the resampled
# indices inds.
medianboot <- function(x, inds) {
  median(x[inds])
}
set.seed(1)
# Compute bootstrap confidence intervals for the median ARI difference
bootdiff <- boot(
  adjrand[, 1] - adjrand[, 2],
  statistic = medianboot, stype = "i", R = 10000
)
bootdiff.ci <- boot.ci(bootdiff, type = "all")
# Elements 4 and 5 of the bca component are the lower and upper limits of
# the BCa interval; draw them as dashed lines across the third box.
lines(y = rep(bootdiff.ci$bca[4], 2), x = c(2.55, 3.45), lty = 2, col = 1)
lines(y = rep(bootdiff.ci$bca[5], 2), x = c(2.55, 3.45), lty = 2, col = 1)
dev.off()


#####################################################################################################
# Compute median ARI difference and perform Wilcoxon tests based on 1000 random subsets of datasets #
#####################################################################################################

niter <- 1000

# Per-dataset ARI difference (pam minus complete linkage).
ari_diff <- adjrand[, 1] - adjrand[, 2]
# Population size for the subsampling comes from the data, not a literal 50.
n_datasets <- length(ari_diff)
# Subset sizes J. The order matters: within one iteration the subsets are
# drawn in this order from a single RNG stream seeded with the iteration
# number.
subset_sizes <- c(3, 5, 10, 25)
stopifnot(n_datasets >= max(subset_sizes))

# One row per iteration, one column per subset size.
subset_medians <- matrix(NA_real_, niter, length(subset_sizes))
subset_pvalues <- matrix(NA_real_, niter, length(subset_sizes))

for (i in seq_len(niter)) {
  set.seed(i)
  for (j in seq_along(subset_sizes)) {
    # Draw J datasets without replacement, then summarise their differences.
    picked <- sample(n_datasets, subset_sizes[j])
    subset_medians[i, j] <- median(ari_diff[picked])
    subset_pvalues[i, j] <- wilcox.test(ari_diff[picked])$p.value
  }
}

# Expose the results under the names used by the rest of the script.
median3 <- subset_medians[, 1]
pvalue3 <- subset_pvalues[, 1]
median5 <- subset_medians[, 2]
pvalue5 <- subset_pvalues[, 2]
median10 <- subset_medians[, 3]
pvalue10 <- subset_pvalues[, 3]
median25 <- subset_medians[, 4]
pvalue25 <- subset_pvalues[, 4]


#####################
# Generate Figure 2 #
#####################

# Figure 2: two side-by-side panels summarising the random-subset results,
# one box per subset size J.
pdf("Figure2.pdf", width = 8, height = 4.5)
par(mfrow = c(1, 2))
# Left panel: distribution of the median ARI difference.
boxplot(
  list(median3, median5, median10, median25),
  names = c("J=3", "J=5", "J=10", "J=25"),
  ylab = "median ARI difference",
  main = "1000 subsets of datasets\n median ARI difference"
)
# Right panel: distribution of the Wilcoxon test p-value.
boxplot(
  list(pvalue3, pvalue5, pvalue10, pvalue25),
  names = c("J=3", "J=5", "J=10", "J=25"),
  ylab = "p-value Wilcoxon test",
  main = "1000 subsets of datasets\n p-value Wilcoxon test"
)
dev.off()

# Standard deviation of the per-dataset ARI difference (column 1 minus
# column 2 of adjrand), taken over all rows of adjrand.
# NOTE(review): presumably the basis for the sigma used in the sample size
# calculation -- confirm.
sd(adjrand[, 1] - adjrand[, 2])
# Perform sample size calculation

# Number of datasets suggested by the normal-approximation formula
#   (qnorm(0.2) + qnorm(0.025))^2 / (delta / sigma)^2
# where delta is the ARI difference to detect and sigma the standard
# deviation of the differences. The two quantiles correspond to 80% power
# and a two-sided 5% significance level.
required_datasets <- function(delta, sigma) {
  z_total <- qnorm(0.2) + qnorm(0.025)
  z_total^2 / (delta / sigma)^2
}

sigma <- 0.18
delta <- 0.1
required_datasets(delta, sigma)
delta <- 0.05
required_datasets(delta, sigma)

