# Load results:
load("./FAbatchPaper/Results/RealdataanalysisResults.Rda")


# Combine results to a data frame:

dataset <- sapply(RealdataanalysisResults, function(x) x$dataset)
batchremmethod <- sapply(RealdataanalysisResults, function(x) x$batchremmethod)
metric <- sapply(RealdataanalysisResults, function(x) x$metric)
metrval <- sapply(RealdataanalysisResults, function(x) x$metrval)

results <- data.frame(dataset=dataset, batchremmethod=batchremmethod, 
  metric=metric, metrval=metrval)

results$batchremmethod <- as.character(results$batchremmethod)
results$batchremmethod[results$batchremmethod=="standardize"] <- "stand"
results$batchremmethod[results$batchremmethod=="meancenter"] <- "meanc"
results$batchremmethod <- factor(results$batchremmethod, levels=c("none", 
  "fabatch", "combat", "sva", "meanc", "stand", "ratiog", "ratioa")) 

results$metric <- as.character(results$metric)
results$metric[results$metric=="sep"] <- "sepscore"
results$metric[results$metric=="kldist"] <- "klmetr"
results$metric[results$metric=="skew"] <- "skewdiv"
results$metric[results$metric=="cor"] <- "corbeaf"
results$metric <- factor(results$metric, levels=c("sepscore", "avedist", 
  "klmetr", "pvca", "diffexpr", "skewdiv", "corbeaf"))





# Boxplots of metric values:
##############################

library("ggplot2")

p1 <- ggplot(data=results, aes(x=batchremmethod, y=metrval)) + 
  geom_boxplot() + facet_wrap(~ metric, nrow=4, ncol=2, scales="free_y") + 
  geom_line(aes(x=as.numeric(batchremmethod), y=metrval, group=dataset), 
   color="grey") + geom_boxplot(fill="transparent") + scale_x_discrete(labels=gsub("Standardize", "Stand.", 
   gsub("Meancenter", "Meanc.", levels(results$batchremmethod)))) +
  theme(axis.title=element_blank(), axis.text=element_text(size=13), 
    axis.text.x=element_text(angle=45, vjust=1, hjust=1), strip.text.x = element_text(size = 14))
 
p1

ggsave(file="./FAbatchPaper/Results/Figure2.pdf", width=10, height=11.5)








# Calculate mean performances of the methods for the individual metrics,
# with respect to the original metric values and also with respect to
# the ranks
###########################################################################


# Take means over datasets: 
  
library("plyr")
resultsum <- ddply(results, .variables=c("metric", "batchremmethod"), .fun=summarise, mean.metrval=mean(metrval, na.rm=TRUE))


# Calculate ranks of the methods with respect to the individual metrics:

resultsranks <- results
isdecreasing <- c(FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE)

for(i in 1:length(levels(results$metric))) {
  for(j in 1:length(levels(results$dataset))) {
    if(isdecreasing[i])
      ranktemp <- rank(-results[(results$metric == levels(results$metric)[i]) & (results$dataset == levels(results$dataset)[j]),]$metrval)
    else
      ranktemp <- rank(results[(results$metric == levels(results$metric)[i]) & (results$dataset == levels(results$dataset)[j]),]$metrval)
    resultsranks[(results$metric == levels(results$metric)[i]) & (results$dataset == levels(results$dataset)[j]),]$metrval <- ranktemp
  }
}


# Take means of the ranks over datasets: 

library("plyr")
resultsumranks <- ddply(resultsranks, .variables=c("metric", "batchremmethod"), .fun=summarise, mean.metrval=mean(metrval))
head(resultsumranks)




# Generate list containing the means of the metrics for the individual methods
# sorted by mean performance:

isdecreasing <- c(FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE)
ranklist <- list()

for(metind in 1:length(levels(resultsum$metric))) {

subsetbool <- (resultsum$metric==levels(resultsum$metric)[metind])
ordering <- order(resultsum$mean.metrval[subsetbool], decreasing=isdecreasing[metind])
namesmet <- as.character(resultsum$batchremmethod[subsetbool][ordering])
valuesmet <- resultsum$mean.metrval[subsetbool][ordering]
names(valuesmet) <- namesmet
ranklist[[metind]] <- valuesmet

}

names(ranklist) <- levels(resultsum$metric)




# Generate list containing the means of the ranks of the metrics for the individual methods
# sorted by mean performance according to the ranks:

rankranklist <- list()

for(metind in 1:length(levels(resultsumranks$metric))) {

subsetbool <- (resultsumranks$metric==levels(resultsumranks$metric)[metind])
ordering <- order(resultsumranks$mean.metrval[subsetbool])
namesmet <- as.character(resultsumranks$batchremmethod[subsetbool][ordering])
valuesmet <- resultsumranks$mean.metrval[subsetbool][ordering]
names(valuesmet) <- namesmet
rankranklist[[metind]] <- valuesmet

}

names(rankranklist) <- levels(resultsumranks$metric)




# Generate LaTeX-Code for Tables 2 and 3 of the main paper:

contents <- unlist(lapply(1:4, function(y) {
    content <- round(ranklist[[y]], 5); 
    namescontent <- names(ranklist[[y]]);
    whichfa <- which(names(ranklist[[y]])=="fabatch");
    content[whichfa] <- paste("\\textbf{", content[whichfa], "}", sep="");
    namescontent[whichfa] <- paste("\\textbf{", namescontent[whichfa], "}", sep="");
	contentranks <- round(rankranklist[[y]], 5); 
    namescontentranks <- names(rankranklist[[y]]);
    whichfaranks <- which(names(rankranklist[[y]])=="fabatch");
    contentranks[whichfaranks] <- paste("\\textbf{", contentranks[whichfaranks], "}", sep="");
    namescontentranks[whichfaranks] <- paste("\\textbf{", namescontentranks[whichfaranks], "}", sep="");
c(paste("\\multicolumn{9}{c}{", levels(resultsumranks$metric)[y], "}\\\\[-1pt]", sep=""),
"\\hline",
paste("\\multirow{2}{*}{mean values} & ", paste(namescontent, collapse=" & "), 
"\\\\", sep=""),
paste(" & ", paste(content, collapse=" & "), "\\\\", sep=""),
"\\hline\\hline",
paste("\\multirow{2}{*}{mean ranks} & ", paste(namescontentranks, collapse=" & "), 
"\\\\", sep=""),
paste(" & ", paste(contentranks, collapse=" & "), "\\\\", sep=""),
"\\hline",
"\\multicolumn{9}{c}{}\\\\[-10pt]")}))

tableranks <- c("\\begin{sidewaystable}",
"\\hspace*{}",
"\\vspace*{}",
"\\captionbox{mycaption}{\\begin{tabular}{| l | c | c | c | c | c | c | c | c |}", 
contents, 
"\\end{tabular}}",
"\\end{sidewaystable}")


for(i in seq(along=tableranks))
  cat(tableranks[i], "\n")



contents <- unlist(lapply(5:7, function(y) {
    content <- round(ranklist[[y]], 5); 
    namescontent <- names(ranklist[[y]]);
    whichfa <- which(names(ranklist[[y]])=="fabatch");
    content[whichfa] <- paste("\\textbf{", content[whichfa], "}", sep="");
    namescontent[whichfa] <- paste("\\textbf{", namescontent[whichfa], "}", sep="");
	contentranks <- round(rankranklist[[y]], 5); 
    namescontentranks <- names(rankranklist[[y]]);
    whichfaranks <- which(names(rankranklist[[y]])=="fabatch");
    contentranks[whichfaranks] <- paste("\\textbf{", contentranks[whichfaranks], "}", sep="");
    namescontentranks[whichfaranks] <- paste("\\textbf{", namescontentranks[whichfaranks], "}", sep="");
c(paste("\\multicolumn{9}{c}{", levels(resultsumranks$metric)[y], "}\\\\[-1pt]", sep=""),
"\\hline",
paste("\\multirow{2}{*}{mean values} & ", paste(namescontent, collapse=" & "), 
"\\\\", sep=""),
paste(" & ", paste(content, collapse=" & "), "\\\\", sep=""),
"\\hline\\hline",
paste("\\multirow{2}{*}{mean ranks} & ", paste(namescontentranks, collapse=" & "), 
"\\\\", sep=""),
paste(" & ", paste(contentranks, collapse=" & "), "\\\\", sep=""),
"\\hline",
"\\multicolumn{9}{c}{}\\\\[-10pt]")}))



tableranks <- c("\\begin{sidewaystable}",
"\\hspace*{}",
"\\vspace*{}",
"\\captionbox{mycaption}{\\begin{tabular}{| l | c | c | c | c | c | c | c | c |}", 
contents, 
"\\end{tabular}}",
"\\end{sidewaystable}")


for(i in seq(along=tableranks))
  cat(tableranks[i], "\n")



  
  
  

  
# Visualizations of the batch effects in the used datasets:
# plots of the first two principal components out of Principal Component Analysis.
###################################################################################

# First calculate principal components for all datasets:

datasets <- levels(results$dataset)

datasetpcs <- c()
pcs1 <- c()
pcs2 <- c()
batchpcs <- c()
ypcs <- c()

for(i in seq(along=datasets)) {

  load(paste("./FAbatchPaper/Datasets/ProcessedData/", datasets[i], ".Rda", sep=""))

  xpr <- prcomp(X, scale. = FALSE)
  PCs <- predict(xpr)[,1:2]

  datasetpcs <- c(datasetpcs, rep(datasets[i], nrow(PCs)))
  pcs1 <- c(pcs1, PCs[,1])
  pcs2 <- c(pcs2, PCs[,2])
  batchpcs <- c(batchpcs, as.numeric(batch))
  ypcs <- c(ypcs, as.numeric(y))

}

resultsPC <- data.frame(dataset=datasetpcs, batch=batchpcs, 
  y=ypcs, pc1=pcs1, pc2=pcs2)

datasets <- levels(resultsPC$dataset)
batcheffstrength <- 0

for(i in seq(along=datasets)) {
  PCtemp <- cbind(resultsPC$pc1, resultsPC$pc2)[resultsPC$dataset==datasets[i],]
  batchtemp <- resultsPC$batch[resultsPC$dataset==datasets[i]]
  batchdistmat <- dist(cbind(batchtemp))
  distmat <- dist(PCtemp)
  batcheffstrength[i] <- mean(distmat[batchdistmat!=0])/mean(distmat[batchdistmat==0])
}

datasets[order(batcheffstrength)]

sort(batcheffstrength)


resultsPC$dataset <- factor(resultsPC$dataset, levels=datasets[order(batcheffstrength)])


# Now generate a ggplot for each dataset and store them in a list:

library("MASS")

plist <- list()

for(i in seq(along=levels(resultsPC$dataset))) {

resultsPCtemp <- resultsPC[resultsPC$dataset==levels(resultsPC$dataset)[i],]
  
batches <- unique(resultsPCtemp$batch)

xall <- yall <- zall <- batchall <- c()

for(j in seq(along=batches)) {

pc1temp <- resultsPCtemp$pc1[resultsPCtemp$batch==batches[j]]
pc2temp <- resultsPCtemp$pc2[resultsPCtemp$batch==batches[j]]
dens2 <- kde2d(pc1temp, pc2temp, lims=c(min(pc1temp)-sd(pc1temp), max(pc1temp)+sd(pc1temp), min(pc2temp)-
  sd(pc2temp), max(pc2temp)+sd(pc2temp)))

densdf <- expand.grid(dens2$x, dens2$y)[,2:1]
densdf$z <- as.vector(dens2$z)

names(densdf) <- c("y", "x", "z")

xall <- c(xall, densdf$x)
yall <- c(yall, densdf$y)
zall <- c(zall, densdf$z)
batchall <- c(batchall, rep(batches[j], nrow(densdf)))

}

###resultsPCtemp$dataset <- factor(letters[1:length(levels(resultsPCtemp$dataset))][as.numeric(resultsPCtemp$dataset)],
###  levels=letters[1:length(levels(resultsPCtemp$dataset))])

densdfall <- data.frame(x=xall, y=yall, z=zall, batch=batchall)

plist[[i]] <- ggplot(data=resultsPCtemp, aes(x=pc1, y=pc2, colour=factor(batch))) + 
  stat_contour(data=densdfall, aes(x=x, y=y, z=z, linetype=factor(batch)), bins=5) +
  geom_text(label=as.character(as.numeric(resultsPCtemp$y)), aes(colour=factor(batch)), size=3, fontface="bold") +
  labs(x="PC 1", y="PC 2") + facet_wrap(~ dataset, nrow=1, ncol=1) +
  geom_point(data=data.frame(x=sapply(batches, function(x) mean(resultsPCtemp$pc1[resultsPCtemp$batch==x])), 
    y=sapply(batches, function(x) mean(resultsPCtemp$pc2[resultsPCtemp$batch==x])), batch=batches), 
    aes(x=x, y=y), shape=18, size=5, colour="black") + 
  geom_point(data=data.frame(x=sapply(batches, function(x) mean(resultsPCtemp$pc1[resultsPCtemp$batch==x])), 
    y=sapply(batches, function(x) mean(resultsPCtemp$pc2[resultsPCtemp$batch==x])), batch=batches), 
    aes(x=x, y=y), shape=18, size=3.5) + 
  theme(legend.position="none", axis.title=element_text(size=10), axis.text=element_blank(),
    strip.text.x = element_text(size = 10), axis.ticks=element_blank())

}

# Combine plots:

library("gridExtra")
grobframe <- marrangeGrob(plist, ncol=3, nrow=5, top=NULL)

grobframe

ggsave <- ggplot2::ggsave; body(ggsave) <- body(ggplot2::ggsave)[-2]

ggsave(file="./FAbatchPaper/Results/SupplementaryFigure7.pdf", plot=grobframe, width=8.4, height=10)
