# Function to simulate a dataset:

# Input parameters:

# batchsize - vector of length four. The j-th entry corresponds to the sample size of the j-th batch.
#
# indscen - character. simulation scenario. has to be one of the following:
#  "b0scen": common correlation structure in all batches - Design A
#  "bjscen": batch-specific correlation structures - Design A
#  "btildescen": batch- and class-specific correlation structures - Design A
#  "CommonCor": common correlation structure in all batches - Design B
#  "BatchspecificCor": batch-specific correlation structures - Design B
#  "BatchClassspecificCor": batch- and class-specific correlation structures - Design B

simuldata <- function(batchsize = c(25, 25, 25, 25), indscen) {

  # Load simulation parameters:
  objectsnow <- ls()
  eval(parse(text=paste("load(\"./FAbatchPaper/InterimResults/SimulationParameters", indscen, ".Rda\")", sep="")))
  
  if(!exists("scendesign")) {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))
  eps01 <- matrix(nrow=n, ncol=p, data=rnorm(n*p))
  Z <- matrix(nrow=n, ncol=5, data=rnorm(n*5))
  Zbatch <- matrix(nrow=n, ncol=5, data=rnorm(n*5))

  # Put the parameters into matrices:
  alphagmat <- matrix(nrow=n, ncol=p, data=rep(alphag, each=n))
  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  sigmag2mat <- matrix(nrow=n, ncol=p, data=rep(sigmag2, each=n))

  gammajgmat <- deltajg2mat <- matrix(nrow=n, ncol=p)

  for(i in 1:numbatch) {
    gammajgmat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(gammajg[i,], each=sum(batch==i)))
    deltajg2mat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(deltajg2[i,], each=sum(batch==i)))
  }

  # Generate the simulated data matrix without batch- and class-specific correlations:
  X <- alphagmat + apply(t(betagmat), 1, function(x) x*(y-1)) + gammajgmat +
    Z%*%B0mat + sqrt(deltajg2mat)*sqrt(sigmag2mat)*eps01

  # Add class-specific correlations (Btildemat = 0 in case of no class-specific correlation):
  X[y==2,] <- X[y==2,] + Z[y==2,]%*%Btildemat

  # Add batch-specific correlations (loadings 0 in case of no class-specific correlation):
  for(i in 1:numbatch) {
    X[batch==i,] <- X[batch==i,] + Zbatch[batch==i,]%*%Bjmat[[i]] 
  }

  }
  else {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))

  if(scendesign %in% c("CommonCor", "BatchCor")) {
    require("mvtnorm")
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize))
      X[batch==j,] <- rmvnorm(n = batchsize[j], mean = alphag + gammajg[j,], sigma = covbatches[[j]])
  }

  if(scendesign=="BatchClassCor") {
    require("mvtnorm")
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize)) {
      for(k in 1:2)
        X[batch==j,][y[batch==j]==k,] <- rmvnorm(n = sum((batch==j) & (y==k)), mean = alphag + gammajg[j,], sigma = covbatches[[j]][[k]])
    }
  }

  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  X <- X + apply(t(betagmat), 1, function(x) x*(y-1))

  }

  rm(list=setdiff(ls(), c(objectsnow, "X", "y", "batch"))); gc()

  # Return the data:
  return(list(X=X, y=factor(y), batch=factor(batch)))

}




# This function is a wrapper function for 'simuldata' from above.
# It takes the whole number 'z', which corresponds to the z-th line of 'simulgrid',
# which contains the informations on the datasets to simulate.

simulateData <- function(z) {

load("./FAbatchPaper/InterimResults/simulgrid.Rda")

seed <- simulgrid$seed[z]
set.seed(seed)
simdata <- simuldata(batchsize = c(25, 25, 25, 25), simulgrid$indscen[z])

save(simdata, seed, file=paste("./FAbatchPaper/InterimResults/dataset", z, "sim.Rda", sep=""))

}




# This function performs the batch effect removal on the simulated datasets.
# It takes the whole number 'z', which corresponds to the z-th line of 'batchremovalgrid',
# which contains informations stating which batch effect removal method to apply
# to which simulated dataset.

performBatchremoval <- function(z) {

  require("bapred")

  load("./FAbatchPaper/InterimResults/batchremovalgrid.Rda")

  load(paste("./FAbatchPaper/InterimResults/dataset", batchremovalgrid$datasetind[z], "sim.Rda", sep=""))

  set.seed(seed)

  Xbr <- ba(x=simdata$X, y=simdata$y, batch=simdata$batch, method = batchremovalgrid$batchremmethod[z])$xadj
    
   # Leave out variables with constant values within batches:
	
	sdb = as.list(rep(0,length(levels(simdata$batch))))
   for (i in 1:length(levels(simdata$batch))) {
      sdb[[i]] = apply(simdata$X[simdata$batch==levels(simdata$batch)[i],],2,sd)
   }

   badvariables <- sort(unique(unlist(lapply(sdb, function(x) which(x==0)))))
   goodvariables <- setdiff(1:ncol(simdata$X), badvariables)
  
  save(Xbr, goodvariables, seed, file=paste("./FAbatchPaper/InterimResults/datasetbr", z, "sim.Rda", sep=""))

}





# This function applies the different metrics to the datasets (after batch effect removal).
# It takes the whole number 'z', which corresponds to the z-th line of 'metricgrid',
# which contains informations stating which metric to apply to which simulated dataset 
# (after batch effect removal).

evaluateMetric <- function(z) {

  require("bapred")

  load("./FAbatchPaper/InterimResults/batchremovalgrid.Rda")
  load("./FAbatchPaper/InterimResults/metricgrid.Rda")

  load(paste("./FAbatchPaper/InterimResults/dataset", batchremovalgrid$datasetind[batchremovalgrid$datasetbrind==metricgrid$datasetbrind[z]], "sim.Rda", sep=""))
  load(paste("./FAbatchPaper/InterimResults/datasetbr", metricgrid$datasetbrind[z], "sim.Rda", sep=""))

  set.seed(seed)
  
  Xgoodvariables <- simdata$X[,goodvariables]
  
  if(metricgrid$metric[z]!="diffexpr")
    metrval <- bametric(xba=Xbr, batch=simdata$batch, y=simdata$y, x=simdata$X, metric = metricgrid$metric[z])
  else
    metrval <- bametric(x=Xgoodvariables, batch=simdata$batch, y=simdata$y, metric = "diffexpr", method = batchremovalgrid$batchremmethod[batchremovalgrid$datasetbrind==metricgrid$datasetbrind[z]])

  return(list(indscen=batchremovalgrid$indscen[batchremovalgrid$datasetbrind==metricgrid$datasetbrind[z]],
    batchremmethod=batchremovalgrid$batchremmethod[batchremovalgrid$datasetbrind==metricgrid$datasetbrind[z]],
    metric=metricgrid$metric[z],
    iteration=batchremovalgrid$iteration[batchremovalgrid$datasetbrind==metricgrid$datasetbrind[z]],
    metrval=metrval))
	  
}








# This function extends the above function "simuldata()" by the additional option "indscen = 'NoCor'"
# corresponding to the setting with uncorrelated predictors, which is used (among others) in the 
# simulation for cross-batch prediction.

simuldatacrossbatch <- function(batchsize = c(50, 50, 50, 50), indscen) {

  # Load simulation parameters:

  objectsnow <- ls()
  eval(parse(text=paste("load(\"./FAbatchPaper/InterimResults/SimulationParameters", indscen, ".Rda\")", sep="")))

  if(!exists("scendesign")) {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))
  eps01 <- matrix(nrow=n, ncol=p, data=rnorm(n*p))
  Z <- matrix(nrow=n, ncol=5, data=rnorm(n*5))
  Zbatch <- matrix(nrow=n, ncol=5, data=rnorm(n*5))

  # Put the parameters into matrices:
  alphagmat <- matrix(nrow=n, ncol=p, data=rep(alphag, each=n))
  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  sigmag2mat <- matrix(nrow=n, ncol=p, data=rep(sigmag2, each=n))

  gammajgmat <- deltajg2mat <- matrix(nrow=n, ncol=p)

  for(i in 1:numbatch) {
    gammajgmat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(gammajg[i,], each=sum(batch==i)))
    deltajg2mat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(deltajg2[i,], each=sum(batch==i)))
  }

  # Generate the simulated data matrix without batch- and class-specific correlations:
  X <- alphagmat + apply(t(betagmat), 1, function(x) x*(y-1)) + gammajgmat +
    Z%*%B0mat + sqrt(deltajg2mat)*sqrt(sigmag2mat)*eps01

  # Add class-specific correlations (Btildemat = 0 in case of no class-specific correlation):
  X[y==2,] <- X[y==2,] + Z[y==2,]%*%Btildemat

  # Add batch-specific correlations (loadings 0 in case of no class-specific correlation):
  for(i in 1:numbatch) {
    X[batch==i,] <- X[batch==i,] + Zbatch[batch==i,]%*%Bjmat[[i]] 
  }

  }
  else {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))

  if(scendesign %in% c("CommonCor", "BatchCor", "NoCor")) {
    require("mvtnorm")
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize))
      X[batch==j,] <- rmvnorm(n = batchsize[j], mean = alphag + gammajg[j,], sigma = covbatches[[j]])
  }

  if(scendesign=="BatchClassCor") {
    require("mvtnorm")
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize)) {
      for(k in 1:2)
        X[batch==j,][y[batch==j]==k,] <- rmvnorm(n = sum((batch==j) & (y==k)), mean = alphag + gammajg[j,], sigma = covbatches[[j]][[k]])
    }
  }

  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag / 1, each=n))
  X <- X + apply(t(betagmat), 1, function(x) x*(y-1))

  }

  rm(list=setdiff(ls(), c(objectsnow, "X", "y", "batch")))

  # Return the data:
  return(list(X=X, y=factor(y), batch=factor(batch)))

}



# This function is a wrapper function for 'simuldatacrossbatch' from above.
# It takes the whole number 'z', which corresponds to the z-th line of 'simulgrid',
# which contains the informations on the datasets to simulate.

simulateDatacrossbatch <- function(z) {

seed <- simulgrid$seed[z]
set.seed(seed)
simdata <- simuldatacrossbatch(batchsize = c(25, 25, 25, 25), simulgrid$indscen[z])

save(simdata, seed, file=paste("./FAbatchPaper/InterimResults/dataset", simulgrid$datasetind[z], "simcrossbatch.Rda", sep=""))

}



# This function performs the cross-batch prediction analysis on the simulated datasets.
# It takes the whole number 'z', which corresponds to the z-th line of 'scenariogrid',
# which contains informations stating which batch effect removal method to apply
# to which simulated dataset.
# For each simulated dataset each possible pair of training and validation batch
# is considered.

performCrossBatchPredictionSimulation <- function(z) {

require("bapred")

load(paste("./FAbatchPaper/InterimResults/dataset", scenariogrid$datasetind[z], "simcrossbatch.Rda", sep=""))
set.seed(seed)

X <- simdata$X
y <- simdata$y
batch <- simdata$batch
batchun <- unique(batch)

combns <- cbind(combn(length(batchun), 2), combn(length(batchun), 2)[2:1,])
true1pred1 <- true1pred2 <- true2pred1 <- true2pred2 <- 0

for(i in 1:ncol(combns)) {

  Xtrain <- X[batch==combns[1,i],]
  ytrain <- y[batch==combns[1,i]]
  batchtrain <- factor(rep(1, sum(batch==combns[1,i])))

  Xtest <- X[batch==combns[2,i],]
  ytest <- y[batch==combns[2,i]]
  batchtest <- factor(rep(1, sum(batch==combns[2,i])))

  if(!(scenariogrid$batchremmethod[z] %in% c("fsvafast", "fsvaexact"))) {
    params <- ba(x=Xtrain, y=ytrain, batch=batchtrain, method = scenariogrid$batchremmethod[z])
  }
  else {
    if(scenariogrid$batchremmethod[z]=="fsvafast")
      params <- ba(x=Xtrain, y=ytrain, batch=batchtrain, method = "sva", algorithm = "fast")
    if(scenariogrid$batchremmethod[z]=="fsvaexact")
      params <- ba(x=Xtrain, y=ytrain, batch=batchtrain, method = "sva", algorithm = "exact")
  }

  Xtrain <- params$xadj

  Xtest <- baaddon(params, x=Xtest, batch=batchtest)

  traintestval <- performValidation(Xtrain, ytrain, Xtest, ytest)
  
  confmatrix <- traintestval$confmatrix
  true1pred1[i] <- confmatrix[1,1]
  true1pred2[i] <- confmatrix[2,1]
  true2pred1[i] <- confmatrix[1,2]
  true2pred2[i] <- confmatrix[2,2]  
  
}
  
return(list(indscen=simulgrid$indscen[simulgrid$datasetind==scenariogrid$datasetind[z]],
  iteration=simulgrid$iteration[simulgrid$datasetind==scenariogrid$datasetind[z]],
  method=scenariogrid$batchremmethod[z],
  true1pred1=true1pred1, true1pred2=true1pred2,
  true2pred1=true2pred1, true2pred2=true2pred2))
  
}
  
 
  

# This function is used by 'performCrossBatchPredictionSimulation', see above.
#
# It does the following for a pair of training and validation batch after 
# batch effect adjustment and addon batch effect adjustment, respectively:
#
# 1) On the training batch fit LDA using PLS components 
# as covariates, tuning the number of components by 3-fold CV;
# 2) On the test batch estimate the MCC-value of the prediction rule
# fitted in 1).
#
# Input parameters:
# Xtrain - covariate matrix in the training batch (after batch effect removal)
# ytrain - target variable in the training batch
# Xtest - covariate matrix in the test batch after addon-batch effect removal
# ytest - target variable in the test batch

performValidation <- function(Xtrain, ytrain, Xtest, ytest) {

  require("CMA")
  comp.grid <- 1:10

  compbestvec <- 0
  for(i in 1:10) {
    tunecv <- tune(Xtrain, ytrain, classifier = pls_ldaCMA, grids = list(comp = comp.grid))
    compbestvec[i] <- comp.grid[which.min(tunecv@tuneres[[1]])]
  }
  compbest <- floor(median(compbestvec))

  plsldaobject <- pls_ldaCMA(rbind(Xtrain, Xtest), c(ytrain, ytest), 
    learnind=1:length(ytrain), comp = compbest)

  ytesthat <- plsldaobject@yhat + 1
  ytesthat <- factor(ytesthat, levels=c(1,2))
	
  confmatrix <- table(ytesthat, ytest)

  return(list(confmatrix=confmatrix, ytest=ytest, ytesthat=ytesthat))
  
}
