##################################################################################
#                                                                                # 
#  First part: Set up those simulation scenarios, which involve latent factors   #
#  (Design A).                                                                   #
#                                                                                # 
##################################################################################


# Load dataset 'ColoncbTranscr', which was used for orientation when generating the
# simulation parameters.
# NOTE: The Rda-file loaded here is NOT included in the electronic appendix. To obtain it,
# one has to run the R-script 'ColoncbTranscr_preparationinfos.R' found in the folder
# 'FAbatchPaper'.

load("./FAbatchPaper/InterimResults/ColoncbTranscr.Rda")


# Estimate the differences of the class-specific means:
betag2 <- colMeans(X[(batch==1) & (y==1),]) - colMeans(X[(batch==1) & (y==2),])


# Perform a t-test between the classes for each variable and record the p-value:
pvaluesttest <- apply(X[batch==1,], 2, function(x, z) t.test(x ~ z)$p.value, z=y[batch==1])


# Fit gamma distributions to those negative and positive class-specific means, 
# which correspond to variables with p-values smaller than 0.05.
library("MASS")
paramsgamma1 <- fitdistr(-betag2[pvaluesttest<0.05][betag2[pvaluesttest<0.05]<0], dgamma, list(shape=1, rate=1))
paramsgamma2 <- fitdistr(betag2[pvaluesttest<0.05][betag2[pvaluesttest<0.05]>0], dgamma, list(shape=1, rate=1))


# Function to simulate class differences:
simulatebetag <- function(p) {
  nzero <- p - 2*floor(0.15*p)
  nplus <- nminus <- floor(0.15*p)
  diffplus <- rgamma(nplus, shape=paramsgamma2$estimate[1], rate=paramsgamma2$estimate[2])
  diffminus <- -rgamma(nminus, shape=paramsgamma1$estimate[1], rate=paramsgamma1$estimate[2])
  diffzero <- rep(0, nzero)
  simdiffs <- sample(c(diffplus, diffminus, diffzero))
  simdiffsresc <- simdiffs*0.17
  return(simdiffsresc)
}




# The simulation parameters were determined by different means:
#
# First, we estimated the corresponding parameters on the 'ColoncTranscr'-dataset.
# While we did not always fix the simulation parameters to ranges
# similar to that of the estimates (see below), we did specify the
# parameters of a type such that they exhibit roughly similar relations across
# batch or in the cases of the latent factor parts across factors as the corresponding 
# relations of the parameters estimated on real data.
#
# The ranges of the parameters were determined in a kind of sensitivity
# analysis, where we varied the parameter values to ensure behaviours 
# comparable to those, which we had commonly observed in real data analyses. Here we used
# graphical means to study the behaviour of the simulated data: boxplots 
# marking the batch-specific means of the variables, plots of the first two
# principal components highlighting the different batches, boxplots of the
# batch-specific correlations, boxplots of the differences of the batch-
# specific correlations and boxplots of the class-specific correlations.
#
# Moreover the strength of the signal was guided by the cross-batch prediction
# errors of the Lasso model estimated using the simulated datasets.
#
# For the sake of completeness we declare that the specification of the 
# simulation design was not in any way guided by the results of the simulation.




# Additive batch effects:

# finally chosen parameters:
gammajgmeans <- c(0.15, 0, 0.1, -0.25)
gammajgsds <- c(0.12, 0.07, 0.1, 0.15)

gammajg <- list(gammajgmeans,
  gammajgsds)  

  
library("Hmisc")

par(mfrow=c(1,2))

errbar(x=1:4, y=gammajgmeans, yminus=gammajgmeans-gammajgsds*qnorm(0.975), yplus=gammajgmeans+gammajgsds*qnorm(0.975))
title("simulation")

gammajgreal <- sapply(1:2, function(x) colMeans(X[batch==x,]) - colMeans(X))
gammajgrealmeans <- apply(gammajgreal, 2, mean)
gammajgrealsds <- apply(gammajgreal, 2, sd)

gammajgrealmeans
gammajgrealsds

errbar(x=1:2, y=gammajgrealmeans, yminus=gammajgrealmeans-gammajgrealsds*qnorm(0.975), yplus=gammajgrealmeans+gammajgrealsds*qnorm(0.975))
title("real data")

par(mfrow=c(1,1))




# Scale batch effects:

# finally chosen parameters:
deltajg2alphas <- c(11, 7, 7.2, 4.8)
deltajg2betas <- c(12, 6, 4.8, 5.2)
  
deltajg2 <- list(deltajg2alphas,
  deltajg2betas)

library("pscl")
qigamma(c(0.05,0.95), alpha=deltajg2alphas[1], beta=deltajg2betas[1])

par(mfrow=c(1,2))

errbar(x=1:4, y=sapply(1:4, function(x) qigamma(0.5, alpha=deltajg2alphas[x], beta=deltajg2betas[x])),
  yminus=sapply(1:4, function(x) qigamma(0.05, alpha=deltajg2alphas[x], beta=deltajg2betas[x])), 
  yplus=sapply(1:4, function(x) qigamma(0.95, alpha=deltajg2alphas[x], beta=deltajg2betas[x])), ylab="")
title("simulation")
  
deltajg2real <- sapply(1:2, function(x) apply(X[batch==x,], 2, var)/apply(X, 2, var))  

f <- function(x, alpha, beta)
  (beta^alpha)/gamma(alpha) * (x^(-alpha-1)) * exp(-beta/x)
 
deltagj2realpar <- sapply(1:2, function(x) {
  ests <- fitdistr(deltajg2real[,x], f, list(alpha=10, beta=10))
  return(list(alpha=ests$estimate[1], beta=ests$estimate[2]))
})

deltajg2realalphas <- unlist(deltagj2realpar[1,])
names(deltajg2realalphas) <- NULL
deltajg2realbetas <- unlist(deltagj2realpar[2,])
names(deltajg2realbetas) <- NULL

deltajg2realalphas
deltajg2realbetas

errbar(x=1:2, y=sapply(1:2, function(x) qigamma(0.5, alpha=deltajg2realalphas[x], beta=deltajg2realbetas[x])),
  yminus=sapply(1:2, function(x) qigamma(0.05, alpha=deltajg2realalphas[x], beta=deltajg2realbetas[x])), 
  yplus=sapply(1:2, function(x) qigamma(0.95, alpha=deltajg2realalphas[x], beta=deltajg2realbetas[x])), ylab="")
title("real data")
  
par(mfrow=c(1,1))





# Factor loadings:


# Real data:

library("bapred")

Bjmatreal <- lapply(1:2, function(x) emfahighdim(eps=scale(X[batch==x,], scale=FALSE), nbf=5)$B)
bjrealmeans <- lapply(Bjmatreal, colMeans)
bjrealsds <- lapply(Bjmatreal, function(x) apply(x, 2, sd))

bjrealmeans
bjrealsds

par(mfrow=c(1,2))
for(j in 1:2) {
errbar(x=1:5, y=bjrealmeans[[j]], yminus=bjrealmeans[[j]]-bjrealsds[[j]]*qnorm(0.975), yplus=bjrealmeans[[j]]+bjrealsds[[j]]*qnorm(0.975))# , ylim=c(-1,1))
title(paste("batch", j))
}
par(mfrow=c(1,1))



# Common factor loadings:

# finally chosen parameters:
b0means <- c(-0.02, 0, -0.02, 0, 0)
b0sds <- c(0.1, 0.07, 0.1, 0.07, 0.05)

b0 <- list(b0means, b0sds)

errbar(x=1:5, y=b0means, yminus=b0means-b0sds*qnorm(0.975), yplus=b0means+b0sds*qnorm(0.975))
title("simulation")

# Batch-specific factor loadings:

# finally chosen parameters:
bjmeans <- list(c(0.02, -0.02, 0.00, 0.00, 0.00), c(-0.07, 0.07, 0.00, 0.00, 0.00),
  c(0.00, 0.00, -0.03, 0.00, 0.00), c(0.00, 0.00, 0.03, 0.00, 0.00))
bjsds <- list(c(0.06, 0.06, 0.03, 0.03, 0.01), c(0.06, 0.06, 0.03, 0.01, 0.01),
  c(0.06, 0.01, 0.03, 0.03, 0.011), c(0.15, 0.15, 0.08, 0.01, 0.04))

bj <- list(bjmeans, bjsds)

par(mfrow=c(2,2))
for(j in 1:4) {
errbar(x=1:5, y=bjmeans[[j]], yminus=bjmeans[[j]]-bjsds[[j]]*qnorm(0.975), yplus=bjmeans[[j]]+bjsds[[j]]*qnorm(0.975))# , ylim=c(-1,1))
title(paste("batch", j))
}
par(mfrow=c(1,1))



# Class-difference in factor loadings for common factor loadings:  

# finally chosen parameters:
btildemeans <- c(-0.005, 0, +0.005, 0, 0)
btildesds <- c(0.03, 0.02, 0.03, 0.02, 0.01)  

btilde <- list(btildemeans, btildesds)


factorsreal <- emfahighdim(eps=scale(X, scale=FALSE), nbf=5)

Xb1sc <- scale(X[batch==1,], scale=FALSE)
fac1 <- factorsreal$Factors[batch==1,]
Bb1 <- t(lm(Xb1sc ~ 0 + fac1)$coefficients)
Btilde1 <- Bb1 - factorsreal$B

Xb2sc <- scale(X[batch==2,], scale=FALSE)
fac2 <- factorsreal$Factors[batch==2,]
Bb2 <- t(lm(Xb2sc ~ 0 + fac2)$coefficients)
Btilde2 <- Bb2 - factorsreal$B

btilde1realmeans <- colMeans(Btilde1)
btilde2realmeans <- colMeans(Btilde2)

btilde1realsds <- apply(Btilde1, 2, sd)
btilde2realsds <- apply(Btilde2, 2, sd)

btilde1realmeans
btilde2realmeans

btilde1realsds
btilde2realsds

# Real vs. simulated data:
plot(rep(1:3, times=c(length(btildemeans), length(btilde1realmeans), 
  length(btilde2realmeans))), c(btildemeans, btilde1realmeans, btilde2realmeans), xaxt="n", xlab="", main="means")
axis(1, at=1:3, labels=c("simulation", "real data - batch 1", "real data - batch 2"))
plot(rep(1:3, times=c(length(btildesds), length(btilde1realsds), 
  length(btilde2realsds))), c(btildesds, btilde1realsds, btilde2realsds), xaxt="n", xlab="", main="standard deviations")
axis(1, at=1:3, labels=c("simulation", "real data - batch 1", "real data - batch 2"))





# Now generate the parameters for all simulation scenarios as described in the paper
# and store them in different Rda-files:

scenario <- c("b0scen", "bjscen", "btildescen")

set.seed(4321) 
  
p <- 1000 
numbatch <- 4 
batchn <- 25

alphag <- rnorm(p, mean=1.8, sd=0.4) 

betag <- simulatebetag(1000)

gammajg <- matrix(nrow=numbatch, ncol=p, 
  data=c(rnorm(p, mean=gammajgmeans[1], sd=gammajgsds[1]), 
  rnorm(p, mean=gammajgmeans[2], sd=gammajgsds[2]), 
  rnorm(p, mean=gammajgmeans[3], sd=gammajgsds[3]), 
  rnorm(p, mean=gammajgmeans[4], sd=gammajgsds[4])), byrow=TRUE) 

sigmag2 <- rigamma(p, alpha = 3, beta = 0.13) 

deltajg2 <- matrix(nrow=numbatch, ncol=p, 
  data=c(rigamma(p, alpha=deltajg2alphas[1], beta=deltajg2betas[1]), 
  rigamma(p, alpha=deltajg2alphas[2], beta=deltajg2betas[2]), 
  rigamma(p, alpha=deltajg2alphas[3], beta=deltajg2betas[3]), 
  rigamma(p, alpha=deltajg2alphas[4], beta=deltajg2betas[4])), byrow=TRUE) 

B0mat <- rbind(rnorm(p, mean=b0means[1], sd=b0sds[1]), 
  rnorm(p, mean=b0means[2], sd=b0sds[2]), 
  rnorm(p, mean=b0means[3], sd=b0sds[3]), 
  rnorm(p, mean=b0means[4], sd=b0sds[4]), 
  rnorm(p, mean=b0means[5], sd=b0sds[5])) 

for(scenind in 1:length(scenario)) {

if(scenario[scenind]=="b0scen") {
  Bjmat <- list(matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0),
    matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0),
    matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0),
    matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0))

  Btildemat <- matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0)
}

if(scenario[scenind]!="b0scen") {
  Bjmat <- list() 
  for(i in 1:4) 
  Bjmat[[i]] <- rbind(rnorm(p, mean=bjmeans[[i]][1], sd=bjsds[[i]][1]), 
    rnorm(p, mean=bjmeans[[i]][2], sd=bjsds[[i]][2]), 
    rnorm(p, mean=bjmeans[[i]][3], sd=bjsds[[i]][3]), 
    rnorm(p, mean=bjmeans[[i]][4], sd=bjsds[[i]][4]), 
    rnorm(p, mean=bjmeans[[i]][5], sd=bjsds[[i]][5]))
 
  "++" <- function(x, ...) if (nargs() == 1) x else x + Recall(...) 
  Bjmatmean <- do.call("++", Bjmat)/4 
 
  Bjmat <- lapply(Bjmat, function(x) x-Bjmatmean)

  if(scenario[scenind]!="btildescen")
    Btildemat <- matrix(nrow=nrow(B0mat), ncol=ncol(B0mat), data=0)
  else
    Btildemat <- rbind(rnorm(p, mean=btildemeans[1], sd=btildesds[1]), 
      rnorm(p, mean=btildemeans[2], sd=btildesds[2]), 
      rnorm(p, mean=btildemeans[3], sd=btildesds[3]), 
      rnorm(p, mean=btildemeans[4], sd=btildesds[4]), 
      rnorm(p, mean=btildemeans[5], sd=btildesds[5]))
}

eval(parse(text=paste("save(alphag, betag, gammajg, sigmag2, deltajg2, B0mat, Btildemat, Bjmat, batchn, file=\"./FAbatchPaper/InterimResults/SimulationParameters", scenario[scenind], ".Rda\")", sep="")))

}




# Simulate a couple of datasets and study the following plots:
# boxplots marking the batch-specific means of the variables, plots of the first two
# principal components highlighting the different batches and boxplots of the
# batch-specific correlations.

library("bapred")

pdf("./FAbatchPaper/InterimResults/ScenarioPlotBatchMethodPaper.pdf")

for(i in 1:length(scenario)) {

eval(parse(text=paste("load(\"./FAbatchPaper/InterimResults/SimulationParameters", scenario[i], ".Rda\")", sep="")))

# Function to simulate a dataset:

simuldata <- function(batchsize = c(50, 50, 50, 50)) {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))
  eps01 <- matrix(nrow=n, ncol=p, data=rnorm(n*p))
  Z <- matrix(nrow=n, ncol=5, data=rnorm(n*5))
  Zbatch <- matrix(nrow=n, ncol=5, data=rnorm(n*5))

  # Put the parameters into matrices:
  alphagmat <- matrix(nrow=n, ncol=p, data=rep(alphag, each=n))
  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  sigmag2mat <- matrix(nrow=n, ncol=p, data=rep(sigmag2, each=n))

  gammajgmat <- deltajg2mat <- matrix(nrow=n, ncol=p)

  for(i in 1:numbatch) {
    gammajgmat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(gammajg[i,], each=sum(batch==i)))
    deltajg2mat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(deltajg2[i,], each=sum(batch==i)))
  }

  # Generate the simulated data matrix without batch- and class-specific correlations:
  X <- alphagmat + apply(t(betagmat), 1, function(x) x*(y-1)) + gammajgmat +
    Z%*%B0mat + sqrt(deltajg2mat)*sqrt(sigmag2mat)*eps01

  # Add class-specific correlations (Btildemat = 0 in case of no class-specific correlation):
  X[y==2,] <- X[y==2,] + Z[y==2,]%*%Btildemat

  # Add batch-specific correlations (loadings 0 in case of no class-specific correlation):
  for(i in 1:numbatch) {
    X[batch==i,] <- X[batch==i,] + Zbatch[batch==i,]%*%Bjmat[[i]] 
  }

  # Return the data:
  return(list(X=X, y=factor(y), batch=factor(batch)))

}



layout(mat=rbind(c(1,1,1,1), 1+c(1,2,3,4), 1+c(5,5,5,5), 1+c(6,6,6,6)),
  heights=c(1,5, 5, 5))

# PC plots:
  
par(mar=c(0.1, 2.1, 0.1, 0.1))

plot(0,0, col="white")
text(0,0, scenario[i], cex=1.5)

for(j in 1:4) {

simuldatlist <- simuldata()

X <- simuldatlist$X
y <- simuldatlist$y
batch <- simuldatlist$batch

pcplot(x=X, batch=factor(batch), y=y)

}

# Boxplots of correlations:

boxplot(as.vector(cor(X[batch==1,])), as.vector(cor(X[batch==2,])), 
  as.vector(cor(X[batch==3,])), as.vector(cor(X[batch==4,])))#, border=4, add=TRUE)
abline(h=c(-1,0,1), col="grey")


# Boxplots marking the batch-specific means of the variables:

Xsub <- X[,sample(1:ncol(X), size=100)]

boxplot(Xsub)
points(rep(1:ncol(Xsub), each=4), as.vector(rbind(colMeans(Xsub[batch==1,]), 
colMeans(Xsub[batch==2,]), 
colMeans(Xsub[batch==3,]), 
colMeans(Xsub[batch==4,]))), col=rep(1:4, times=ncol(Xsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsub)))

par(mar=c(5.1, 4.1, 4.1, 2.1))


}

dev.off()






# Boxplots of the differences of the batch-specific correlations
# and boxplots of the differences of the class-specific correlations:


# Function to simulate a dataset:

simuldata <- function(batchsize = c(50, 50, 50, 50)) {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))
  eps01 <- matrix(nrow=n, ncol=p, data=rnorm(n*p))
  Z <- matrix(nrow=n, ncol=5, data=rnorm(n*5))
  Zbatch <- matrix(nrow=n, ncol=5, data=rnorm(n*5))

  # Put the parameters into matrices:
  alphagmat <- matrix(nrow=n, ncol=p, data=rep(alphag, each=n))
  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  sigmag2mat <- matrix(nrow=n, ncol=p, data=rep(sigmag2, each=n))

  gammajgmat <- deltajg2mat <- matrix(nrow=n, ncol=p)

  for(i in 1:numbatch) {
    gammajgmat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(gammajg[i,], each=sum(batch==i)))
    deltajg2mat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(deltajg2[i,], each=sum(batch==i)))
  }

  # Generate the simulated data matrix without batch- and class-specific correlations:
  X <- alphagmat + apply(t(betagmat), 1, function(x) x*(y-1)) + gammajgmat +
    Z%*%B0mat + sqrt(deltajg2mat)*sqrt(sigmag2mat)*eps01

  # Add class-specific correlations (Btildemat = 0 in case of no class-specific correlation):
  X[y==2,] <- X[y==2,] + Z[y==2,]%*%Btildemat

  # Add batch-specific correlations (loadings 0 in case of no class-specific correlation):
  for(i in 1:numbatch) {
    X[batch==i,] <- X[batch==i,] + Zbatch[batch==i,]%*%Bjmat[[i]] 
  }

  # Return the data:
  return(list(X=X, y=factor(y), batch=factor(batch)))

}


# Simulate one dataset per scenario:

load("./FAbatchPaper/InterimResults/SimulationParametersb0scen.Rda")
simuldatlist <- simuldata()

X1 <- simuldatlist$X
y1 <- simuldatlist$y
batch1 <- simuldatlist$batch


load("./FAbatchPaper/InterimResults/SimulationParametersbjscen.Rda")
simuldatlist <- simuldata()

X2 <- simuldatlist$X
y2 <- simuldatlist$y
batch2 <- simuldatlist$batch


load("./FAbatchPaper/InterimResults/SimulationParametersbtildescen.Rda")
simuldatlist <- simuldata()

X3 <- simuldatlist$X
y3 <- simuldatlist$y
batch3 <- simuldatlist$batch



# Calculate correlations:

cor11 <- cor(X1[batch1==1,1:500])
cor12 <- cor(X1[batch1==2,1:500])
cor13 <- cor(X1[batch1==3,1:500])
cor14 <- cor(X1[batch1==4,1:500])

cor21 <- cor(X2[batch1==1,1:500])
cor22 <- cor(X2[batch1==2,1:500])
cor23 <- cor(X2[batch1==3,1:500])
cor24 <- cor(X2[batch1==4,1:500])

cor31 <- cor(X3[batch1==1,1:500])
cor32 <- cor(X3[batch1==2,1:500])
cor33 <- cor(X3[batch1==3,1:500])
cor34 <- cor(X3[batch1==4,1:500])


cors1 <- cbind(as.vector(cor11), as.vector(cor12), as.vector(cor13), as.vector(cor14))
cors2 <- cbind(as.vector(cor21), as.vector(cor22), as.vector(cor23), as.vector(cor24))
cors3 <- cbind(as.vector(cor31), as.vector(cor32), as.vector(cor33), as.vector(cor34))


# Calculate differences of batch-specific correlations and make boxplots:

combns <- combn(4, 2)
combns

diffs1 <- apply(combns, 2, function(x) cors1[,x[1]] - cors1[,x[2]])
diffs2 <- apply(combns, 2, function(x) cors2[,x[1]] - cors2[,x[2]])
diffs3 <- apply(combns, 2, function(x) cors3[,x[1]] - cors3[,x[2]])

par(mfrow=c(1,3))
boxplot(diffs1)
boxplot(diffs2)
boxplot(diffs3)
par(mfrow=c(1,1))


# Calculate differences of class-specific correlations and make boxplots:

corclass21 <- cor(X2[y2==1,1:500])
corclass22 <- cor(X2[y2==2,1:500])

corclass31 <- cor(X3[y3==1,1:500])
corclass32 <- cor(X3[y3==2,1:500])

boxplot(as.vector(corclass21) - as.vector(corclass22), as.vector(corclass31) - as.vector(corclass32))
var(as.vector(corclass21) - as.vector(corclass22)) 
var(as.vector(corclass31) - as.vector(corclass32))








#######################################################################################
#                                                                                     # 
#  Second part: Set up those simulation scenarios, which do not involve factors, but  #
#  in which the correlation matrices are estimated using real data (Design B).        #
#                                                                                     # 
#######################################################################################



# Scenario with same correlations in all batches.


# Except for the correlations all other parameters are set equal to those
# used in the corresponding scenario of the simulation with factors:
load("./FAbatchPaper/InterimResults/SimulationParametersb0scen.Rda")

# Make covariance matrices in batches.

# We do not want to generate the correlation matrices from factor models,
# but it is also not possible to choose arbitrary correlations between the
# variables, because then the correlation matrix will not be positive definite.

# Therefore we use a real dataset to estimate the correlation matrix and then
# determine the nearest positive correlation matrix.

# Load real dataset used for estimating the correlations:
# NOTE: The Rda-file loaded here is NOT included in the electronic appendix. To obtain it,
# one has to run the R-script 'AutismTranscr_preparationinfos.R' found in the folder
# 'FAbatchPaper/Datasets/PreparationScripts'.
load("./FAbatchPaper/Datasets/ProcessedData/AutismTranscr.Rda")

dim(X)
table(y)
table(batch)


# Choose the 1000 variables in batch 2 which show themselves most related
# to the outcome:
library(CMA)
selg <- GeneSelection(y = y[batch==2], X = X[batch==2,], method = 't.test')@rankings[[1]][1:1000]
Xsub <- X[,selg]

# Permute the 1000 selected variables randomly (not necessary in principle):
set.seed(1234)
Xsub <- Xsub[,sample(1:ncol(Xsub))]


# Remove the class signal in batch 2:
Xsub2 <- Xsub[batch==2,]
Xsub2[y[batch==2]==1,] <- scale(Xsub2[y[batch==2]==1,], scale=FALSE)
Xsub2[y[batch==2]==2,] <- scale(Xsub2[y[batch==2]==2,], scale=FALSE)

# Estimate the correlation matrix:
cormatest <- cor(Xsub2)

# Not positive definite yet:
library("corpcor")
is.positive.definite(cormatest)

# Find the nearest positive definite correlation matrix:
library("Matrix")
cormatpdest <- nearPD(cormatest, corr=TRUE)$mat
is.positive.definite(cormatpdest)


# Correlations look realistic: 
boxplot(cormatpdest[upper.tri(cormatest)])
abline(h=0, col="grey", lty=2)


# Difference between correlations from non-positive definite correlation
# matrix and those from nearest positive definite correlation matrix:

boxplot(cormatpdest[upper.tri(cormatest)] - cormatest[upper.tri(cormatest)])

# --> Almost no differences.


# Correlation matrix is of class "dpoMatrix", has to be however
# of class "matrix" for "rmvnorm" to work:
cormatpdest <- as.matrix(cormatpdest)


# Some informations necessary for the simulation:

batchsize <- rep(25, 4)

n <- sum(batchsize)

numbatch <- 4
batch <- rep(1:numbatch, times=batchsize)

y <- rep(NA, length(batch))
for(j in 1:length(unique(batch)))
  y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))


# Now calculate the covariance matrices, by multiplying by the variances:

covbatches <- list()
for(j in 1:length(batchsize))
  covbatches[[j]] <- cormatpdest*(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])%*%t(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])))

sapply(covbatches, is.positive.definite)



# Simulate a dataset:

Xsim <- matrix(nrow=n, ncol=length(alphag))
library("mvtnorm")
for(j in 1:length(batchsize))
  Xsim[batch==j,] <- rmvnorm(n = batchsize[j], mean = alphag + gammajg[j,], sigma = covbatches[[j]])

betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
Xsim <- Xsim + apply(t(betagmat), 1, function(x) x*(y-1))


# Simulate a dataset with factor-induced correlations
# for comparison:
Xsimfac <- simuldata(batchsize = rep(25,4))$X


# Compare the two:

par(mfrow=c(1,2))
pcplot(x=Xsim, batch=factor(batch), y=factor(y))
pcplot(x=Xsimfac, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))


randinds <- sample(1:ncol(Xsim), size=50)
Xsimsub <- Xsim[,randinds]
Xsimfacsub <- Xsimfac[,randinds]

par(mfrow=c(1,2))
boxplot(Xsimsub, main="Correlation matrix estimated on real data")
points(rep(1:ncol(Xsimsub), each=4), as.vector(rbind(colMeans(Xsimsub[batch==1,]), 
colMeans(Xsimsub[batch==2,]), 
colMeans(Xsimsub[batch==3,]), 
colMeans(Xsimsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimsub)))

boxplot(Xsimfacsub, main="Correlation matrix induced by factors")
points(rep(1:ncol(Xsimfacsub), each=4), as.vector(rbind(colMeans(Xsimfacsub[batch==1,]), 
colMeans(Xsimfacsub[batch==2,]), 
colMeans(Xsimfacsub[batch==3,]), 
colMeans(Xsimfacsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimfacsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimfacsub)))
par(mfrow=c(1,1))

# --> The simulated data seems to behave realistically.

# String to identify the simulation scenario in the function for simulating:
scendesign <- "CommonCor"


# Save simulation parameters:

save(alphag, betag, gammajg, covbatches, scendesign, file="./FAbatchPaper/InterimResults/SimulationParametersCommonCor.Rda")





# Check the signal strength for the above setting and for the corresponding
# setting in the case of simulating with factors (Design A).


# Transform 'batch' and 'y' into factors:
batch <- factor(batch)
y <- factor(y)

batchsafe <- batch
ysafe <- y


# LASSO:
# Using CMA, to predict new observations one has to use the original
# function, the corresponding CMA-functions are calling, since CMA doesn't
# have this option - with LASSO the problem is, that whereas "LassoCMA" only
# requires a fractional value, "glmnet" uses the absolute value, that means,
# for prediction with "predict.glmnet" one has to determine the absolute 
# lambda-values, corresponding to the fractional values. The following function
# does this:

getlambda <- function(norm.fraction, output) {

    nbeta = rbind2(output$a0, output$beta)
    nbeta <- as.matrix(nbeta)

    b <- t(nbeta)

    s <- norm.fraction

    k <- nrow(b)

    std.b <- scale(b[, -1], FALSE, 1/apply(output$call$x, 2, sd))

    bnorm <- apply(abs(std.b), 1, sum)
    sb <- bnorm/bnorm[k]

    sfrac <- (s - sb[1])/(sb[k] - sb[1])
    sb <- (sb - sb[1])/(sb[k] - sb[1])

    usb <- unique(sb)

    useq <- match(usb, sb)

    sb <- sb[useq]
    b <- b[useq, ]

    coord <- approx(sb, seq(sb), sfrac)$y

    left <- floor(coord)
    right <- ceiling(coord)

    (output$lambda[left] + output$lambda[right])/2

}



# Simulation with multivariate normal distribution: 
# Cross-batch prediction error of Lasso when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsraw1 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)




# Simulation with multivariate normal distribution:
# Cross-batch prediction error when using ComBat:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)

  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorscombat1 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)







# Simulation with factor-induced correlations: 

paramsfac <- simuldata(batchsize = rep(25,4))

Xsimfac <- paramsfac$X
y <- paramsfac$y
batch <- paramsfac$batch


# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfacraw1 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)



# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using ComBat:


norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))


  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)
  
  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfaccombat1 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)


rbind(errorsraw1, errorscombat1, errorsfacraw1, errorsfaccombat1)
rowMeans(rbind(errorsraw1, errorscombat1, errorsfacraw1, errorsfaccombat1))








# Scenario with different correlations in the batches.
# NOTE: This code is not entirely commented because of its similarity
# to the above code used for the first scenario.

load("./FAbatchPaper/InterimResults/SimulationParametersbjscen.Rda")

load("./FAbatchPaper/Datasets/ProcessedData/AutismTranscr.Rda")

dim(X)
table(y)
table(batch)


# Choose the 1000 variables in batch 2 which show themselves most related
# to the outcome:
library(CMA)
selg <- GeneSelection(y = y[batch==2], X = X[batch==2,], method = 't.test')@rankings[[1]][1:1000]
Xsub <- X[,selg]

# Permute the 1000 selected variables randomly (not necessary in principle):
set.seed(1234)
Xsub <- Xsub[,sample(1:ncol(Xsub))]

Xsubnocl <- matrix(nrow=nrow(Xsub), ncol=ncol(Xsub))
for(i in 1:length(levels(batch))) {
  Xsubnocl[batch==i,][y[batch==i]==1,] <- scale(Xsub[batch==i,][y[batch==i]==1,], scale=FALSE)
  Xsubnocl[batch==i,][y[batch==i]==2,] <- scale(Xsub[batch==i,][y[batch==i]==2,], scale=FALSE)
}

par(mfrow=c(1,2))
pcplot(x=Xsub, batch=factor(batch), y=factor(y))
pcplot(x=Xsubnocl, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))

# --> Class-signal has to be removed in order to obtain realistic estimates.


# --> Outlier detected. --> Remove:

xpr <- prcomp(Xsubnocl, scale. = FALSE)
xp <- predict(xpr)[,1:2]

which(xp[,2]< -20)

X <- X[-which(xp[,2]< -20),]
y <- y[-which(xp[,2]< -20)]
batch <- batch[-which(xp[,2]< -20)]



# Repeat:

Xsub <- X[,selg]

# Permute the 1000 selected variables randomly (not necessary in principle):
set.seed(1234)
Xsub <- Xsub[,sample(1:ncol(Xsub))]

Xsubnocl <- matrix(nrow=nrow(Xsub), ncol=ncol(Xsub))
for(i in 1:length(levels(batch))) {
  Xsubnocl[batch==i,][y[batch==i]==1,] <- scale(Xsub[batch==i,][y[batch==i]==1,], scale=FALSE)
  Xsubnocl[batch==i,][y[batch==i]==2,] <- scale(Xsub[batch==i,][y[batch==i]==2,], scale=FALSE)
}

par(mfrow=c(1,2))
pcplot(x=Xsub, batch=factor(batch), y=factor(y))
pcplot(x=Xsubnocl, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))

# --> OK.


# Estimate the batch-specific correlation matrices:

cormatest <- list()
for(j in 1:4)
  cormatest[[j]] <- cor(Xsubnocl[batch==j,])

# Not positive definite yet:
library("corpcor")
sapply(cormatest, is.positive.definite)


# Find the nearest positive definite correlation matrices:
library("Matrix")
cormatpdest <- lapply(cormatest, function(x) as.matrix(nearPD(x, corr=TRUE)$mat))
sapply(cormatpdest, is.positive.definite)


# Correlations look realistic: 
boxplot(cormatpdest[[1]][upper.tri(cormatest[[1]])], cormatpdest[[2]][upper.tri(cormatest[[1]])], cormatpdest[[3]][upper.tri(cormatest[[1]])], cormatpdest[[4]][upper.tri(cormatest[[1]])])
abline(h=0, col="grey", lty=2)


# Difference between correlations from non-positive definite correlation
# matrices and those from nearest positive definite correlation matrices:

boxplot(cormatpdest[[1]][upper.tri(cormatest[[1]])] - cormatest[[1]][upper.tri(cormatest[[1]])],
  cormatpdest[[2]][upper.tri(cormatest[[1]])] - cormatest[[2]][upper.tri(cormatest[[1]])],
  cormatpdest[[3]][upper.tri(cormatest[[1]])] - cormatest[[3]][upper.tri(cormatest[[1]])],
  cormatpdest[[4]][upper.tri(cormatest[[1]])] - cormatest[[4]][upper.tri(cormatest[[1]])])

# --> Almost no differences.




# Some informations necessary for the simulation:

batchsize <- rep(25, 4)

n <- sum(batchsize)

numbatch <- 4
batch <- rep(1:numbatch, times=batchsize)

y <- rep(NA, length(batch))
for(j in 1:length(unique(batch)))
  y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))


# Now calculate the covariance matrices, by multiplying by the variances:

covbatches <- list()
for(j in 1:length(batchsize))
  covbatches[[j]] <- cormatpdest[[j]]*(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])%*%t(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])))

sapply(covbatches, is.positive.definite)



# Simulate a dataset:

Xsim <- matrix(nrow=n, ncol=length(alphag))
for(j in 1:length(batchsize))
  Xsim[batch==j,] <- rmvnorm(n = batchsize[j], mean = alphag + gammajg[j,], sigma = covbatches[[j]])

betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
Xsim <- Xsim + apply(t(betagmat), 1, function(x) x*(y-1))


# Simulate a dataset with factor-induced correlations
# for comparison:
Xsimfac <- simuldata(batchsize = rep(25,4))$X

# Compare the two:
par(mfrow=c(1,2))
pcplot(x=Xsim, batch=factor(batch), y=factor(y))
pcplot(x=Xsimfac, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))


randinds <- sample(1:ncol(Xsim), size=50)
Xsimsub <- Xsim[,randinds]
Xsimfacsub <- Xsimfac[,randinds]

par(mfrow=c(1,2))
boxplot(Xsimsub, main="Correlation matrix estimated on real data")
points(rep(1:ncol(Xsimsub), each=4), as.vector(rbind(colMeans(Xsimsub[batch==1,]), 
colMeans(Xsimsub[batch==2,]), 
colMeans(Xsimsub[batch==3,]), 
colMeans(Xsimsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimsub)))

boxplot(Xsimfacsub, main="Correlation matrix induced by factors")
points(rep(1:ncol(Xsimfacsub), each=4), as.vector(rbind(colMeans(Xsimfacsub[batch==1,]), 
colMeans(Xsimfacsub[batch==2,]), 
colMeans(Xsimfacsub[batch==3,]), 
colMeans(Xsimfacsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimfacsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimfacsub)))
par(mfrow=c(1,1))

# --> The simulated data seems to behave realistically.

# String to identify the simulation scenario in the function for simulating:
scendesign <- "BatchCor"


# Save simulation parameters:

save(alphag, betag, gammajg, covbatches, scendesign, file="./FAbatchPaper/InterimResults/SimulationParametersBatchspecificCor.Rda")








# Check the signal strength for the above setting and for the corresponding
# setting in the case of simulating with factors (Design A).


# Transform 'batch' and 'y' into factors:
batch <- factor(batch)
y <- factor(y)

batchsafe <- batch
ysafe <- y


# LASSO:
# Using CMA, to predict new observations one has to use the original
# function, the corresponding CMA-functions are calling, since CMA doesn't
# have this option - with LASSO the problem is, that whereas "LassoCMA" only
# requires a fractional value, "glmnet" uses the absolute value, that means,
# for prediction with "predict.glmnet" one has to determine the absolute 
# lambda-values, corresponding to the fractional values. The following function
# does this:

getlambda <- function(norm.fraction, output) {

    nbeta = rbind2(output$a0, output$beta)
    nbeta <- as.matrix(nbeta)

    b <- t(nbeta)

    s <- norm.fraction

    k <- nrow(b)

    std.b <- scale(b[, -1], FALSE, 1/apply(output$call$x, 2, sd))

    bnorm <- apply(abs(std.b), 1, sum)
    sb <- bnorm/bnorm[k]

    sfrac <- (s - sb[1])/(sb[k] - sb[1])
    sb <- (sb - sb[1])/(sb[k] - sb[1])

    usb <- unique(sb)

    useq <- match(usb, sb)

    sb <- sb[useq]
    b <- b[useq, ]

    coord <- approx(sb, seq(sb), sfrac)$y

    left <- floor(coord)
    right <- ceiling(coord)

    (output$lambda[left] + output$lambda[right])/2

}



# Simulation with multivariate normal distribution: 
# Cross-batch prediction error when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsraw2 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)




# Simulation with multivariate normal distribution:
# Cross-batch prediction error when using ComBat:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))


  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)

  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorscombat2 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)







# Simulation with factor-induced correlations: 

paramsfac <- simuldata(batchsize = rep(25,4))

Xsimfac <- paramsfac$X
y <- paramsfac$y
batch <- paramsfac$batch



# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfacraw2 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)




# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using ComBat:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))


  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)

  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfaccombat2 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)



rbind(errorsraw2, errorscombat2, errorsfacraw2, errorsfaccombat2)
rowMeans(rbind(errorsraw2, errorscombat2, errorsfacraw2, errorsfaccombat2))













# Scenario with batch-specific and class-specific correlations in the batches.
# NOTE: This code is not entirely commented because of its similarity
# to the above code used for the first scenario.

load("./FAbatchPaper/InterimResults/SimulationParametersbtildescen.Rda")

load("./FAbatchPaper/Datasets/ProcessedData/AutismTranscr.Rda")

dim(X)
table(y)
table(batch)


# Choose the 1000 variables in batch 2 which show themselves most related
# to the outcome:
library(CMA)
selg <- GeneSelection(y = y[batch==2], X = X[batch==2,], method = 't.test')@rankings[[1]][1:1000]
Xsub <- X[,selg]

# Permute the 1000 selected variables randomly (not necessary in principle):
set.seed(1234)
Xsub <- Xsub[,sample(1:ncol(Xsub))]

Xsubnocl <- matrix(nrow=nrow(Xsub), ncol=ncol(Xsub))
for(i in 1:length(levels(batch))) {
  Xsubnocl[batch==i,][y[batch==i]==1,] <- scale(Xsub[batch==i,][y[batch==i]==1,], scale=FALSE)
  Xsubnocl[batch==i,][y[batch==i]==2,] <- scale(Xsub[batch==i,][y[batch==i]==2,], scale=FALSE)
}

par(mfrow=c(1,2))
pcplot(x=Xsub, batch=factor(batch), y=factor(y))
pcplot(x=Xsubnocl, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))

# --> Class-signal has to be removed in order to obtain realistic estimates.


# --> Outlier detected. --> Remove:

xpr <- prcomp(Xsubnocl, scale. = FALSE)
xp <- predict(xpr)[,1:2]

which(xp[,2]< -20)

X <- X[-which(xp[,2]< -20),]
y <- y[-which(xp[,2]< -20)]
batch <- batch[-which(xp[,2]< -20)]



# Repeat:

Xsub <- X[,selg]

# Permute the 1000 selected variables randomly (not necessary in principle):
set.seed(1234)
Xsub <- Xsub[,sample(1:ncol(Xsub))]

Xsubnocl <- matrix(nrow=nrow(Xsub), ncol=ncol(Xsub))
for(i in 1:length(levels(batch))) {
  Xsubnocl[batch==i,][y[batch==i]==1,] <- scale(Xsub[batch==i,][y[batch==i]==1,], scale=FALSE)
  Xsubnocl[batch==i,][y[batch==i]==2,] <- scale(Xsub[batch==i,][y[batch==i]==2,], scale=FALSE)
}

par(mfrow=c(1,2))
pcplot(x=Xsub, batch=batch, y=y)
pcplot(x=Xsubnocl, batch=batch, y=y)
par(mfrow=c(1,1))

# --> OK.


# Estimate the batch- and class-specific correlation matrices:

cormatest <- list()
for(j in 1:4) {
  cormatest[[j]] <- list()
  for(k in 1:2)
    cormatest[[j]][[k]] <- cor(Xsubnocl[batch==j,][y[batch==j]==k,])
}

# Not positive definite yet:
library("corpcor")
sapply(cormatest, function(x) sapply(x, is.positive.definite))


# Find the nearest positive definite correlation matrices:
library("Matrix")
cormatpdest <- lapply(cormatest, function(x) lapply(x, function(y) as.matrix(nearPD(y, corr=TRUE)$mat)))
sapply(cormatpdest, function(x) sapply(x, is.positive.definite))


# Correlations look realistic: 
boxplot(cormatpdest[[1]][[1]][upper.tri(cormatest[[1]][[1]])], cormatpdest[[1]][[2]][upper.tri(cormatest[[1]][[1]])], 
cormatpdest[[2]][[1]][upper.tri(cormatest[[1]][[1]])], cormatpdest[[2]][[2]][upper.tri(cormatest[[1]][[1]])], 
cormatpdest[[3]][[1]][upper.tri(cormatest[[1]][[1]])], cormatpdest[[3]][[2]][upper.tri(cormatest[[1]][[1]])], 
cormatpdest[[4]][[1]][upper.tri(cormatest[[1]][[1]])], cormatpdest[[4]][[2]][upper.tri(cormatest[[1]][[1]])])

abline(h=0, col="grey", lty=2)


# Difference between correlations from non-positive definite correlation
# matrices and those from nearest positive definite correlation matrices:

boxplot(cormatpdest[[1]][[1]][upper.tri(cormatest[[1]][[1]])] - cormatest[[1]][[1]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[1]][[2]][upper.tri(cormatest[[1]][[1]])] - cormatest[[1]][[2]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[2]][[1]][upper.tri(cormatest[[1]][[1]])] - cormatest[[2]][[1]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[2]][[2]][upper.tri(cormatest[[1]][[1]])] - cormatest[[2]][[2]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[3]][[1]][upper.tri(cormatest[[1]][[1]])] - cormatest[[3]][[1]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[3]][[2]][upper.tri(cormatest[[1]][[1]])] - cormatest[[3]][[2]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[4]][[1]][upper.tri(cormatest[[1]][[1]])] - cormatest[[4]][[1]][upper.tri(cormatest[[1]][[1]])],
  cormatpdest[[4]][[2]][upper.tri(cormatest[[1]][[1]])] - cormatest[[4]][[2]][upper.tri(cormatest[[1]][[1]])])

# --> Almost no differences.




# Some informations necessary for the simulation:

batchsize <- rep(25, 4)

n <- sum(batchsize)

numbatch <- 4
batch <- rep(1:numbatch, times=batchsize)

y <- rep(NA, length(batch))
for(j in 1:length(unique(batch)))
  y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))

  
# Now calculate the covariance matrices, by multiplying by the variances:

covbatches <- list()
for(j in 1:length(batchsize)) {
  covbatches[[j]] <- list()
  for(k in 1:2)
    covbatches[[j]][[k]] <- cormatpdest[[j]][[k]]*(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])%*%t(sqrt(colSums(B0mat^2) + sigmag2*deltajg2[j,])))
}

sapply(covbatches, function(x) sapply(x, is.positive.definite))



# Simulate a dataset:

Xsim <- matrix(nrow=n, ncol=length(alphag))
for(j in 1:length(batchsize)) {
  for(k in 1:2)
    Xsim[batch==j,][y[batch==j]==k,] <- rmvnorm(n = sum((batch==j) & (y==k)), mean = alphag + gammajg[j,], sigma = covbatches[[j]][[k]])
}

betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
Xsim <- Xsim + apply(t(betagmat), 1, function(x) x*(y-1))


# Simulate a dataset with factor-induced correlations
# for comparison:
Xsimfac <- simuldata(batchsize = rep(25,4))$X


# Compare the two:
par(mfrow=c(1,2))
pcplot(x=Xsim, batch=factor(batch), y=factor(y))
pcplot(x=Xsimfac, batch=factor(batch), y=factor(y))
par(mfrow=c(1,1))


randinds <- sample(1:ncol(Xsim), size=50)
Xsimsub <- Xsim[,randinds]
Xsimfacsub <- Xsimfac[,randinds]

par(mfrow=c(1,2))
boxplot(Xsimsub, main="Correlation matrix estimated on real data")
points(rep(1:ncol(Xsimsub), each=4), as.vector(rbind(colMeans(Xsimsub[batch==1,]), 
colMeans(Xsimsub[batch==2,]), 
colMeans(Xsimsub[batch==3,]), 
colMeans(Xsimsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimsub)))

boxplot(Xsimfacsub, main="Correlation matrix induced by factors")
points(rep(1:ncol(Xsimfacsub), each=4), as.vector(rbind(colMeans(Xsimfacsub[batch==1,]), 
colMeans(Xsimfacsub[batch==2,]), 
colMeans(Xsimfacsub[batch==3,]), 
colMeans(Xsimfacsub[batch==4,]))), col=rep(1:4, times=ncol(Xsimfacsub)), 
  pch=rep(c(20, 2:4), times=ncol(Xsimfacsub)))
par(mfrow=c(1,1))

# --> The simulated data seems to behave realistically.

# String to identify the simulation scenario in the function for simulating:
scendesign <- "BatchClassCor"


# Save simulation parameters:

save(alphag, betag, gammajg, covbatches, scendesign, file="./FAbatchPaper/InterimResults/SimulationParametersBatchClassspecificCor.Rda")





# Check the signal strength for the above setting and for the corresponding
# setting in the case of simulating with factors (Design A).


# Transform 'batch' and 'y' into factors:
batch <- factor(batch)
y <- factor(y)

batchsafe <- batch
ysafe <- y


# LASSO:
# Using CMA, to predict new observations one has to use the original
# function, the corresponding CMA-functions are calling, since CMA doesn't
# have this option - with LASSO the problem is, that whereas "LassoCMA" only
# requires a fractional value, "glmnet" uses the absolute value, that means,
# for prediction with "predict.glmnet" one has to determine the absolute 
# lambda-values, corresponding to the fractional values. The following function
# does this:

getlambda <- function(norm.fraction, output) {

    nbeta = rbind2(output$a0, output$beta)
    nbeta <- as.matrix(nbeta)

    b <- t(nbeta)

    s <- norm.fraction

    k <- nrow(b)

    std.b <- scale(b[, -1], FALSE, 1/apply(output$call$x, 2, sd))

    bnorm <- apply(abs(std.b), 1, sum)
    sb <- bnorm/bnorm[k]

    sfrac <- (s - sb[1])/(sb[k] - sb[1])
    sb <- (sb - sb[1])/(sb[k] - sb[1])

    usb <- unique(sb)

    useq <- match(usb, sb)

    sb <- sb[useq]
    b <- b[useq, ]

    coord <- approx(sb, seq(sb), sfrac)$y

    left <- floor(coord)
    right <- ceiling(coord)

    (output$lambda[left] + output$lambda[right])/2

}



# Simulation with multivariate normal distribution: 
# Cross-batch prediction error when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsraw3 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)




# Simulation with multivariate normal distribution:
# Cross-batch prediction error when using ComBat:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsim[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsim[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))


  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)

  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorscombat3 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)







# Simulation with factor-induced correlations: 

paramsfac <- simuldata(batchsize = rep(25,4))

Xsimfac <- paramsfac$X
y <- paramsfac$y
batch <- paramsfac$batch



# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using the raw data:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))

  tunecv <- tune(Xtrain, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtrain, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]

  ytesthat <- predict(object=lasso, newx=Xtest, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfacraw3 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)




# Simulation with factor-induced correlations: 
# Cross-batch prediction error when using ComBat:

norm.fraction.grid <- seq(from=0.1, to=0.9, length=9)
errors <- 0

for(j in 1:length(levels(batch))) {

  Xtrain <- Xsimfac[batch!=j,]
  ytrain <- y[batch!=j]
  batchtrain <- factor(as.numeric(factor(as.numeric(batch[batch!=j]))))

  Xtest <- Xsimfac[batch==j,]
  ytest <- y[batch==j]
  batchtest <- factor(as.numeric(factor(as.numeric(batch[batch==j]))))


  combattrain <- ba(x=Xtrain, batch=batchtrain, method = "combat")
  Xtraincombat <- combattrain$xadj

  tunecv <- tune(Xtraincombat, ytrain, classifier = LassoCMA, grids = list(norm.fraction = norm.fraction.grid))
  norm.fraction.opt <- norm.fraction.grid[which.min(tunecv@tuneres[[1]])]

  classifiercma <- LassoCMA(Xtraincombat, ytrain, norm.fraction = norm.fraction.opt, models=TRUE)
  lasso <- classifiercma@model[[1]]


  Xtestcombat <- baaddon(params=combattrain, x=Xtest, batch=batchtest)

  ytesthat <- predict(object=lasso, newx=Xtestcombat, type="response", s=getlambda(norm.fraction.opt, lasso))
  ytesthat <- as.numeric(ytesthat>0.5)+1

  errors[j] <- mean(ytest!=ytesthat)

}

errors
mean(errors)

errorsfaccombat3 <- errors

apply(apply(table(batch, y), 1, function(x) x/sum(x)), 2, min)



rbind(errorsraw3, errorscombat3, errorsfacraw3, errorsfaccombat3)
rowMeans(rbind(errorsraw3, errorscombat3, errorsfacraw3, errorsfaccombat3))






# CV errors out of all scenarios:

rbind(errorsraw1, errorscombat1, errorsfacraw1, errorsfaccombat1)
rowMeans(rbind(errorsraw1, errorscombat1, errorsfacraw1, errorsfaccombat1))

rbind(errorsraw2, errorscombat2, errorsfacraw2, errorsfaccombat2)
rowMeans(rbind(errorsraw2, errorscombat2, errorsfacraw2, errorsfaccombat2))

rbind(errorsraw3, errorscombat3, errorsfacraw3, errorsfaccombat3)
rowMeans(rbind(errorsraw3, errorscombat3, errorsfacraw3, errorsfaccombat3))

# --> OK.


# Clear workspace:
rm(list=ls());gc()












# PC plots - one plot per scenario.


# Function to simulate a dataset:

simuldata <- function(batchsize = c(50, 50, 50, 50), indscen) {

  # Load simulation parameters:

  objectsnow <- ls()
  eval(parse(text=paste("load(\"./FAbatchPaper/InterimResults/SimulationParameters", indscen, ".Rda\")", sep="")))

  if(!exists("scendesign")) {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))
  eps01 <- matrix(nrow=n, ncol=p, data=rnorm(n*p))
  Z <- matrix(nrow=n, ncol=5, data=rnorm(n*5))
  Zbatch <- matrix(nrow=n, ncol=5, data=rnorm(n*5))

  # Put the parameters into matrices:
  alphagmat <- matrix(nrow=n, ncol=p, data=rep(alphag, each=n))
  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  sigmag2mat <- matrix(nrow=n, ncol=p, data=rep(sigmag2, each=n))

  gammajgmat <- deltajg2mat <- matrix(nrow=n, ncol=p)

  for(i in 1:numbatch) {
    gammajgmat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(gammajg[i,], each=sum(batch==i)))
    deltajg2mat[batch==i,] <- matrix(nrow=sum(batch==i), ncol=p, data=rep(deltajg2[i,], each=sum(batch==i)))
  }

  # Generate the simulated data matrix without batch- and class-specific correlations:
  X <- alphagmat + apply(t(betagmat), 1, function(x) x*(y-1)) + gammajgmat +
    Z%*%B0mat + sqrt(deltajg2mat)*sqrt(sigmag2mat)*eps01

  # Add class-specific correlations (Btildemat = 0 in case of no class-specific correlation):
  X[y==2,] <- X[y==2,] + Z[y==2,]%*%Btildemat

  # Add batch-specific correlations (loadings 0 in case of no class-specific correlation):
  for(i in 1:numbatch) {
    X[batch==i,] <- X[batch==i,] + Zbatch[batch==i,]%*%Bjmat[[i]] 
  }

  }
  else {

  # Get a couple of properties:
  numbatch <- 4
  if(length(batchsize)!=numbatch)
    stop(paste("Number of batches has to be equal to ", numbatch, sep=""))
  n <- sum(batchsize)
  batch <- rep(1:numbatch, times=batchsize)

  # Load simulation parameters:
  p <- length(alphag)

  # Simulate random parts:
  y <- rep(NA, length(batch))
  for(j in 1:length(unique(batch)))
    y[batch==j] <- sample(c(rep(1, floor(sum(batch==j)/2)), rep(2, sum(batch==j)-floor(sum(batch==j)/2))))

  if(scendesign %in% c("CommonCor", "BatchCor")) {
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize))
      X[batch==j,] <- rmvnorm(n = batchsize[j], mean = alphag + gammajg[j,], sigma = covbatches[[j]])
  }

  if(scendesign=="BatchClassCor") {
    X <- matrix(nrow=n, ncol=length(alphag))
    for(j in 1:length(batchsize)) {
      for(k in 1:2)
        X[batch==j,][y[batch==j]==k,] <- rmvnorm(n = sum((batch==j) & (y==k)), mean = alphag + gammajg[j,], sigma = covbatches[[j]][[k]])
    }
  }

  betagmat <- matrix(nrow=n, ncol=p, data=rep(betag, each=n))
  X <- X + apply(t(betagmat), 1, function(x) x*(y-1))

  }

  rm(list=setdiff(ls(), c(objectsnow, "X", "y", "batch")))

  # Return the data:
  return(list(X=X, y=factor(y), batch=factor(batch)))

}


# Simulate datasets:

paramb0 <- simuldata(batchsize=rep(25,4), indscen="b0scen")
parambj <- simuldata(batchsize=rep(25,4), indscen="bjscen")
parambtilde <- simuldata(batchsize=rep(25,4), indscen="btildescen")

paramcomm <- simuldata(batchsize=rep(25,4), indscen="CommonCor")
parambatchc <- simuldata(batchsize=rep(25,4), indscen="BatchspecificCor")
parambatchclc <- simuldata(batchsize=rep(25,4), indscen="BatchClassspecificCor")


# PC plots:

library("bapred")

par(mfrow=c(2,3))
pcplot(x=paramb0$X, batch=paramb0$batch, y=paramb0$y)
pcplot(x=parambj$X, batch=parambj$batch, y=parambj$y)
pcplot(x=parambtilde$X, batch=parambtilde$batch, y=parambtilde$y)
pcplot(x=paramcomm$X, batch=paramcomm$batch, y=paramcomm$y)
pcplot(x=parambatchc$X, batch=parambatchc$batch, y=parambatchc$y)
pcplot(x=parambatchclc$X, batch=parambatchclc$batch, y=parambatchclc$y)
par(mfrow=c(1,1))








#######################################################################################
#                                                                                     # 
#  Third part: Set up the additional setting used in the cross-batch prediction       #
#  ("CrossBatchPredictionSimulation.R"), in which there is no correlation between     #
#  the predictors.                                                                    #
#                                                                                     # 
#######################################################################################


# Load one of the simulation scenarios from Design B:
load("./FAbatchPaper/InterimResults/SimulationParametersBatchspecificCor.Rda")

# String to identify the simulation scenario in the function for simulating:
scendesign <- "NoCor"

# Set the covariances between the variables to zero:
for(i in seq(along=covbatches)) {
covbatches[[i]][lower.tri(covbatches[[i]])] <- 0
covbatches[[i]][upper.tri(covbatches[[i]])] <- 0
}


# Save simulation parameters:

save(alphag, betag, gammajg, covbatches, scendesign, file="./FAbatchPaper/InterimResults/SimulationParametersNoCor.Rda")
