# AgeDichotomTranscr
#####################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-36194"


# Download processed Data:

# Folder the ArrayExpress files of this data set are downloaded to:
aefolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = aefolder, type = "processed")


# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(); if that call
# fails, the tab-separated file is parsed by hand instead.

sdrffile <- paste0(aefolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

}



# The signal turned out to be too strong and the (measured) expressions
# too disparate between men and women:

table(datainfo[["Characteristics[organism part]"]], datainfo[["FactorValue [SEX]"]])

# --> Take only frontal cortex measurements taken from men and use dichotomized age as target variable:

isfrontalcortex <- datainfo[["Characteristics[organism part]"]]=="frontal cortex"
ismale <- datainfo[["FactorValue [SEX]"]]=="male"
subsetind <- which(isfrontalcortex & ismale)

datainfo <- datainfo[subsetind,]


# Batch and target variable:

# y is 2 for samples older than the median age of the subset and 1 otherwise:
agevalues <- as.numeric(datainfo[["Characteristics[age (y)]"]])
y <- factor(as.numeric(agevalues > median(agevalues)) + 1)
table(datainfo[["Characteristics[batch]"]])
table(datainfo[["Characteristics[batch]"]], y)

# --> Many batches of smaller sizes.

# --> Many batches of smaller sizes.




# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE)

datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample name of a file = its base name without the "_sample_table.txt" suffix.
# basename() avoids using the directory path as a regular expression:
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inphenodata <- filesnames %in% datainfonames
files <- files[inphenodata]
filesnames <- filesnames[inphenodata]

# Is the ordering in the pheno data file the exact reverse of the file ordering?
all(filesnames==datainfonames[length(datainfonames):1])

# --> The ordering in the pheno data file is reversed.

# --> Bring the pheno data into the order of the files. Matching by sample
#     name yields the reversed ordering in the case checked above (unique
#     sample names), but does not depend on it: a blind reversal would
#     silently misalign phenotypes and arrays if the check were FALSE.

datainfo <- datainfo[match(filesnames, datainfonames),]



# Batch and target variable:

# Batches are recoded to consecutive numbers 1, 2, ...:
batch <- factor(as.numeric(factor(datainfo$'Characteristics[batch]')))

# y is 2 for samples older than the median age and 1 otherwise:
agevalues <- as.numeric(datainfo$'Characteristics[age (y)]')
y <- factor(as.numeric(agevalues > median(agevalues)) + 1)




# Read in the arrays:

# Every sample file is read once: the header line is dropped and the second
# tab-separated field of each remaining line is converted to numeric. Entries
# that are not numbers become NA (as.numeric() warns about the coercion).

nit <- length(files)
stopifnot(nit > 0)

arrayvalues <- vector("list", nit)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  arrayvalues[[i]] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(fields) fields[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# All arrays have to provide the same number of variables:
nvariables <- length(arrayvalues[[1]])
stopifnot(all(sapply(arrayvalues, length) == nvariables))


# Find out indices of NA-values for each array:

nalist <- lapply(arrayvalues, function(x) which(is.na(x)))

mean(sapply(nalist, length)==0)

table(sapply(nalist, length))


# Get indices of variables without missing values:

nonas <- setdiff(seq_len(nvariables), unlist(nalist))

# Proportion of variables with at least one missing value:
(nvariables - length(nonas))/nvariables

# --> Too many variables would be lost when using only
#     those with complete measurements.



# Generate the covariate matrix (one row per array, one column per variable):

datamat <- matrix(as.numeric(unlist(arrayvalues)), nrow=nit, ncol=nvariables, byrow=TRUE)


# Bring to the used format:

X <- as.matrix(datamat)


# Treatment of missing values:

# Proportion of missing values of every variable within every batch
# (rows: variables, columns: batches). The batches are taken from the levels
# of 'batch' instead of a fixed 1:15, and drop=FALSE keeps batches that
# consist of a single sample in matrix form:
batchlevels <- levels(batch)
missingpercentage <- do.call(cbind, lapply(batchlevels, function(b) colMeans(is.na(X[batch==b, , drop=FALSE]))))
maxmiss <- apply(missingpercentage, 1, max)

# --> Exclude those variables where >=50% in at least one batch
#     are missing:

X <- X[, maxmiss < 0.5, drop=FALSE]


# Set missing values to the intra-batch median of the corresponding variable:

navars <- which(colSums(is.na(X)) > 0)

for(i in seq(along=navars)) {
  for(j in batchlevels) {
    inbatch <- which(batch==j)
    # Rows of batch j in which variable navars[i] is missing:
    tofill <- inbatch[is.na(X[inbatch, navars[i]])]
    X[tofill, navars[i]] <- median(X[inbatch, navars[i]], na.rm=TRUE)
  }
}



# Save data set 'AgeDichotomTranscr'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/AgeDichotomTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
downloadedfiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloadedfiles)

# Clear the workspace:
rm(list=ls())
gc()















# AlcoholismTranscr
#####################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-44456"


# Download processed Data:

# Folder the ArrayExpress files of this data set are downloaded to:
aefolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = aefolder, type = "processed")



# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(); if that call
# fails, the tab-separated file is parsed by hand instead.

sdrffile <- paste0(aefolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

}


# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE)

datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample name of a file = its base name without the "_sample_table.txt" suffix.
# basename() avoids using the directory path as a regular expression:
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inphenodata <- filesnames %in% datainfonames
files <- files[inphenodata]
filesnames <- filesnames[inphenodata]

# Is the ordering in the pheno data file the exact reverse of the file ordering?
all(filesnames==datainfonames[length(datainfonames):1])

# --> The ordering in the pheno data file is reversed.

# --> Bring the pheno data into the order of the files. Matching by sample
#     name yields the reversed ordering in the case checked above (unique
#     sample names), but does not depend on it: a blind reversal would
#     silently misalign phenotypes and arrays if the check were FALSE.

datainfo <- datainfo[match(filesnames, datainfonames),]



# Construct the batch and the target variable:

# batch: 2 for labeling batch 2, 1 otherwise; y: 2 for "alcoholic", 1 otherwise
batch <- factor(as.numeric(datainfo$Characteristics..labeling_batch.==2)+1)
y <- factor(as.numeric(datainfo$FactorValue..PHENOTYPE.=="alcoholic")+1)




# Read in the arrays:

# Every sample file is read once: the header line is dropped and the second
# tab-separated field of each remaining line is converted to numeric. Entries
# that are not numbers become NA (as.numeric() warns about the coercion).

nit <- length(files)
stopifnot(nit > 0)

arrayvalues <- vector("list", nit)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  arrayvalues[[i]] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(fields) fields[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# All arrays have to provide the same number of variables:
nvariables <- length(arrayvalues[[1]])
stopifnot(all(sapply(arrayvalues, length) == nvariables))


# Find out indices of NA-values for each array:

nalist <- lapply(arrayvalues, function(x) which(is.na(x)))

mean(sapply(nalist, length)==0)

# --> No missing values.


# --> Indices without missing values:

nonas <- seq_len(nvariables)


# Generate the covariate matrix (one row per array, one column per variable):

datamat <- matrix(as.numeric(unlist(arrayvalues)), nrow=nit, ncol=nvariables, byrow=TRUE)


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'AlcoholismTranscr'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/AlcoholismTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
downloadedfiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloadedfiles)

# Clear the workspace:
rm(list=ls())
gc()















# AutismTranscr
################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-37772"


# Download processed Data:

# Folder the ArrayExpress files of this data set are downloaded to:
aefolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = aefolder, type = "processed")




# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(); if that call
# fails, the tab-separated file is parsed by hand instead.

sdrffile <- paste0(aefolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

}


# Batch and target variable:

table(datainfo$'Characteristics[batch]', datainfo$'Characteristics[autism trait]')

# --> 4 big batches, 1 small; class imbalance in the last two batches.




# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE)

datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample name of a file = its base name without the "_sample_table.txt" suffix.
# basename() avoids using the directory path as a regular expression:
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inphenodata <- filesnames %in% datainfonames
files <- files[inphenodata]
filesnames <- filesnames[inphenodata]

# Is the ordering in the pheno data file the exact reverse of the file ordering?
all(filesnames==datainfonames[length(datainfonames):1])

# --> The ordering in the pheno data file is reversed.

# --> Bring the pheno data into the order of the files. Matching by sample
#     name yields the reversed ordering in the case checked above (unique
#     sample names), but does not depend on it: a blind reversal would
#     silently misalign phenotypes and arrays if the check were FALSE.

datainfo <- datainfo[match(filesnames, datainfonames),]



# Batch and target variable:

# Batches are recoded to consecutive numbers 1, 2, ...; y is 2 for "autism"
# and 1 otherwise:
batch <- factor(as.numeric(factor(datainfo$'Characteristics[batch]')))
y <- factor(as.numeric(datainfo$'Characteristics[autism trait]'=="autism")+1)



# Read in the arrays:

# Every sample file is read once: the header line is dropped and the second
# tab-separated field of each remaining line is converted to numeric. Entries
# that are not numbers become NA (as.numeric() warns about the coercion).

nit <- length(files)
stopifnot(nit > 0)

arrayvalues <- vector("list", nit)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  arrayvalues[[i]] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(fields) fields[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# All arrays have to provide the same number of variables:
nvariables <- length(arrayvalues[[1]])
stopifnot(all(sapply(arrayvalues, length) == nvariables))


# Find out indices of NA-values for each array:

nalist <- lapply(arrayvalues, function(x) which(is.na(x)))

mean(sapply(nalist, length)==0)

# --> No missing values.


# --> Indices without missing values:

nonas <- seq_len(nvariables)


# Generate the covariate matrix (one row per array, one column per variable):

datamat <- matrix(as.numeric(unlist(arrayvalues)), nrow=nit, ncol=nvariables, byrow=TRUE)


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'AutismTranscr'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/AutismTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
downloadedfiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloadedfiles)

# Clear the workspace:
rm(list=ls())
gc()















# BipolardisorderMethyl
#######################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-38873"


# Download processed Data:

# Folder the ArrayExpress files of this data set are downloaded to:
aefolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = aefolder, type = "processed")




# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(); if that call
# fails, the tab-separated file is parsed by hand instead.

sdrffile <- paste0(aefolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

}



# Remove replicates (rows whose sample title contains "_rep").
# seq_len() keeps the row index empty for an empty data frame, where
# 1:nrow(datainfo) would yield c(1, 0):
replicaterows <- grep("_rep", datainfo$Comment..Sample_title.)
datainfo <- datainfo[setdiff(seq_len(nrow(datainfo)), replicaterows),]

table(datainfo$FactorValue..DISEASE.STATE., datainfo$Characteristics..batch.)



# Keep only BP patients and controls:

iscaseorcontrol <- datainfo$FactorValue..DISEASE.STATE. %in% c("BP", "Unaffected control")
datainfo <- datainfo[iscaseorcontrol,]

table(datainfo$FactorValue..DISEASE.STATE., datainfo$Characteristics..batch.)

# --> OK.



# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE)

datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample name of a file = its base name without the "_sample_table.txt" suffix.
# basename() avoids using the directory path as a regular expression:
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inphenodata <- filesnames %in% datainfonames
files <- files[inphenodata]
filesnames <- filesnames[inphenodata]

# Is the ordering in the pheno data file the exact reverse of the file ordering?
all(filesnames==datainfonames[length(datainfonames):1])

# --> The ordering in the pheno data file is reversed.

# --> Bring the pheno data into the order of the files. Matching by sample
#     name yields the reversed ordering in the case checked above (unique
#     sample names), but does not depend on it: a blind reversal would
#     silently misalign phenotypes and arrays if the check were FALSE.

datainfo <- datainfo[match(filesnames, datainfonames),]



# Construct the batch and the target variable:

# batch: 2 for batch 2, 1 otherwise; y: 2 for "BP", 1 otherwise
batch <- factor(as.numeric(datainfo$Characteristics..batch.==2)+1)
y <- factor(as.numeric(datainfo$FactorValue..DISEASE.STATE.=="BP")+1)



# Read in the arrays:

# Every sample file is read once: the header line is dropped and the second
# tab-separated field of each remaining line is converted to numeric. Entries
# that are not numbers become NA (as.numeric() warns about the coercion).

nit <- length(files)
stopifnot(nit > 0)

arrayvalues <- vector("list", nit)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  arrayvalues[[i]] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(fields) fields[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# All arrays have to provide the same number of variables:
nvariables <- length(arrayvalues[[1]])
stopifnot(all(sapply(arrayvalues, length) == nvariables))


# Find out indices of NA-values for each array:

nalist <- lapply(arrayvalues, function(x) which(is.na(x)))

mean(sapply(nalist, length)==0)

# --> Every observation has missing values.


# Generate the covariate matrix (one row per array, one column per variable):

datamat <- matrix(as.numeric(unlist(arrayvalues)), nrow=nit, ncol=nvariables, byrow=TRUE)


# Bring to the used format:

X <- as.matrix(datamat)



# Treatment of missing values:

# Distribution of the number of missing values per variable:
table(colSums(is.na(X)))

# --> Remove variables with more than five missing values
#     (drop=FALSE keeps X a matrix even if a single variable remains):

X <- X[, colSums(is.na(X)) <= 5, drop=FALSE]

# Set missing values to the intra-batch median of the corresponding variable.
# The batches are taken from the levels of 'batch' instead of a fixed 1:2:

navars <- which(colSums(is.na(X)) > 0)

for(i in seq(along=navars)) {
  for(j in levels(batch)) {
    inbatch <- which(batch==j)
    # Rows of batch j in which variable navars[i] is missing:
    tofill <- inbatch[is.na(X[inbatch, navars[i]])]
    X[tofill, navars[i]] <- median(X[inbatch, navars[i]], na.rm=TRUE)
  }
}



# Save data set 'BipolardisorderMethyl'
############################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/BipolardisorderMethyl.Rda")

# Delete the downloaded intermediate files of this data set:
downloadedfiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloadedfiles)

# Clear the workspace:
rm(list=ls())
gc()















# BreastCancerConcatenation
###########################


#######################################################################################
#######################################################################################
# This concatenation of microarray data sets studying breast cancer consists of samples from 
# ArrayExpress data sets with the following accession codes:
#
# 1) E-GEOD-27562
# 2) E-GEOD-21422
# 3) E-GEOD-22544
# 4) E-GEOD-20266
# 5) E-TABM-276
#
# To reduce the computational burden we thereby only considered genes present in
# the Affymetrix GeneChip Human Genome U133A 2.0-arrays, resulting in 22,277 variables.
#######################################################################################
#######################################################################################


#######################################################################################
# As a first step, we downloaded the data sets and put the CEL-files belonging to a 
# specific data set into single folders. We subset these data sets, so that the resulting 
# data sets would consist of samples belonging to two classes only, "breast cancer" vs. 
# "healthy". Outliers were also removed in two data sets.
# Then we subset the variables.
#######################################################################################




# Downloading of the CEL-files, removing CEL-files corresponding to observations 
# not used in the analysis and creating the target and batch variable:
################################################################################

# First Dataset:
##################

# Accession number:
datasetid <- "E-GEOD-27562"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Download RAW data from ArrayExpress:
library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load Phenodata (for older microarray datasets the 'read.AnnotatedDataFrame' throws an
# error, in which cases the 'sdrf.txt'-file has to be read in manually).
# 'arraydatafile' holds the name of the column with the CEL file names as it
# is spelled in the respective branch:

sdrffile <- paste0(newfolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

  arraydatafile <- "Array Data File"

}


# Subset phenodata to feature only relevant observations:

datainfo <- datainfo[datainfo$'Characteristics.phenotype.' %in% c("Malignant", "Normal", "Pre-Surgery (aka Malignant)"),]



# The not used CEL files are removed from the corresponding folder
# to spare memory space:

# Names of the CEL files that are kept ([[ ]] replaces eval(parse(...))):
celfiles <- datainfo[[arraydatafile]]

# Without any CEL file name in the pheno data every CEL file in the folder
# would be deleted below, so stop instead:
stopifnot(length(celfiles) > 0)

# fixed=TRUE matches the literal strings ".cel" / ".CEL"; as regular
# expressions the dot would match any character:
filesinfolder <- list.files(newfolder)
allcelfiles <- unique(c(grep(".cel", filesinfolder, value=TRUE, fixed=TRUE), grep(".CEL", filesinfolder, value=TRUE, fixed=TRUE)))

celfilestodelete <- setdiff(allcelfiles, celfiles)

length(celfilestodelete)
length(celfiles)+length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))


# Remove the zip files the CEL files were stored in to spare memory space:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip". The pattern also copes with an empty folder
# and with very short file names:
zipfileind <- grep("zip$", filesinfolder)

zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))



# Removal of outliers:

# Read in raw data:
library(affy)
cellist <- list.celfiles(path=newfolder, full.names=TRUE)

affydata <- ReadAffy(filenames=cellist)

# perform RMA normalization:
affydataprocessed <- rma(affydata)

# get expression matrix (transposed, so that the arrays are in the rows):
databatchtog <- exprs(affydataprocessed)
databatchtog <- t(databatchtog)


# Perform PCA analysis to spot outliers:

# Array names without the ".CEL" extension. sub() returns exactly one name per
# array; strsplit() with the regular expression ".CEL" could return more:
arraysdata <- sub("\\.CEL$", "", rownames(databatchtog))

xpr <- prcomp(databatchtog, scale. = FALSE)
xp <- predict(xpr)[,1:2]
# plot(xp)

# Outliers: arrays with a first principal component score below -100.
# NOTE(review): the sign of a principal component is arbitrary, so this
# one-sided threshold should be confirmed on the plot above.
outlierind <- which(xp[,1] < -100)
arraysdata[outlierind]

# --> Remove outliers:

celfilestodelete <- paste(arraysdata[outlierind], ".CEL", sep="")


# Delete not used CEL-files:

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))

arraysdata <- setdiff(arraysdata, arraysdata[outlierind])

datainfo <- datainfo[datainfo$'Array.Data.File' %in% paste(arraysdata, ".CEL", sep=""),]



# Generate target variable (1 for phenotype "Normal", 2 otherwise),
# named by CEL file:

isnotnormal <- datainfo$'Characteristics.phenotype.'!="Normal"
y <- factor(ifelse(isnotnormal, 2, 1))
names(y) <- datainfo$'Array.Data.File'

# batch variable (constant within this data set):

batchid <- 1

batch <- rep(batchid, times=length(y))
names(batch) <- datainfo$'Array.Data.File'

# Save target and batch variable:

save(y, batch, file=paste0(newfolder, "/", datasetid, "metadata.Rda"))


# Second Dataset:
##################

# NOTE: See first dataset above for commentation. 

datasetid <- "E-GEOD-21422"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Phenodata: read with read.AnnotatedDataFrame() or, if that fails, parsed by
# hand. 'arraydatafile' holds the name of the column with the CEL file names
# as it is spelled in the respective branch:

sdrffile <- paste0(newfolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

  arraydatafile <- "Array Data File"

}


 
# Keep only the samples with source name "healthy" or "tumor":
datainfo <- datainfo[datainfo$'Comment..Sample_source_name.' %in% c("healthy", "tumor"),]


# Remove the CEL files that are not used:

# Names of the CEL files that are kept ([[ ]] replaces eval(parse(...))):
celfiles <- datainfo[[arraydatafile]]

# Without any CEL file name in the pheno data every CEL file in the folder
# would be deleted below, so stop instead:
stopifnot(length(celfiles) > 0)

# fixed=TRUE matches the literal strings ".cel" / ".CEL"; as regular
# expressions the dot would match any character:
filesinfolder <- list.files(newfolder)
allcelfiles <- unique(c(grep(".cel", filesinfolder, value=TRUE, fixed=TRUE), grep(".CEL", filesinfolder, value=TRUE, fixed=TRUE)))

celfilestodelete <- setdiff(allcelfiles, celfiles)

length(celfilestodelete)
length(celfiles)+length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))


# Remove the zip files:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip". The pattern also copes with an empty folder
# and with very short file names:
zipfileind <- grep("zip$", filesinfolder)

zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))


# Target variable (2 for source name "tumor", 1 otherwise), named by CEL file:
istumor <- datainfo$'Comment..Sample_source_name.'=="tumor"
y <- factor(ifelse(istumor, 2, 1))
names(y) <- datainfo$'Array.Data.File'

# Batch variable (constant within this data set):
batchid <- 2

batch <- rep(batchid, times=length(y))
names(batch) <- datainfo$'Array.Data.File'

# Save target and batch variable:
save(y, batch, file=paste0(newfolder, "/", datasetid, "metadata.Rda"))



# Third Dataset:
##################
 
# NOTE: See first dataset above for commentation. 

datasetid <- "E-GEOD-22544"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Phenodata: read with read.AnnotatedDataFrame() or, if that fails, parsed by
# hand. 'arraydatafile' holds the name of the column with the CEL file names
# as it is spelled in the respective branch:

sdrffile <- paste0(newfolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

  arraydatafile <- "Array Data File"

}


# Remove the zip files from the data set folder:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip". The pattern also copes with an empty folder
# and with very short file names:
zipfileind <- grep("zip$", filesinfolder)

zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))



# Read in and RMA-normalize the raw data of this data set:
library(affy)
cellist <- list.celfiles(path=newfolder, full.names=TRUE)

affydata <- ReadAffy(filenames=cellist)

affydataprocessed <- rma(affydata)

# Expression matrix, transposed so that the arrays are in the rows:
databatchtog <- exprs(affydataprocessed)
databatchtog <- t(databatchtog)


# Array names without the ".CEL" extension. sub() returns exactly one name per
# array; strsplit() with the regular expression ".CEL" could return more:
arraysdata <- sub("\\.CEL$", "", rownames(databatchtog))


xpr <- prcomp(databatchtog, scale. = FALSE)
xp <- predict(xpr)[,1:2]

# Outliers: arrays with a first principal component score below -200.
# NOTE(review): the sign of a principal component is arbitrary, so this
# one-sided threshold should be confirmed on a plot of xp.
outlierind <- which(xp[,1] < -200)
arraysdata[outlierind]


# Delete the CEL files of the outliers:
celfilestodelete <- paste(arraysdata[outlierind], ".CEL", sep="")

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))

arraysdata <- setdiff(arraysdata, arraysdata[outlierind])


# Keep only the pheno data of the remaining arrays:
datainfo <- datainfo[datainfo$'Array.Data.File' %in% paste(arraysdata, ".CEL", sep=""),]
 
# Target variable (2 for disease state "breast cancer", 1 otherwise),
# named by CEL file:
isbreastcancer <- datainfo$'Characteristics..disease.state.'=="breast cancer"
y <- factor(ifelse(isbreastcancer, 2, 1))
names(y) <- datainfo$'Array.Data.File'

# Batch variable (constant within this data set):
batchid <- 3

batch <- rep(batchid, times=length(y))
names(batch) <- datainfo$'Array.Data.File'

# Save target and batch variable:
save(y, batch, file=paste0(newfolder, "/", datasetid, "metadata.Rda"))
 

 
# Fourth Dataset:
##################
 
# NOTE: See first dataset above for commentation. 
 
datasetid <- "E-GEOD-20266"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Phenodata: read with read.AnnotatedDataFrame() or, if that fails, parsed by
# hand. 'arraydatafile' holds the name of the column with the CEL file names
# as it is spelled in the respective branch:

sdrffile <- paste0(newfolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

  arraydatafile <- "Array Data File"

}


# Remove the zip files from the data set folder:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip". The pattern also copes with an empty folder
# and with very short file names:
zipfileind <- grep("zip$", filesinfolder)

zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))


# Target variable (2 for disease status "Breast Cancer", 1 otherwise),
# named by CEL file:
isbreastcancer <- datainfo$'Characteristics [disease status]'=="Breast Cancer"
y <- factor(ifelse(isbreastcancer, 2, 1))
names(y) <- datainfo$'Array Data File'

# Batch variable (constant within this data set):
batchid <- 4

batch <- rep(batchid, times=length(y))
names(batch) <- datainfo$'Array Data File'

# Save target and batch variable:
save(y, batch, file=paste0(newfolder, "/", datasetid, "metadata.Rda"))


# Fifth Dataset:
##################
 
# NOTE: See first dataset above for commentation. 

datasetid <- "E-TABM-276"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Phenodata: read with read.AnnotatedDataFrame() or, if that fails, parsed by
# hand. 'arraydatafile' holds the name of the column with the CEL file names
# as it is spelled in the respective branch:

sdrffile <- paste0(newfolder, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() yields a single TRUE/FALSE even for objects with several classes:
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Number of fields per line (only shown when this line is run interactively):
  table(sapply(sdrffields, length))

  # The first line holds the column names:
  ncolumns <- length(sdrffields[[1]])
  datainfo <- data.frame(matrix(nrow=0, ncol=ncolumns))
  names(datainfo) <- sdrffields[[1]]

  # One row per remaining line. seq_along(adf)[-1] is empty for a header-only
  # file, whereas 1:(length(adf)-1) would run over c(1, 0):
  for(i in seq_along(adf)[-1]) {
    rowfields <- sdrffields[[i]]
    # strsplit() drops an empty last field, so short rows are padded with "":
    if(length(rowfields) < ncolumns)
      rowfields <- c(rowfields, rep("", ncolumns - length(rowfields)))
    datainfo[i-1,] <- rowfields
  }
  rownames(datainfo) <- datainfo$'Source Name'

  arraydatafile <- "Array Data File"

}
 

# From every individual with histology "Invasive Ductal Carcinoma" exactly one
# sample is kept (drawn at random if there are several); all samples with
# histology "not applicable" are kept as well.

individual <- datainfo$'Factor Value [Individual]'
histology <- datainfo$'Factor Value [Histology]'
iscarcinoma <- histology=="Invasive Ductal Carcinoma"

patientids <- names(table(individual[iscarcinoma]))

indssub <- 0

# Fixed seed, so that the random selection is reproducible:
set.seed(1234)

for(i in seq(along=patientids)) {

  # Carcinoma samples of the i-th individual:
  indstemp <- which((individual==patientids[i]) & iscarcinoma)

  indssub[i] <- if(length(indstemp) > 1) sample(indstemp, size=1) else indstemp

}

allsubinds <- sort(c(which(histology=="not applicable"), indssub))

datainfo <- datainfo[allsubinds,]
 
 
 
# Remove the CEL files that are not used:

# Names of the CEL files that are kept ([[ ]] replaces eval(parse(...))):
celfiles <- datainfo[[arraydatafile]]

# Without any CEL file name in the pheno data every CEL file in the folder
# would be deleted below, so stop instead:
stopifnot(length(celfiles) > 0)

# fixed=TRUE matches the literal strings ".cel" / ".CEL"; as regular
# expressions the dot would match any character:
filesinfolder <- list.files(newfolder)
allcelfiles <- unique(c(grep(".cel", filesinfolder, value=TRUE, fixed=TRUE), grep(".CEL", filesinfolder, value=TRUE, fixed=TRUE)))

celfilestodelete <- setdiff(allcelfiles, celfiles)

length(celfilestodelete)
length(celfiles)+length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))


# Remove the zip files:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip". The pattern also copes with an empty folder
# and with very short file names:
zipfileind <- grep("zip$", filesinfolder)

zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))
 
 
 
# Target variable (2 for histology "Invasive Ductal Carcinoma", 1 otherwise),
# named by CEL file:
iscarcinoma <- datainfo$'Factor Value [Histology]'=="Invasive Ductal Carcinoma"
y <- factor(ifelse(iscarcinoma, 2, 1))
names(y) <- datainfo$'Array Data File'

# Batch variable (constant within this data set):
batchid <- 5

batch <- rep(batchid, times=length(y))
names(batch) <- datainfo$'Array Data File'

# Save target and batch variable:
save(y, batch, file=paste0(newfolder, "/", datasetid, "metadata.Rda"))
 

 
# Read in all CEL-files and normalize them together.
# Then create combined target and batch variable:
######################################################

# Read in the CEL files of all five data sets and normalize them jointly:

datasetids <- c("E-GEOD-27562", "E-GEOD-21422", "E-GEOD-22544", "E-GEOD-20266", "E-TABM-276")

newfolders <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetids)


library("affy")

# Full paths of the CEL files found in the five folders:
cellist <- list.celfiles(path=newfolders, full.names=TRUE)

affydata <- ReadAffy(filenames=cellist)

# RMA normalization of all arrays together:
affydataprocessed <- rma(affydata)

# Expression matrix, transposed directly after extraction:
databatchtog <- t(exprs(affydataprocessed))


# Create target and batch variable:

# load() restores the objects 'y' and 'batch' that were saved for each
# data set. The pieces are collected in lists and combined once at the end
# instead of growing the vectors inside the loop.
ylist <- vector("list", length(newfolders))
batchlist <- vector("list", length(newfolders))
for(i in seq_along(newfolders)) {
  load(paste(newfolders[i], "/", datasetids[i], "metadata.Rda", sep=""))
  # Carry over the class labels of 'y' rather than the internal integer
  # codes of the factor: the codes differ from the labels if a data set
  # contains one class only. The sample names are kept.
  ylist[[i]] <- setNames(as.character(y), names(y))
  batchlist[[i]] <- batch
}
yall <- factor(unlist(ylist))
batchall <- factor(unlist(batchlist))


# Reorder target and batch variable according to the combined
# dataset (arrays are matched by the row names of 'databatchtog'):

yind <- match(rownames(databatchtog), names(yall))
batchind <- match(rownames(databatchtog), names(batchall))

# Arrays without meta data would silently receive missing class or batch
# labels, so stop if there are any:
if(any(is.na(yind)) || any(is.na(batchind)))
  stop("No target/batch information for: ", paste(rownames(databatchtog)[is.na(yind) | is.na(batchind)], collapse=", "))

y <- yall[yind]
batch <- batchall[batchind]


# Reorder data matrix, target and batch variable to obtain the
# right order of the batches:

batchorder <- order(batch)

y <- y[batchorder]
batch <- batch[batchorder]
# drop=FALSE keeps 'databatchtog' a matrix in every case:
databatchtog <- databatchtog[batchorder, , drop=FALSE]



# Subset the genes to feature only those present on an HGU133A-array:
######################################################################

# Read in two HGU133A-datasets and check whether the same genes
# are present:

library("GEOquery")

# download.file() does not create missing folders, so make sure that the
# destination folders exist (nothing happens if they already do). The
# archives are binary files, hence mode="wb".

destfile1 <- "./FAbatchPaper/Datasets/DownloadedIntermediateData/GSE36487/GSE36487_series_matrix.txt.gz"
dir.create(dirname(destfile1), recursive=TRUE, showWarnings=FALSE)
download.file(url="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE36nnn/GSE36487/matrix/GSE36487_series_matrix.txt.gz", destfile=destfile1, mode="wb")

# Read in expression data:
X1 <- getGEO(file=destfile1)

# p x n - expression matrix:
Xdata1 <- exprs(X1)

genenames1 <- rownames(Xdata1)



destfile2 <- "./FAbatchPaper/Datasets/DownloadedIntermediateData/GSE30884/GSE30884_series_matrix.txt.gz"
dir.create(dirname(destfile2), recursive=TRUE, showWarnings=FALSE)
download.file(url="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE30nnn/GSE30884/matrix/GSE30884_series_matrix.txt.gz", destfile=destfile2, mode="wb")

# Read in expression data:
X2 <- getGEO(file=destfile2)

# p x n - expression matrix:
Xdata2 <- exprs(X2)

genenames2 <- rownames(Xdata2)


# Compare the names; the lengths are checked first, because `==` would
# recycle the shorter vector:
length(genenames1)==length(genenames2) && all(genenames1==genenames2)

# --> Names are the same.



# Check whether every gene name of the HGU133A-datasets occurs among the
# column names of 'databatchtog':

namesdatabatchtog <- colnames(databatchtog)
refpresent <- genenames1 %in% namesdatabatchtog
all(refpresent)

sum(!refpresent)

# --> This is not the case. However, looking at the dimension
# of the HGU133A-datasets
dim(Xdata2)
# reveals that there are 6 more "variables" than there should be.
# Obviously these 6 additional "variables" are not specific to
# standard Affymetrix GeneChip Human Genome U133 Plus 2.0-data sets,
# which is why they are not included in 'databatchtog' either.


# --> Create the subsetted data set from the columns whose names occur
# in the HGU133A-datasets:

keepcols <- namesdatabatchtog %in% genenames1
X <- databatchtog[,keepcols]


# Strip the row and column names of the data matrix as well as the
# names of the target and batch variable:

rownames(X) <- NULL
colnames(X) <- NULL

y <- unname(y)
batch <- unname(batch)


# Save combined dataset:
#########################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/BreastCancerConcatenation.Rda")


# Remove all temporarily stored files:
#######################################

datasetids <- c("E-GEOD-27562", "E-GEOD-21422", "E-GEOD-22544", "E-GEOD-20266", "E-TABM-276", "GSE30884", "GSE36487")

newfolders <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetids)

# Delete every file contained in each of the folders:
for(i in seq_along(newfolders)) {
  file.remove(list.files(newfolders[i], full.names=TRUE))
}


# Clear workspace:
rm(list=ls()); gc()















# BreastcTranscr
#################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-44281"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), type = "processed")


# Meta data (read from the SDRF file of the data set):

adf <- read.AnnotatedDataFrame(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", datasetid, ".sdrf.txt"))
datainfo <- adf@data


# Drop the first nine samples: they do not belong to the 410 samples
# used in the analysis performed by the authors who generated the
# dataset.

datainfo <- datainfo[-seq_len(9),]




# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Construct the batch and the target variable (y: 2 = "case", 1 otherwise):

batch <- factor(datainfo$Characteristics..batch.)
y <- factor(as.numeric(datainfo$Characteristics..status.=="case")+1)



# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.
# All variables are used for this data set (no random subset is drawn).

# Find out indices of NA-values for each array:

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any missing value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.


# --> Indices without missing values:

sampletemp <- readLines(files[1])[-1]
nonas <- seq_along(sampletemp)


# Generate the covariate matrix (one row per array):

datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)




# Save data set 'BreastcTranscr'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/BreastcTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# ColonGastricEsophagealcSNPArray
##################################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-36458"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(). If that
# fails, the tab-separated file is parsed by hand: its first line holds the
# column names, every further line describes one sample.

sdrffile <- paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every line has to provide one value per column:
  if(any(vapply(sdrfrows, length, integer(1))!=length(sdrfheader)))
    stop("Lines with differing numbers of fields in ", sdrffile)

  # Build the data frame in one step instead of appending row by row; this
  # also works for a file that contains the header line only.
  datainfo <- as.data.frame(matrix(as.character(unlist(sdrfrows)), ncol=length(sdrfheader), byrow=TRUE), stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}



# Frequencies of the cancer types within the different batches:

tabcancer <- table(datainfo$'Characteristics [batch]', datainfo$'Characteristics [cancer type]')
tabcancer

# --> Exclude batches that consist of a single cancer type only, i.e. keep
# the rows of the table with more than one non-zero count:
tabcancercand <- tabcancer[rowSums(tabcancer!=0)>1,]
tabcancercand

# --> Use cancer types 'gastric' and 'non-malignant'
# and batches 'COXES', 'GHATS' and 'SULKY':

# Keep a copy of the complete meta data:
datainfoall <- datainfo

inbatches <- datainfo$'Characteristics [batch]' %in% c("COXES", "GHATS", "SULKY")
intypes <- datainfo$'Characteristics [cancer type]' %in% c("gastric", "non-malignant")
datainfo <- datainfo[inbatches & intypes,]

table(datainfo$'Characteristics [batch]', datainfo$'Characteristics [cancer type]')





# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Batch and target variable (y: 2 = "gastric", 1 otherwise):

batch <- factor(as.numeric(factor(datainfo$'Characteristics [batch]')))
y <- factor(as.numeric(datainfo$'Characteristics [cancer type]'=="gastric")+1)



# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.
# Further below 50,000 of the variables are chosen at random.

# Find out indices of NA-values for each array:

head(readLines(files[1]))

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}

table(sapply(nalist, length))

# --> Are the same variables missing in the blocks of observations
# with same numbers of missing values?:

nacounts <- vapply(nalist, length, integer(1))
for(nacount in sort(unique(nacounts))) {
  # Arrays sharing this number of missing values, one row per array:
  nalisttemp <- nalist[nacounts==nacount]
  tempmat <- matrix(nrow=length(nalisttemp), ncol=nacount)
  for(j in seq_along(nalisttemp)) {
    tempmat[j,] <- nalisttemp[[j]]
  }
  # TRUE if all these arrays have their missing values at the same indices:
  cat(all(apply(tempmat, 2, function(x) length(unique(x))==1)), "\n")
}

# --> Yes there are blocks of observations with missing values in the
#     same variables.

# The latter blocks correspond to the batches:

table(sapply(nalist, length), batch)


# --> Indices of the variables that are not missing in any array, i.e. all
# indices except those recorded in 'nalist':

nonas <- setdiff(1:length(sampletemp), unlist(nalist, use.names=FALSE))



# Randomly choose 50000 of the variables without missing values:

set.seed(1234)
ranind <- sort(sample(nonas, size=50000))


# Generate the covariate matrix (one row per array, restricted to the
# chosen variables):

datamat <- matrix(nrow=length(files), ncol=50000)

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  secondfields <- vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1))
  datamat[i,] <- as.numeric(secondfields)[ranind]
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)





# Remove outliers:

# The outliers are identified by thresholds on principal component scores.
# The rows to keep are selected by positive indices: a negative index built
# from an empty 'outlierind' (no score beyond the threshold) would select
# no rows at all and thereby empty the data set. drop=FALSE keeps 'X' a
# matrix in every case.

# Principal component plot:

library("bapred")
# pcplot(x=X, batch=batch, y=y)

# --> One outlier. --> Identify and remove:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:2]

outlierind <- which(xp[,1] > 1000)
keepind <- setdiff(seq_len(nrow(X)), outlierind)

X <- X[keepind, , drop=FALSE]
batch <- batch[keepind]
y <- y[keepind]

# pcplot(x=X, batch=batch, y=y)

# --> Another, less severe outlier is seen.
# --> Identify and remove:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:2]

outlierind <- which(xp[,2] < -300)
keepind <- setdiff(seq_len(nrow(X)), outlierind)

X <- X[keepind, , drop=FALSE]
batch <- batch[keepind]
y <- y[keepind]

# --> OK.


# Look at further PCs:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

# --> Another outlier is found. --> Remove:

outlierind <- which(xp[,3] > 200)
keepind <- setdiff(seq_len(nrow(X)), outlierind)

X <- X[keepind, , drop=FALSE]
batch <- batch[keepind]
y <- y[keepind]


# Look at further PCs:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

# --> No severe outlier anymore.




# Save data set 'ColonGastricEsophagealcSNPArray'
#################################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/ColonGastricEsophagealcSNPArray.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# EthnicityMethyl
##################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-39672"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(). If that
# fails, the tab-separated file is parsed by hand: its first line holds the
# column names, every further line describes one sample.

sdrffile <- paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every line has to provide one value per column:
  if(any(vapply(sdrfrows, length, integer(1))!=length(sdrfheader)))
    stop("Lines with differing numbers of fields in ", sdrffile)

  # Build the data frame in one step instead of appending row by row; this
  # also works for a file that contains the header line only.
  datainfo <- as.data.frame(matrix(as.character(unlist(sdrfrows)), ncol=length(sdrfheader), byrow=TRUE), stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}




# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Construct the batch and the target variable (y: 2 = "CEU", 1 otherwise):

batch <- factor(datainfo$FactorValue..HYBRIDIZATION.BATCH.)
y <- factor(as.numeric(datainfo$Characteristics..panel.=="CEU")+1)



# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.
# 50,000 of the variables without missing values are chosen at random.

# Find out indices of NA-values for each array:

head(readLines(files[1]))

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Get indices of variables without missing values in any array:

nonas <- setdiff(seq_along(sampletemp), unlist(nalist, use.names=FALSE))


# Randomly choose 50000 of the variables without missing values:

set.seed(1234)
ranind <- sort(sample(nonas, size=50000))


# Generate the covariate matrix (one row per array, restricted to the
# chosen variables):

datamat <- matrix(nrow=length(files), ncol=50000)

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))[ranind]
  cat(paste("Sample", i, "of", nit), "\n")
}



# Bring to the used format:

X <- as.matrix(datamat)



# Save data set 'EthnicityMethyl'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/EthnicityMethyl.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# IBSTranscr
#############


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-36701"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(). If that
# fails, the tab-separated file is parsed by hand: its first line holds the
# column names, every further line describes one sample.

sdrffile <- paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every line has to provide one value per column:
  if(any(vapply(sdrfrows, length, integer(1))!=length(sdrfheader)))
    stop("Lines with differing numbers of fields in ", sdrffile)

  # Build the data frame in one step instead of appending row by row; this
  # also works for a file that contains the header line only.
  datainfo <- as.data.frame(matrix(as.character(unlist(sdrfrows)), ncol=length(sdrfheader), byrow=TRUE), stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}




table(datainfo$'Characteristics[batch]', datainfo$'Characteristics[disease]')

# --> Exclude batches "2_A" and "2_B", since these are follow-ups for
#     patients in the other batches:

isfollowup <- datainfo$'Characteristics[batch]' %in% c("2_A", "2_B")
datainfo <- datainfo[!isfollowup,]


# For the majority of patients there are two arrays:

subjcounts <- table(datainfo$'Characteristics[subject identifier]')
subjcounts

# Look at one example and show the columns in which its rows differ:
datainfo2 <- datainfo[datainfo$'Characteristics[subject identifier]'=="SD52930",]

differingcols <- apply(datainfo2, 2, function(x) length(unique(x))!=1)
datainfo2[,differingcols]

# --> Only the names are different, therefore one can assume that
#     the patients were simply measured twice.


# Look at the samples which were only measured once (the columns shown are
# the ones that differed for the example subject above):

onetimenames <- names(subjcounts)[subjcounts==1]

datainfo1 <- datainfo[datainfo$'Characteristics[subject identifier]' %in% onetimenames,]

datainfo1[,differingcols]

# --> No peculiarities could be made out. --> Only use the data of one array
#     per patient:

# First include all arrays which belong to patients only measured once:
indchoose <- which(datainfo$'Characteristics[subject identifier]' %in% onetimenames)

# Identifiers of subjects measured two times:
# NOTE(review): subjects with more than two arrays, if there are any, are
# neither in 'onetimenames' nor in 'twotimenames' and are thus left out -
# confirm that this is intended.
twotimenames <- names(subjcounts)[subjcounts==2]

# Get the indices in the data set for each of these subjects:
subjectinds <- lapply(twotimenames, function(id) which(datainfo$'Characteristics[subject identifier]' %in% id))

# For consistency we always choose the array with the smaller number.
# --> Extract numbers from the array names: the part of the row name before
#     " 1" is split at "GSM" and the piece after "GSM" is converted.
arraynumber <- sapply(rownames(datainfo), function(rowname) {
  beforesuffix <- strsplit(rowname, split=" 1")[[1]]
  as.numeric(strsplit(beforesuffix, split="GSM")[[1]][2])
})

indslower <- unlist(lapply(subjectinds, function(inds) inds[which.min(arraynumber[inds])]))

indchoose <- sort(c(indchoose, indslower))

# Subset datainfo:

datainfo <- datainfo[indchoose,]


# -->

table(datainfo$'Characteristics[batch]', datainfo$'Characteristics[disease]')




# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]





# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.

# Find out indices of NA-values for each array:

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any missing value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.


# --> Indices without missing values:

sampletemp <- readLines(files[1])[-1]
nonas <- seq_along(sampletemp)


# Generate the covariate matrix (one row per array):

datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)




# Remove outliers:

# The outliers are identified by thresholds on principal component scores.
# The rows to keep are selected by positive indices: a negative index built
# from an empty 'outlierind' (no score beyond the threshold) would select
# no rows at all and thereby empty the data set. drop=FALSE keeps 'X' a
# matrix in every case. 'datainfo' is subsetted in the same way as 'X'.

# Principal component plot:

library("bapred")
# pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

# --> There are obviously outliers.

# --> Identify and remove outliers:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

outlierind <- which(xp[,2] > 10000)
keepind <- setdiff(seq_len(nrow(X)), outlierind)

X <- X[keepind, , drop=FALSE]
datainfo <- datainfo[keepind,]


# Further outliers:

# pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]

outlierind <- which(xp[,1] > 10000)
keepind <- setdiff(seq_len(nrow(X)), outlierind)

X <- X[keepind, , drop=FALSE]
datainfo <- datainfo[keepind,]


# No further outliers:

# pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

# --> OK.



# Batch and target variable (y: 1 = "HV", 2 = every other disease value):

batch <- factor(as.numeric(factor(datainfo$'Characteristics[batch]')))
y <- factor(as.numeric(datainfo$'Characteristics[disease]'!="HV")+1)



# Save data set 'IBSTranscr'
############################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/IBSTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# IUGRTranscr
##############


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-35574"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")




# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(). If that
# fails, the tab-separated file is parsed by hand: its first line holds the
# column names, every further line describes one sample.

sdrffile <- paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every line has to provide one value per column:
  if(any(vapply(sdrfrows, length, integer(1))!=length(sdrfheader)))
    stop("Lines with differing numbers of fields in ", sdrffile)

  # Build the data frame in one step instead of appending row by row; this
  # also works for a file that contains the header line only.
  datainfo <- as.data.frame(matrix(as.character(unlist(sdrfrows)), ncol=length(sdrfheader), byrow=TRUE), stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}



# Leave out replicates, i.e. samples whose title matches ".rep".
# NOTE(review): the dot in ".rep" is a regular expression wildcard, so any
# character followed by "rep" matches - confirm that this is intended.
# seq_len() is used so that an empty 'datainfo' stays empty.
repinds <- grep(".rep", datainfo$'Comment [Sample_title]')
datainfo <- datainfo[setdiff(seq_len(nrow(datainfo)), repinds),]

# Leave out "PE" samples, since we have to form a binary variable and
# "PE" was seen to be too disparate from "IUGR":
datainfo <- datainfo[datainfo$'Characteristics[classification]'!="PE",]




# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]




# Batch and target variable (y: 2 = "IUGR", 1 otherwise):

batch <- factor(as.numeric(factor(datainfo$'Characteristics[batch]')))
y <- factor(as.numeric(datainfo$'Characteristics[classification]'=="IUGR")+1)



# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.

# Find out indices of NA-values for each array:

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any missing value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.

# --> Indices without missing values:

sampletemp <- readLines(files[1])[-1]
nonas <- seq_along(sampletemp)


# Generate the covariate matrix (one row per array):

datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)



# Save data set 'IUGRTranscr'
##################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/IUGRTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# PostpartumDepressionMethyl
#############################


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-44132"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

# The SDRF file is first read with read.AnnotatedDataFrame(). If that
# fails, the tab-separated file is parsed by hand: its first line holds the
# column names, every further line describes one sample.

sdrffile <- paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every line has to provide one value per column:
  if(any(vapply(sdrfrows, length, integer(1))!=length(sdrfheader)))
    stop("Lines with differing numbers of fields in ", sdrffile)

  # Build the data frame in one step instead of appending row by row; this
  # also works for a file that contains the header line only.
  datainfo <- as.data.frame(matrix(as.character(unlist(sdrfrows)), ncol=length(sdrfheader), byrow=TRUE), stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}



# NOTE(review): the column names used below contain dots instead of
# brackets and blanks; the hand-parsed fallback above keeps the header text
# unchanged, so these columns presumably exist only if
# read.AnnotatedDataFrame() succeeded - confirm.

# Leave out Cross Batch Controls, since they seem to be replicates:
datainfo <- datainfo[datainfo$Characteristics..cross.batch.control.=="no",]

# Batch sizes:
table(datainfo$Characteristics..array.batch.)

# Postpartum depression status by batch:
table(datainfo$Characteristics..postpartum.depression., datainfo$Characteristics..array.batch.)

# --> Small batches, however not strongly batch-imbalanced.




# Check whether the samples are ordered in the same way in the pheno data
# as in the sample table files that are read in below.

# Note that the working directory is switched to the home directory here;
# relative paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used rather than
# splitting each path at the folder name, because strsplit() interprets its
# 'split' argument as a regular expression and the home directory may
# contain characters that are special there.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are present in the pheno data:
inpheno <- filesnames %in% datainfonames
files <- files[inpheno]
files.names2 <- files.names2[inpheno]
filesnames <- filesnames[inpheno]

# The rows of the pheno data are reversed below, which is only correct if
# they list the samples in exactly the reverse order of the files. Stop
# otherwise instead of silently assigning the meta data to wrong arrays:
stopifnot(length(filesnames)==length(datainfonames),
  all(filesnames==rev(datainfonames)))

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Construct the batch and the target variable (y: 2 = "yes", 1 otherwise):

batch <- factor(datainfo$Characteristics..array.batch.)
y <- factor(as.numeric(datainfo$Characteristics..postpartum.depression.=="yes")+1)



# Read in the arrays:

# Every sample table file is read without its first line; the second
# tab-separated field of each remaining line is taken as the measurement.
# 50,000 of the variables are chosen at random.

# Find out indices of NA-values for each array:

head(readLines(files[1]))

nalist <- vector("list", length(files))

nit <- length(files)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Extract the second field of all lines at once and convert it in one
  # step; the result carries no names, so the file lines are not stored
  # along with the indices.
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))
  nalist[[i]] <- which(is.na(valuestemp))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Proportion of arrays without any missing value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.


# --> Indices without missing values:

sampletemp <- readLines(files[1])[-1]
nonas <- seq_along(sampletemp)


# Randomly choose 50000 of the variables without missing values:

set.seed(1234)
ranind <- sort(sample(nonas, size=50000))


# Generate the covariate matrix (one row per array, restricted to the
# chosen variables):

datamat <- matrix(nrow=length(files), ncol=50000)

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))[ranind]
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'PostpartumDepressionMethyl'
############################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/PostpartumDepressionMethyl.Rda")

# Delete the downloaded intermediate files of this data set:
file.remove(list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE))

# Clear the workspace:
rm(list=ls()); gc()















# pSSTranscr
#############


rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-40611"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", datasetid, ".sdrf.txt")

# First try to read the SDRF file as an annotated data frame.
adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() is used instead of comparing class(adf) with "!=", because
# class() may return more than one element.
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: parse the tab-separated file manually.
  adf <- readLines(sdrffile)
  # Distribution of the number of fields per line (diagnostic; its value is
  # not printed when the whole 'if' statement is evaluated at once).
  table(sapply(adf, function(x) length(strsplit(x, split="\t")[[1]])))

  # The first line holds the column names.
  header <- strsplit(adf[1], split="\t")[[1]]
  datainfo <- data.frame(matrix(nrow=0, ncol=length(header)))
  names(datainfo) <- header

  # One row per remaining line; seq_len() avoids iterating over c(1, 0)
  # when the file consists of the header line only.
  for(i in seq_len(max(length(adf)-1, 0)))
    datainfo[i,] <- strsplit(adf[i+1], split="\t")[[1]]
  rownames(datainfo) <- datainfo$'Source Name'

}



# Batch and target variable:

# Cross-tabulate the blocks against the recorded disease states.
table(datainfo$`Characteristics [block]`, datainfo$`Characteristics [disease state]`)

# Merge the two disease states to one class:
# (0 = "normal", 1 = any other disease state)
table(datainfo$`Characteristics [block]`, as.numeric(!(datainfo$`Characteristics [disease state]`=="normal")))

# --> OK.



# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

# Sample table files of this data set (literal match on the file name suffix).
files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE, fixed=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() and a fixed-string
# sub() are used instead of splitting on the directory path, because the
# path would otherwise be interpreted as a regular expression.
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)


# Keep only the files of samples that occur in the pheno data:
keep <- filesnames %in% datainfonames
files <- files[keep]
filesnames <- filesnames[keep]


# Is the ordering in the pheno data the exact reverse of that of the files?
isreversed <- length(filesnames)==length(datainfonames) && all(filesnames==rev(datainfonames))
isreversed

# --> The ordering in the pheno data file is reversed.

# The reversal below is only valid under this condition; stop rather than
# silently assigning batch and target values to the wrong arrays.
stopifnot(isreversed)

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Batch and target variable:

# Batch: the block labels recoded as integer codes.
batch <- factor(as.numeric(factor(datainfo$'Characteristics [block]')))
# Target: 2 for every disease state other than "normal", 1 for "normal".
y <- factor(as.numeric(datainfo$'Characteristics [disease state]'!="normal")+1)

# '$' returns NULL for a column that does not exist, which would silently
# give zero-length factors; make sure there is one value per sample.
stopifnot(length(batch)==nrow(datainfo), length(y)==nrow(datainfo))



# Read in the arrays:

# Find out indices of NA-values for each array:

nit <- length(files)
# One list element per array, holding the positions of its NA values.
nalist <- vector("list", nit)

for(i in seq_along(files)) {
  # The first line is skipped (presumably the column header); the value is
  # taken from the second tab-separated field of every remaining line.
  sampletemp <- readLines(files[i])[-1]
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  values <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  nalist[[i]] <- which(is.na(values))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any NA value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.



# --> All variables are used. Their number is taken from the first array
# (its first line is skipped):

sampletemp <- readLines(files[1])[-1]


# Generate the covariate matrix:

# One row per array, one column per variable.
datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Every array must provide exactly one value per column of 'datamat';
  # otherwise the row assignment could recycle values silently.
  stopifnot(length(sampletemp)==ncol(datamat))
  # The value is the second tab-separated field of each line.
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  datamat[i,] <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)



# Save data set 'pSSTranscr'
##################################

# Store covariate matrix, target variable and batch variable in one file.
save(list=c("X", "y", "batch"), file="./FAbatchPaper/Datasets/ProcessedData/pSSTranscr.Rda")

# Delete the files in the download directory of this data set.
downloaded <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloaded)

# Clear the workspace:
rm(list=ls())
gc()















# SarcoidosisTranscr
#####################


# Start from an empty workspace.
rm(list=ls())
gc()

# Data set ID:

datasetid <- "E-GEOD-19314"


# Download processed Data:

library("ArrayExpress")
# The files are stored in a sub directory named after the data set ID.
getAE(datasetid, path = paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), type = "processed")



# Meta data:

sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", datasetid, ".sdrf.txt")

# First try to read the SDRF file as an annotated data frame.
adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() is used instead of comparing class(adf) with "!=", because
# class() may return more than one element.
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: parse the tab-separated file manually.
  adf <- readLines(sdrffile)
  # Distribution of the number of fields per line (diagnostic; its value is
  # not printed when the whole 'if' statement is evaluated at once).
  table(sapply(adf, function(x) length(strsplit(x, split="\t")[[1]])))

  # The first line holds the column names.
  header <- strsplit(adf[1], split="\t")[[1]]
  datainfo <- data.frame(matrix(nrow=0, ncol=length(header)))
  names(datainfo) <- header

  # One row per remaining line; seq_len() avoids iterating over c(1, 0)
  # when the file consists of the header line only.
  for(i in seq_len(max(length(adf)-1, 0)))
    datainfo[i,] <- strsplit(adf[i+1], split="\t")[[1]]
  rownames(datainfo) <- datainfo$'Source Name'

}



# Include only samples of class 'sarcoidosis' and of class 'normal':

# Rows with any other value in the disease column are dropped.
datainfo <- datainfo[is.element(datainfo$`Characteristics[disease]`, c("sarcoidosis", "normal")),]



# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

# Sample table files of this data set (literal match on the file name suffix).
files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE, fixed=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() and a fixed-string
# sub() are used instead of splitting on the directory path, because the
# path would otherwise be interpreted as a regular expression.
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)


# Keep only the files of samples that occur in the pheno data:
keep <- filesnames %in% datainfonames
files <- files[keep]
filesnames <- filesnames[keep]


# Is the ordering in the pheno data the exact reverse of that of the files?
length(filesnames)==length(datainfonames) && all(filesnames==rev(datainfonames))

# --> The ordering in the pheno data file is not reversed.


# Reorder 'datainfo' so that its rows follow the order of 'files':

# The sample names have to be unique for an unambiguous assignment.
stopifnot(anyDuplicated(datainfonames)==0)
# Position of each file's sample in the pheno data.
reorderind <- match(filesnames, datainfonames)
datainfo <- datainfo[reorderind,]



# Batch and target variable:

# Batch: the block labels recoded as integer codes.
batch <- factor(as.numeric(factor(datainfo$'FactorValue[block]')))
# Target: 2 for "sarcoidosis", 1 otherwise.
y <- factor(as.numeric(datainfo$'Characteristics[disease]'=="sarcoidosis")+1)

# '$' returns NULL for a column that does not exist, which would silently
# give zero-length factors; make sure there is one value per sample.
stopifnot(length(batch)==nrow(datainfo), length(y)==nrow(datainfo))




# Read in the arrays:

# Find out indices of NA-values for each array:

nit <- length(files)
# One list element per array, holding the positions of its NA values.
nalist <- vector("list", nit)

for(i in seq_along(files)) {
  # The first line is skipped (presumably the column header); the value is
  # taken from the second tab-separated field of every remaining line.
  sampletemp <- readLines(files[i])[-1]
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  values <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  nalist[[i]] <- which(is.na(values))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any NA value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.


# --> All variables are used. Their number is taken from the first array
# (its first line is skipped):

sampletemp <- readLines(files[1])[-1]


# Generate the covariate matrix:

# One row per array, one column per variable.
datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Every array must provide exactly one value per column of 'datamat';
  # otherwise the row assignment could recycle values silently.
  stopifnot(length(sampletemp)==ncol(datamat))
  # The value is the second tab-separated field of each line.
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  datamat[i,] <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'SarcoidosisTranscr'
#####################################

# Store covariate matrix, target variable and batch variable in one file.
save(list=c("X", "y", "batch"), file="./FAbatchPaper/Datasets/ProcessedData/SarcoidosisTranscr.Rda")

# Delete the files in the download directory of this data set.
downloaded <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloaded)

# Clear the workspace:
rm(list=ls())
gc()















# WestNileVirusTranscr
#######################


# Start from an empty workspace.
rm(list=ls())
gc()

# Data set ID:

datasetid <- "E-GEOD-43190"


# Download processed Data:

library("ArrayExpress")
# The files are stored in a sub directory named after the data set ID.
getAE(datasetid, path = paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), type = "processed")



# Meta data:

sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", datasetid, ".sdrf.txt")

# First try to read the SDRF file as an annotated data frame.
adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() is used instead of comparing class(adf) with "!=", because
# class() may return more than one element.
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: parse the tab-separated file manually.
  adf <- readLines(sdrffile)
  # Distribution of the number of fields per line (diagnostic; its value is
  # not printed when the whole 'if' statement is evaluated at once).
  table(sapply(adf, function(x) length(strsplit(x, split="\t")[[1]])))

  # The first line holds the column names.
  header <- strsplit(adf[1], split="\t")[[1]]
  datainfo <- data.frame(matrix(nrow=0, ncol=length(header)))
  names(datainfo) <- header

  # One row per remaining line; seq_len() avoids iterating over c(1, 0)
  # when the file consists of the header line only.
  for(i in seq_len(max(length(adf)-1, 0)))
    datainfo[i,] <- strsplit(adf[i+1], split="\t")[[1]]
  rownames(datainfo) <- datainfo$'Source Name'

}




# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

# Sample table files of this data set (literal match on the file name suffix).
files <- grep("_sample_table.txt", list.files(paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE), value=TRUE, fixed=TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() and a fixed-string
# sub() are used instead of splitting on the directory path, because the
# path would otherwise be interpreted as a regular expression.
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)


# Keep only the files of samples that occur in the pheno data:
keep <- filesnames %in% datainfonames
files <- files[keep]
filesnames <- filesnames[keep]


# Is the ordering in the pheno data the exact reverse of that of the files?
isreversed <- length(filesnames)==length(datainfonames) && all(filesnames==rev(datainfonames))
isreversed

# --> The ordering in the pheno data file is reversed.

# The reversal below is only valid under this condition; stop rather than
# silently assigning batch and target values to the wrong arrays.
stopifnot(isreversed)

# -->

datainfo <- datainfo[rev(seq_len(nrow(datainfo))),]



# Batch and target variable:

# Batch: the batch labels recoded as integer codes.
batch <- factor(as.numeric(factor(datainfo$'Characteristics [batch]')))
# Target: 2 for "Severe Disease", 1 otherwise.
y <- factor(as.numeric(datainfo$'Characteristics [disease status]'=="Severe Disease")+1)

# '$' returns NULL for a column that does not exist, which would silently
# give zero-length factors; make sure there is one value per sample.
stopifnot(length(batch)==nrow(datainfo), length(y)==nrow(datainfo))



# Read in the arrays:

# Find out indices of NA-values for each array:

nit <- length(files)
# One list element per array, holding the positions of its NA values.
nalist <- vector("list", nit)

for(i in seq_along(files)) {
  # The first line is skipped (presumably the column header); the value is
  # taken from the second tab-separated field of every remaining line.
  sampletemp <- readLines(files[i])[-1]
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  values <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  nalist[[i]] <- which(is.na(values))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any NA value:
mean(vapply(nalist, length, integer(1))==0)

# --> No missing values.


# --> All variables are used. Their number is taken from the first array
# (its first line is skipped):

sampletemp <- readLines(files[1])[-1]


# Generate the covariate matrix:

# One row per array, one column per variable.
datamat <- matrix(nrow=length(files), ncol=length(sampletemp))

nit <- nrow(datamat)
for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Every array must provide exactly one value per column of 'datamat';
  # otherwise the row assignment could recycle values silently.
  stopifnot(length(sampletemp)==ncol(datamat))
  # The value is the second tab-separated field of each line.
  fields <- strsplit(sampletemp, split="\t", fixed=TRUE)
  datamat[i,] <- as.numeric(vapply(fields, function(f) f[2], character(1)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'WestNileVirusTranscr'
######################################

# Store covariate matrix, target variable and batch variable in one file.
save(list=c("X", "y", "batch"), file="./FAbatchPaper/Datasets/ProcessedData/WestNileVirusTranscr.Rda")

# Delete the files in the download directory of this data set.
downloaded <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloaded)

# Clear the workspace:
rm(list=ls())
gc()
