#######################################################################################
#######################################################################################
# This concatenation of microarray data sets studying breast cancer consists of samples from 
# ArrayExpress data sets with the following accession codes:
#
# 1) E-GEOD-27562
# 2) E-GEOD-21422
# 3) E-GEOD-22544
# 4) E-GEOD-20266
# 5) E-TABM-276
#
# To reduce the computational burden we thereby only considered genes present in
# the Affymetrix GeneChip Human Genome U133A 2.0-arrays, resulting in 22,277 variables.
#######################################################################################
#######################################################################################


#######################################################################################
# As a first step, we downloaded the data sets and put the CEL-files belonging to a 
# specific data set into single folders. We subset these data sets, so that the resulting 
# data sets would consist of samples belonging to two classes only, "breast cancer" vs. 
# "healthy". Outliers were also removed in two data sets.
# Then we subset the variables.
#######################################################################################




# Downloading of the CEL-files, removing CEL-files corresponding to observations 
# not used in the analysis and creating the target and batch variable:
################################################################################

# First Dataset:
##################

# Accession number and folder the raw data of this data set is stored in:
datasetid <- "E-GEOD-27562"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Make sure the download folder exists (no-op if it is already there):
dir.create(newfolder, recursive = TRUE, showWarnings = FALSE)

# Download RAW data from ArrayExpress:
library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load phenodata (for older microarray datasets 'read.AnnotatedDataFrame' throws an
# error, in which case the 'sdrf.txt'-file has to be read in manually):

sdrffile <- file.path(newfolder, paste0(datasetid, ".sdrf.txt"))

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  # Manual fallback: split every line at the tabs; the first line holds the
  # column names, every further line is one observation.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")

  # Diagnostic: number of fields per line (only visible when this line is
  # run interactively on its own).
  table(lengths(sdrffields))

  header <- sdrffields[[1]]
  datarows <- sdrffields[-1]

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # seq_along() instead of 1:(n-1), so that a header-only file yields an
  # empty data frame instead of iterating over c(1, 0).
  for (i in seq_along(datarows))
    datainfo[i, ] <- datarows[[i]]
  rownames(datainfo) <- datainfo[["Source Name"]]

  arraydatafile <- "Array Data File"

}


# Subset phenodata to feature only relevant observations:
# NOTE(review): the column names 'Characteristics.phenotype.' and
# 'Array.Data.File' used below are the ones produced by the
# 'read.AnnotatedDataFrame' branch above - confirm if the fallback is hit.

keepphenotypes <- c("Malignant", "Normal", "Pre-Surgery (aka Malignant)")
datainfo <- datainfo[datainfo[["Characteristics.phenotype."]] %in% keepphenotypes, ]



# The not used CEL files are removed from the corresponding folder
# to save disk space:

celfiles <- datainfo[[arraydatafile]]

# Literal ".cel" / ".CEL" (the dot is escaped, so that files merely containing
# "cel" preceded by an arbitrary character are not treated as CEL files):
allcelfiles <- grep("[.](cel|CEL)", list.files(newfolder), value = TRUE)

celfilestodelete <- setdiff(allcelfiles, celfiles)

# Sanity checks (printed): the last two numbers should agree.
length(celfilestodelete)
length(celfiles) + length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))


# Remove the zip files the CEL files were stored in to save disk space:

filesinfolder <- list.files(newfolder)

# Files whose name ends in "zip"; endsWith() also copes with an empty folder
# and with very short file names.
zipfileind <- which(endsWith(filesinfolder, "zip"))
zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))



# Removal of outliers:

# Read in raw data:
library(affy)
cellist <- list.celfiles(path = newfolder, full.names = TRUE)

affydata <- ReadAffy(filenames = cellist)

# perform RMA normalization:
affydataprocessed <- rma(affydata)

# get expression matrix (arrays in rows):
databatchtog <- t(exprs(affydataprocessed))


# Perform PCA analysis to spot outliers:

# Array names without the ".CEL" suffix. sub() keeps exactly one entry per
# array, so 'arraysdata' stays aligned with the rows of 'xp'.
arraysdata <- sub("[.]CEL$", "", rownames(databatchtog))

xpr <- prcomp(databatchtog, scale. = FALSE)
xp <- predict(xpr)[, 1:2]
plot(xp)

# Outliers: arrays with a first principal component score below -100.
# NOTE(review): the sign of a principal component is arbitrary, so check the
# plot above before trusting the direction of this threshold.
outlierind <- which(xp[, 1] < -100)
arraysdata[outlierind]

# --> Remove outliers:

celfilestodelete <- paste0(arraysdata[outlierind], ".CEL")


# Delete not used CEL-files:

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))

arraysdata <- setdiff(arraysdata, arraysdata[outlierind])

datainfo <- datainfo[datainfo[["Array.Data.File"]] %in% paste0(arraysdata, ".CEL"), ]



# Generate target variable (1 = "Normal", 2 = everything else):

y <- factor(as.numeric(datainfo[["Characteristics.phenotype."]] != "Normal") + 1)
names(y) <- datainfo[["Array.Data.File"]]

# batch variable:

batchid <- 1

batch <- rep(batchid, length(y))
names(batch) <- datainfo[["Array.Data.File"]]

# Save target and batch variable:

save(y, batch, file = file.path(newfolder, paste0(datasetid, "metadata.Rda")))



	

# Second Dataset:
##################

# NOTE: See first dataset above for commentation. 

datasetid <- "E-GEOD-21422"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Make sure the download folder exists (no-op if it is already there):
dir.create(newfolder, recursive = TRUE, showWarnings = FALSE)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load phenodata; fall back to parsing the SDRF file by hand if
# 'read.AnnotatedDataFrame' fails:

sdrffile <- file.path(newfolder, paste0(datasetid, ".sdrf.txt"))

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  # First line = column names, every further line = one observation.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")

  # Diagnostic: number of fields per line (only visible when this line is
  # run interactively on its own).
  table(lengths(sdrffields))

  header <- sdrffields[[1]]
  datarows <- sdrffields[-1]

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # seq_along() instead of 1:(n-1), so that a header-only file yields an
  # empty data frame instead of iterating over c(1, 0).
  for (i in seq_along(datarows))
    datainfo[i, ] <- datarows[[i]]
  rownames(datainfo) <- datainfo[["Source Name"]]

  arraydatafile <- "Array Data File"

}


# Keep only the "healthy" and "tumor" samples:
# NOTE(review): 'Comment..Sample_source_name.' and 'Array.Data.File' are the
# column names produced by the 'read.AnnotatedDataFrame' branch - confirm if
# the fallback is hit.

datainfo <- datainfo[datainfo[["Comment..Sample_source_name."]] %in% c("healthy", "tumor"), ]


# Remove the CEL files of observations that are not used:

celfiles <- datainfo[[arraydatafile]]

# Literal ".cel" / ".CEL" (escaped dot):
allcelfiles <- grep("[.](cel|CEL)", list.files(newfolder), value = TRUE)

celfilestodelete <- setdiff(allcelfiles, celfiles)

# Sanity checks (printed): the last two numbers should agree.
length(celfilestodelete)
length(celfiles) + length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))


# Remove the zip files (file names ending in "zip"); endsWith() also copes
# with an empty folder and with very short file names:

filesinfolder <- list.files(newfolder)

zipfileind <- which(endsWith(filesinfolder, "zip"))
zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))


# Target variable (1 = not "tumor", 2 = "tumor") and batch variable:

y <- factor(as.numeric(datainfo[["Comment..Sample_source_name."]] == "tumor") + 1)
names(y) <- datainfo[["Array.Data.File"]]

batchid <- 2

batch <- rep(batchid, length(y))
names(batch) <- datainfo[["Array.Data.File"]]


save(y, batch, file = file.path(newfolder, paste0(datasetid, "metadata.Rda")))




 

# Third Dataset:
##################
 
# NOTE: See first dataset above for commentation. 

datasetid <- "E-GEOD-22544"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Make sure the download folder exists (no-op if it is already there):
dir.create(newfolder, recursive = TRUE, showWarnings = FALSE)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load phenodata; fall back to parsing the SDRF file by hand if
# 'read.AnnotatedDataFrame' fails:

sdrffile <- file.path(newfolder, paste0(datasetid, ".sdrf.txt"))

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  # First line = column names, every further line = one observation.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")

  # Diagnostic: number of fields per line (only visible when this line is
  # run interactively on its own).
  table(lengths(sdrffields))

  header <- sdrffields[[1]]
  datarows <- sdrffields[-1]

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # seq_along() instead of 1:(n-1), so that a header-only file yields an
  # empty data frame instead of iterating over c(1, 0).
  for (i in seq_along(datarows))
    datainfo[i, ] <- datarows[[i]]
  rownames(datainfo) <- datainfo[["Source Name"]]

  arraydatafile <- "Array Data File"

}


# Remove the zip files (file names ending in "zip"); endsWith() also copes
# with an empty folder and with very short file names:

filesinfolder <- list.files(newfolder)

zipfileind <- which(endsWith(filesinfolder, "zip"))
zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))



# Read in and RMA-normalize the raw data of this data set:

library(affy)
cellist <- list.celfiles(path = newfolder, full.names = TRUE)

affydata <- ReadAffy(filenames = cellist)

affydataprocessed <- rma(affydata)

# Expression matrix with arrays in rows:
databatchtog <- t(exprs(affydataprocessed))


# Array names without the ".CEL" suffix. sub() keeps exactly one entry per
# array, so 'arraysdata' stays aligned with the rows of 'xp'.
arraysdata <- sub("[.]CEL$", "", rownames(databatchtog))


# PCA to spot outliers: arrays with a first principal component score
# below -200 are removed.
# NOTE(review): the sign of a principal component is arbitrary, so check the
# scores before trusting the direction of this threshold.
xpr <- prcomp(databatchtog, scale. = FALSE)
xp <- predict(xpr)[, 1:2]

outlierind <- which(xp[, 1] < -200)
arraysdata[outlierind]


# Delete the CEL files of the outliers:

celfilestodelete <- paste0(arraysdata[outlierind], ".CEL")

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))

arraysdata <- setdiff(arraysdata, arraysdata[outlierind])


# Restrict the phenodata to the remaining arrays:
# NOTE(review): 'Array.Data.File' and 'Characteristics..disease.state.' are
# the column names produced by the 'read.AnnotatedDataFrame' branch - confirm
# if the fallback is hit.

datainfo <- datainfo[datainfo[["Array.Data.File"]] %in% paste0(arraysdata, ".CEL"), ]

# Target variable (1 = not "breast cancer", 2 = "breast cancer") and batch
# variable:

y <- factor(as.numeric(datainfo[["Characteristics..disease.state."]] == "breast cancer") + 1)
names(y) <- datainfo[["Array.Data.File"]]

batchid <- 3

batch <- rep(batchid, length(y))
names(batch) <- datainfo[["Array.Data.File"]]


save(y, batch, file = file.path(newfolder, paste0(datasetid, "metadata.Rda")))
 


 
 
 
 
# Fourth Dataset:
##################
 
# NOTE: See first dataset above for commentation. 
 
datasetid <- "E-GEOD-20266"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Make sure the download folder exists (no-op if it is already there):
dir.create(newfolder, recursive = TRUE, showWarnings = FALSE)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load phenodata; fall back to parsing the SDRF file by hand if
# 'read.AnnotatedDataFrame' fails:

sdrffile <- file.path(newfolder, paste0(datasetid, ".sdrf.txt"))

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  # First line = column names, every further line = one observation.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")

  # Diagnostic: number of fields per line (only visible when this line is
  # run interactively on its own).
  table(lengths(sdrffields))

  header <- sdrffields[[1]]
  datarows <- sdrffields[-1]

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # seq_along() instead of 1:(n-1), so that a header-only file yields an
  # empty data frame instead of iterating over c(1, 0).
  for (i in seq_along(datarows))
    datainfo[i, ] <- datarows[[i]]
  rownames(datainfo) <- datainfo[["Source Name"]]

  arraydatafile <- "Array Data File"

}


# Remove the zip files (file names ending in "zip"); endsWith() also copes
# with an empty folder and with very short file names:

filesinfolder <- list.files(newfolder)

zipfileind <- which(endsWith(filesinfolder, "zip"))
zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))


# Target variable (1 = not "Breast Cancer", 2 = "Breast Cancer") and batch
# variable:
# NOTE(review): 'Characteristics [disease status]' and 'Array Data File' are
# the raw SDRF column names, i.e. the ones produced by the manual fallback
# branch above - confirm if 'read.AnnotatedDataFrame' succeeds instead.

y <- factor(as.numeric(datainfo[["Characteristics [disease status]"]] == "Breast Cancer") + 1)
names(y) <- datainfo[["Array Data File"]]

batchid <- 4

batch <- rep(batchid, length(y))
names(batch) <- datainfo[["Array Data File"]]


save(y, batch, file = file.path(newfolder, paste0(datasetid, "metadata.Rda")))



 
 
 
 

# Fifth Dataset:
##################
 
# NOTE: See first dataset above for commentation. 

datasetid <- "E-TABM-276"
newfolder <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Make sure the download folder exists (no-op if it is already there):
dir.create(newfolder, recursive = TRUE, showWarnings = FALSE)

library("ArrayExpress")
getAE(datasetid, path = newfolder, type = "raw")


# Load phenodata; fall back to parsing the SDRF file by hand if
# 'read.AnnotatedDataFrame' fails:

sdrffile <- file.path(newfolder, paste0(datasetid, ".sdrf.txt"))

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
  arraydatafile <- "Array.Data.File"
} else {

  # First line = column names, every further line = one observation.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")

  # Diagnostic: number of fields per line (only visible when this line is
  # run interactively on its own).
  table(lengths(sdrffields))

  header <- sdrffields[[1]]
  datarows <- sdrffields[-1]

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # seq_along() instead of 1:(n-1), so that a header-only file yields an
  # empty data frame instead of iterating over c(1, 0).
  for (i in seq_along(datarows))
    datainfo[i, ] <- datarows[[i]]
  rownames(datainfo) <- datainfo[["Source Name"]]

  arraydatafile <- "Array Data File"

}


# Select the observations to use: for every individual with at least one
# "Invasive Ductal Carcinoma" sample exactly one such sample is kept (drawn
# at random if there are several), plus all samples with histology
# "not applicable".
# NOTE(review): 'Factor Value [...]' and 'Array Data File' are the raw SDRF
# column names, i.e. the ones produced by the manual fallback branch above -
# confirm if 'read.AnnotatedDataFrame' succeeds instead.

histology <- datainfo[["Factor Value [Histology]"]]
individual <- datainfo[["Factor Value [Individual]"]]
iscarcinoma <- histology == "Invasive Ductal Carcinoma"

patientids <- names(table(individual[iscarcinoma]))

# One selected row index per individual (preallocated instead of grown):
indssub <- integer(length(patientids))

# NOTE(review): sample() changed its default algorithm in R 3.6.0; if the
# original subset was drawn with an older R version, reproducing it may
# require RNGkind(sample.kind = "Rounding") - confirm.
set.seed(1234)

# The loop order is kept as is, so that the sequence of sample() calls (and
# hence the random stream) is unchanged.
for (i in seq_along(patientids)) {

  indstemp <- which((individual == patientids[i]) & iscarcinoma)

  # sample() is only called for more than one candidate, because
  # sample(k, size=1) with a single number k would draw from 1:k.
  if (length(indstemp) > 1) {
    indssub[i] <- sample(indstemp, size = 1)
  } else {
    indssub[i] <- indstemp
  }

}

allsubinds <- sort(c(which(histology == "not applicable"), indssub))

datainfo <- datainfo[allsubinds, ]



# Remove the CEL files of observations that are not used:

celfiles <- datainfo[[arraydatafile]]

# Literal ".cel" / ".CEL" (escaped dot):
allcelfiles <- grep("[.](cel|CEL)", list.files(newfolder), value = TRUE)

celfilestodelete <- setdiff(allcelfiles, celfiles)

# Sanity checks (printed): the last two numbers should agree.
length(celfilestodelete)
length(celfiles) + length(celfilestodelete)
length(allcelfiles)

deletepaths <- file.path(newfolder, celfilestodelete)
invisible(file.remove(deletepaths[file.exists(deletepaths)]))



# Remove the zip files (file names ending in "zip"); endsWith() also copes
# with an empty folder and with very short file names:

filesinfolder <- list.files(newfolder)

zipfileind <- which(endsWith(filesinfolder, "zip"))
zipfiles <- filesinfolder[zipfileind]

zippaths <- file.path(newfolder, zipfiles)
invisible(file.remove(zippaths[file.exists(zippaths)]))



# Target variable (1 = not "Invasive Ductal Carcinoma", 2 = "Invasive Ductal
# Carcinoma"), computed on the subsetted phenodata, and batch variable:

y <- factor(as.numeric(datainfo[["Factor Value [Histology]"]] == "Invasive Ductal Carcinoma") + 1)
names(y) <- datainfo[["Array Data File"]]

batchid <- 5

batch <- rep(batchid, length(y))
names(batch) <- datainfo[["Array Data File"]]


save(y, batch, file = file.path(newfolder, paste0(datasetid, "metadata.Rda")))
 
 
 

# Read in all CEL-files and normalize them together.
# Then create combined target and batch variable:
######################################################

# Read in and normalize data:

datasetids <- c("E-GEOD-27562", "E-GEOD-21422", "E-GEOD-22544", 
  "E-GEOD-20266", "E-TABM-276")

newfolders <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetids)


library("affy")
cellist <- list.celfiles(path = newfolders, full.names = TRUE)

affydata <- ReadAffy(filenames = cellist)

# perform RMA normalization:
affydataprocessed <- rma(affydata)

# get expression matrix (arrays in rows):
databatchtog <- t(exprs(affydataprocessed))


# Create target and batch variable from the per-dataset metadata files, each
# of which contains the objects 'y' and 'batch':

yparts <- vector("list", length(newfolders))
batchparts <- vector("list", length(newfolders))

for (i in seq_along(newfolders)) {
  # Load into a scratch environment so the loop does not depend on (or
  # clobber) workspace objects named 'y' and 'batch'.
  metaenv <- new.env()
  load(file.path(newfolders[i], paste0(datasetids[i], "metadata.Rda")), envir = metaenv)

  # Carry over the class LABELS ("1"/"2") rather than the internal integer
  # codes of the factor: the codes would be wrong for a data set in which
  # only one of the two classes is present.
  yparts[[i]] <- setNames(as.character(metaenv$y), names(metaenv$y))
  batchparts[[i]] <- metaenv$batch
}

yall <- factor(unlist(yparts))
batchall <- factor(unlist(batchparts))

# The array names are used as lookup keys below, so they have to be unique:
if (anyDuplicated(names(yall)) > 0 || anyDuplicated(names(batchall)) > 0)
  stop("Array names are not unique across the data sets.", call. = FALSE)


# Reorder target and batch variable according to the combined
# dataset:

yorder <- match(rownames(databatchtog), names(yall))
batchorderall <- match(rownames(databatchtog), names(batchall))

if (anyNA(yorder) || anyNA(batchorderall))
  warning("Some arrays in the combined data have no entry in the metadata; ",
    "their target/batch values are NA.", call. = FALSE)

y <- yall[yorder]
batch <- batchall[batchorderall]


# Reorder data matrix, target and batch variable to obtain the
# right order of the batches:

batchorder <- order(batch)

y <- y[batchorder]
batch <- batch[batchorder]
databatchtog <- databatchtog[batchorder, , drop = FALSE]





# Subset the genes to feature only those present on an HGU133A-array:
######################################################################

# Read in two HGU133A-datasets and check whether the same genes
# are present:

library("GEOquery")

matrixfile1 <- "./FAbatchPaper/Datasets/DownloadedIntermediateData/GSE36487/GSE36487_series_matrix.txt.gz"

# Make sure the target folder exists and download in binary mode (the file
# is gzip-compressed):
dir.create(dirname(matrixfile1), recursive = TRUE, showWarnings = FALSE)
download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE36nnn/GSE36487/matrix/GSE36487_series_matrix.txt.gz", destfile = matrixfile1, mode = "wb")

# Read in expression data ('filename' spelled out instead of relying on
# partial argument matching of 'file'):
X1 <- getGEO(filename = matrixfile1)

# p x n - expression matrix:
Xdata1 <- exprs(X1)

genenames1 <- rownames(Xdata1)



matrixfile2 <- "./FAbatchPaper/Datasets/DownloadedIntermediateData/GSE30884/GSE30884_series_matrix.txt.gz"

dir.create(dirname(matrixfile2), recursive = TRUE, showWarnings = FALSE)
download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE30nnn/GSE30884/matrix/GSE30884_series_matrix.txt.gz", destfile = matrixfile2, mode = "wb")

# Read in expression data:
X2 <- getGEO(filename = matrixfile2)

# p x n - expression matrix:
Xdata2 <- exprs(X2)

genenames2 <- rownames(Xdata2)


# Same names in the same order? (identical() also gives a clean FALSE when
# the two data sets differ in their number of rows.)
identical(genenames1, genenames2)

# --> Names are the same.



# Look whether all the genenames are also present in 'databatchtog':

namesdatabatchtog <- colnames(databatchtog)
all(genenames1 %in% namesdatabatchtog)

# Number of HGU133A names that are missing in 'databatchtog':
sum(!(genenames1 %in% namesdatabatchtog))

# --> This is not the case. However, looking at the dimension
# of the HGU133A-datasets 
dim(Xdata2)
# reveals that they contain 6 more "variables" than expected.
# These 6 additional "variables" are not part of standard
# Affymetrix GeneChip Human Genome U133 Plus 2.0-data sets,
# which is why they are not included in 'databatchtog' either.


# --> Create the subsetted data set (drop = FALSE keeps 'X' a matrix even
# if only a single column should remain):

X <- databatchtog[, namesdatabatchtog %in% genenames1, drop = FALSE]


# Remove row and column names of data matrix, names of
# the target and batch variable:

rownames(X) <- NULL
colnames(X) <- NULL

names(y) <- NULL
names(batch) <- NULL




# Save combined dataset:
#########################

# Store the data matrix 'X', the target variable 'y' and the batch variable
# 'batch' together in one file:
save(
  X, y, batch,
  file = "./FAbatchPaper/Datasets/ProcessedData/BreastCancerConcatenation.Rda"
)



# Remove all temporarily stored files:
#######################################

# Folders of the five ArrayExpress data sets and of the two GEO data sets
# that were only used to look up the HGU133A gene names:
datasetids <- c(
  "E-GEOD-27562", "E-GEOD-21422", "E-GEOD-22544",
  "E-GEOD-20266", "E-TABM-276",
  "GSE30884", "GSE36487"
)

newfolders <- file.path("./FAbatchPaper/Datasets/DownloadedIntermediateData", datasetids)

# Delete every file found in each of these folders:
for (folder in newfolders) {
  file.remove(list.files(folder, full.names = TRUE))
}


# Clear workspace:
rm(list = ls())
gc()
