# Start from an empty workspace:
rm(list = ls())
gc()

# ID of the data set that is requested from ArrayExpress:

datasetid <- "E-GEOD-44281"


# Download the processed data into the download folder of this data set:

library("ArrayExpress")
download_dir <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)
getAE(datasetid, path = download_dir, type = "processed")


# Meta data: read the downloaded '.sdrf.txt' file and keep its data slot,
# which holds one row of annotation per sample.

sdrf_file <- paste0(download_dir, "/", datasetid, ".sdrf.txt")
adf <- read.AnnotatedDataFrame(sdrf_file)
datainfo <- adf@data


# Drop the first nine rows: according to the original author they belong
# to samples that are not part of the 410 samples analysed by the authors
# who generated the data set.

datainfo <- datainfo[-seq_len(9), ]




# The rows of the pheno data ('datainfo') must be in the same order as the
# per-sample data files that are read in further below. List these files
# and bring 'datainfo' into their order.

# The working directory is switched to the home directory here; relative
# "./FAbatchPaper/..." paths used later in this script are resolved against
# it from now on.
# NOTE(review): the download step earlier in the script wrote to a path
# relative to the working directory at that time, so the files are only
# found here if the script was started from the home directory - confirm.
setwd("~/")
homedir <- getwd()

sampledir <- paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Anchored pattern with an escaped dot, so only files that really end in
# '_sample_table.txt' are picked up:
suffixpattern <- "_sample_table\\.txt$"
files <- list.files(sampledir, pattern = suffixpattern, full.names = TRUE)

# Sample names as used in the pheno data (row names without " 1"):
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names of the files: file name without directory and suffix.
# basename() is used instead of splitting on the directory path, because
# the path would otherwise be interpreted as a regular expression.
filesnames <- sub(suffixpattern, "", basename(files))

# Keep only the files that belong to samples present in the pheno data:
keep <- filesnames %in% datainfonames
files <- files[keep]
filesnames <- filesnames[keep]

# Row of 'datainfo' that belongs to each file (no NAs possible after the
# filtering above):
rowindex <- match(filesnames, datainfonames)

if (length(files) != nrow(datainfo)) {
  warning("Found ", length(files), " sample files for ", nrow(datainfo),
          " pheno data rows; pheno data rows without a file are dropped.",
          call. = FALSE)
}

# The original author observed that the ordering in the pheno data file is
# exactly reversed compared to the files. Report whether this holds:
cat("Pheno data in exactly reversed file order:",
    identical(rowindex, rev(seq_len(nrow(datainfo)))), "\n")

# --> Reorder the pheno data by sample name, so that row i of 'datainfo'
# belongs to files[i] whatever the original ordering was:

datainfo <- datainfo[rowindex, , drop = FALSE]



# Batch variable: the batch annotation of the samples as a factor.
batch_labels <- datainfo$Characteristics..batch.
batch <- factor(batch_labels)

# Target variable: factor with level "2" for samples whose status is
# "case" and level "1" for all other samples.
is_case <- datainfo$Characteristics..status. == "case"
y <- factor(1 + as.numeric(is_case))



# Read in the arrays:

# NOTE(review): the comment that used to stand here said that a random
# subset of 50,000 of the roughly 500,000 variables is chosen. The code
# below performs no subsetting, all variables are kept - confirm which of
# the two is intended.

# Read one sample file and return the values of its second tab-separated
# column as a numeric vector. The first line of the file is skipped.
# Entries that cannot be converted to a number (or lines without a second
# field) become NA.
readsamplevalues <- function(file) {
  samplelines <- readLines(file)[-1]
  fields <- strsplit(samplelines, split = "\t", fixed = TRUE)
  as.numeric(vapply(fields, function(f) f[2], character(1)))
}

if (length(files) == 0) {
  stop("No sample files to read in.", call. = FALSE)
}


# Generate the covariate matrix (one row per sample file, one column per
# variable). Each file is read and parsed only once; the number of
# variables is taken from the first file.

nit <- length(files)
firstvalues <- readsamplevalues(files[1])
datamat <- matrix(nrow = nit, ncol = length(firstvalues))

for (i in seq_along(files)) {
  samplevalues <- if (i == 1) firstvalues else readsamplevalues(files[i])
  # Guard against silent recycling in the row assignment below:
  if (length(samplevalues) != ncol(datamat)) {
    stop("File ", files[i], " contains ", length(samplevalues),
         " values, expected ", ncol(datamat), ".", call. = FALSE)
  }
  datamat[i, ] <- samplevalues
  cat(paste("Sample", i, "of", nit), "\n")
}


# Find out indices of NA-values for each array:

nalist <- lapply(seq_len(nrow(datamat)), function(i) which(is.na(datamat[i, ])))

# Proportion of arrays without any NA-value:
mean(vapply(nalist, length, integer(1)) == 0)

# --> No missing values (as observed by the original author).


# --> Indices without missing values (all variables):

nonas <- seq_len(ncol(datamat))


# Bring to the used format:

X <- datamat




# Save data set 'BreastcTranscr'
##################################

# Store the covariate matrix, the target variable and the batch variable:
save(list = c("X", "y", "batch"),
     file = "./FAbatchPaper/Datasets/ProcessedData/BreastcTranscr.Rda")

# Delete the files in the download folder of this data set:
downloaded <- list.files(
  paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid),
  full.names = TRUE
)
file.remove(downloaded)

# Clear the workspace:
rm(list = ls())
gc()
