# Start from an empty workspace and run a garbage collection:
rm(list = ls())
gc()

# Accession of the data set handled by this script:

datasetid <- "E-GEOD-44132"


# Download the processed data into the folder reserved for this accession:

library("ArrayExpress")
getAE(
  datasetid,
  path = paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid),
  type = "processed"
)



# Meta data:
#
# Read the SDRF sample annotation file of the data set into the data frame
# 'datainfo'. The file is first read with read.AnnotatedDataFrame(); if that
# call fails, the tab-separated file is parsed by hand instead.

sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/",
                   datasetid, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() is used rather than comparing class() with "!=", because class()
# can return more than one element, which is not a valid if() condition.
if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: split every line of the file at the tabs. A tab is appended to
  # each line first, because strsplit() silently drops an empty field at the
  # end of a string, which would make such lines one field too short.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(paste0(adf, "\t"), split = "\t")
  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every record has to supply exactly one value per header column:
  stopifnot(all(lengths(sdrfrows) == length(sdrfheader)))

  # Build the character data frame in one step (this also works when the
  # file contains only the header line):
  datainfo <- as.data.frame(
    matrix(as.character(unlist(sdrfrows)),
           ncol = length(sdrfheader), byrow = TRUE),
    stringsAsFactors = FALSE
  )
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo[["Source Name"]]

  # The rest of the script addresses the columns by syntactic names such as
  # 'Characteristics..array.batch.', so the raw SDRF headers
  # (e.g. "Characteristics [array batch]") are converted accordingly:
  names(datainfo) <- make.names(names(datainfo), unique = TRUE)

}



# The cross batch controls seem to be replicates, so only the samples
# flagged "no" for this characteristic are kept:
notcontrol <- datainfo$Characteristics..cross.batch.control. == "no"
datainfo <- datainfo[notcontrol, ]

# Number of samples per batch:
table(datainfo$Characteristics..array.batch.)

# Target variable cross-tabulated against the batches:
table(
  datainfo$Characteristics..postpartum.depression.,
  datainfo$Characteristics..array.batch.
)

# --> The batches are small, but not strongly batch-imbalanced.




# The samples in the pheno data have to be in the same order as the
# per-sample data files that are read in below. The files are therefore
# matched to the pheno data by sample name, and the pheno data are reordered
# to follow the order of the files.

# NOTE(review): the working directory is switched to the home directory here,
# whereas the relative "./FAbatchPaper/..." paths used before this point were
# resolved against the previous working directory. Presumably the script is
# meant to be started from the home directory - confirm.
setwd("~/")
homedir <- getwd()

datadir <- paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)

# Per-sample data files (suffix matched literally, not as a regular expression):
files <- grep("_sample_table.txt", list.files(datadir, full.names = TRUE),
              value = TRUE, fixed = TRUE)

# Sample names according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the file names. basename() is used instead of
# splitting at the directory path, because the path would otherwise be
# interpreted as a regular expression.
filesnames <- sub("_sample_table.txt", "", basename(files), fixed = TRUE)

# Keep only the files that belong to samples present in the pheno data:
keepfile <- filesnames %in% datainfonames
files <- files[keepfile]
filesnames <- filesnames[keepfile]

# Informative check: is the ordering in the pheno data simply reversed?
identical(filesnames, rev(datainfonames))

# --> For this data set the ordering in the pheno data file is reversed.

# --> Reorder the pheno data by sample name, so that row i of 'datainfo'
# belongs to files[i]. For a purely reversed ordering this is the same as
# datainfo[nrow(datainfo):1,], but it does not silently misalign the samples
# if the ordering differs in any other way.

# The matching by name is only well-defined for unique sample names:
stopifnot(anyDuplicated(datainfonames) == 0L)

datainfo <- datainfo[match(filesnames, datainfonames), ]



# Batch variable: the array batch of each sample as a factor.

batch <- factor(datainfo$Characteristics..array.batch.)

# Target variable: factor coded 2 for samples with postpartum depression
# "yes" and 1 for every other non-missing value.

hasppd <- datainfo$Characteristics..postpartum.depression. == "yes"
y <- factor(1 + hasppd)



# Read in the arrays:

# We have to make a subset of the variables, since there are 500,000.
# --> Choose randomly 50,000.

# Find out indices of NA-values for each array:

# Look at the first lines of the first file (only these lines are read):
head(readLines(files[1], n = 6))

nit <- length(files)

# One entry per file: positions (after dropping the header line) at which the
# second tab-separated field cannot be read as a number.
nalist <- vector("list", nit)

# seq_along() instead of 1:length(), so that nothing is iterated if there are
# no files.
for (i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Second field of every line; NA if a line has fewer than two fields.
  secondfield <- vapply(strsplit(sampletemp, split = "\t"),
                        function(x) x[2], character(1))
  nalist[[i]] <- which(is.na(as.numeric(secondfield)))
  cat(paste("Sample", i, "of", nit), "\n")
}


# Proportion of arrays without any missing value:
mean(lengths(nalist) == 0)

# --> No missing values.


# --> Indices without missing values:

# The number of variables is taken from the first file (header line dropped).
# Indices recorded in 'nalist' as missing in any array are excluded; if no
# array has missing values, this is simply 1, ..., number of variables.
sampletemp <- readLines(files[1])[-1]
nonas <- setdiff(seq_along(sampletemp), unlist(nalist))


# Randomly choose 50000 of the variables without missing values:

nvars <- 50000

# Fail with a clear message if there are not enough variables to draw from:
stopifnot(length(nonas) >= nvars)

set.seed(1234)
ranind <- sort(sample(nonas, size = nvars))


# Generate the covariate matrix:
# one row per file (sample), one column per selected variable in 'ranind'.

datamat <- matrix(nrow = length(files), ncol = length(ranind))

nit <- nrow(datamat)
for (i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # Only the selected lines are split and converted; the value of a variable
  # is the second tab-separated field of its line.
  chosenfields <- strsplit(sampletemp[ranind], split = "\t")
  datamat[i, ] <- as.numeric(
    vapply(chosenfields, function(x) x[2], character(1))
  )
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)


# Save data set 'PostpartumDepressionMethyl'
############################################

# Store the covariate matrix, the target variable and the batch variable:
save(
  list = c("X", "y", "batch"),
  file = "./FAbatchPaper/Datasets/ProcessedData/PostpartumDepressionMethyl.Rda"
)

# Delete the downloaded intermediate files of this data set:
downloaded <- list.files(
  paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid),
  full.names = TRUE
)
file.remove(downloaded)

# Clear the workspace:
rm(list = ls())
gc()
