# Start from an empty workspace and run a garbage collection:
rm(list = ls())
gc()

# ArrayExpress accession of the data set handled by this script:

datasetid <- "E-GEOD-38873"


# Download the processed data into a sub-directory named after the data set:

library("ArrayExpress")
downloaddir <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)
getAE(datasetid, path = downloaddir, type = "processed")




# Meta data:
#
# Read the sample annotation (SDRF file) into the data frame 'datainfo'.
# read.AnnotatedDataFrame() is tried first; if it fails, the tab-separated
# file is parsed by hand.

sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", datasetid, ".sdrf.txt")

adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: split every line at the tabs; the first line is the header.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split = "\t")
  sdrfheader <- sdrffields[[1]]

  # Diagnostic: distribution of the number of fields per line. It has to be
  # printed explicitly, because values are not auto-printed inside braces.
  print(table(lengths(sdrffields)))

  datainfo <- data.frame(matrix(nrow = 0, ncol = length(sdrfheader)))
  names(datainfo) <- sdrfheader

  # seq_len() keeps the loop empty when the file holds only the header line.
  for (i in seq_len(length(adf) - 1))
    datainfo[i, ] <- sdrffields[[i + 1]]
  rownames(datainfo) <- datainfo$'Source Name'

  # The rest of the script accesses columns through syntactic names such as
  # 'Comment..Sample_title.' and 'FactorValue..DISEASE.STATE.'. make.names()
  # replaces every character that is invalid in a syntactic name by ".", so
  # the raw header names are converted here to keep those lookups from
  # silently returning NULL.
  # NOTE(review): assumes the raw headers have the form "Comment [Sample_title]"
  # -- confirm against the SDRF file.
  names(datainfo) <- make.names(names(datainfo), unique = TRUE)

}



# Remove replicates:
# Rows whose sample title contains "_rep" are dropped. seq_len() is used
# instead of 1:nrow(), which would yield c(1, 0) for an empty table and
# thereby create a spurious all-NA row.
replicaterows <- grep("_rep", datainfo$Comment..Sample_title.)
datainfo <- datainfo[setdiff(seq_len(nrow(datainfo)), replicaterows),]

# Cross-tabulate disease state and batch before filtering by disease state:
table(datainfo$FactorValue..DISEASE.STATE., datainfo$Characteristics..batch.)



# Keep only BP patients and controls:

datainfo <- datainfo[datainfo$FactorValue..DISEASE.STATE. %in% c("BP", "Unaffected control"),]

# Same cross-tabulation for the remaining samples:
table(datainfo$FactorValue..DISEASE.STATE., datainfo$Characteristics..batch.)

# --> OK.



# The rows of the pheno data ('datainfo') have to be in the same order as
# the array files that are read in below. The sample identifier of each
# array file is its file name without the "_sample_table.txt" suffix.

setwd("~/")
homedir <- getwd()

datadir <- paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)
files <- grep("_sample_table.txt", list.files(datadir, full.names=TRUE), value=TRUE)

# Sample identifiers according to the pheno data:
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Keep only the array files of samples that are still in the pheno data.
# basename() is used to strip the directory, so the directory path is never
# interpreted as a regular expression.
files <- files[sub("_sample_table.txt", "", basename(files), fixed=TRUE) %in% datainfonames]
filesnames <- sub("_sample_table.txt", "", basename(files), fixed=TRUE)

if (length(files) == 0)
  stop("No '_sample_table.txt' file matches a sample of the pheno data.", call. = FALSE)

# Is the ordering in the pheno data file exactly the reverse of the file order?
identical(filesnames, rev(datainfonames))

# --> The ordering in the pheno data file is reversed.

# -->

# Bring the pheno data into the order of the array files. For an exactly
# reversed ordering this equals datainfo[nrow(datainfo):1,], but the
# alignment no longer relies on that check having been inspected by eye.
sampleorder <- match(filesnames, datainfonames)
stopifnot(!anyNA(sampleorder))

if (length(sampleorder) < nrow(datainfo))
  warning("Dropping ", nrow(datainfo) - length(sampleorder),
          " pheno data row(s) without a matching array file.", call. = FALSE)

datainfo <- datainfo[sampleorder,]



# Construct the batch and the target variable:
# Both are coded as factors that take the value 2 where the respective
# condition holds and 1 otherwise.

insecondbatch <- datainfo$Characteristics..batch. == 2
isbipolar <- datainfo$FactorValue..DISEASE.STATE. == "BP"

batch <- factor(1 + as.numeric(insecondbatch))
y <- factor(1 + as.numeric(isbipolar))



# Read in the arrays:
#
# Every array file is read exactly once. The first line (header) is
# discarded and the second tab-separated field of each remaining line is
# converted to a number; entries that cannot be converted become NA.

if (length(files) == 0)
  stop("There are no array files to read.", call. = FALSE)

nit <- length(files)
valuelist <- vector("list", nit)

for(i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  secondfield <- vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1))
  valuelist[[i]] <- as.numeric(secondfield)
  cat(paste("Sample", i, "of", nit), "\n")
}

# All arrays must provide the same, non-zero number of values; otherwise
# they cannot be stacked into one matrix.
stopifnot(lengths(valuelist)[1] > 0, length(unique(lengths(valuelist))) == 1)


# Find out indices of NA-values for each array:

nalist <- lapply(valuelist, function(x) which(is.na(x)))

# Proportion of arrays without any missing value:
mean(lengths(nalist)==0)

# --> Every observation has missing values.


# Generate the covariate matrix (one row per array, one column per variable):

datamat <- do.call(rbind, valuelist)

nit <- nrow(datamat)


# Bring to the used format:

X <- as.matrix(datamat)



# Treatment of missing values:

# Distribution of the number of missing values per variable:
table(colSums(is.na(X)))

# --> Remove variables with more than five missing values:
# (drop = FALSE keeps X a matrix even if only one variable remains.)

X <- X[, colSums(is.na(X)) <= 5, drop = FALSE]

# Set missing values to the intra-batch median of the corresponding variable:

navars <- which(colSums(is.na(X)) > 0)

for(v in navars) {
  for(j in levels(batch)) {
    # which() ignores samples with an NA batch label, which would otherwise
    # make the subscripted assignment fail.
    inbatch <- which(batch == j)
    gaps <- inbatch[is.na(X[inbatch, v])]
    X[gaps, v] <- median(X[inbatch, v], na.rm=TRUE)
  }
}

# A value can only stay missing if its variable has no observed value within
# the batch or if the sample has no batch label; report this instead of
# saving NAs silently.
if (anyNA(X))
  warning(sum(is.na(X)), " missing value(s) could not be imputed.", call. = FALSE)



# Save data set 'BipolardisorderMethyl'
############################################

# Store the covariate matrix, the target variable and the batch variable:
save(X, y, batch, file = "./FAbatchPaper/Datasets/ProcessedData/BipolardisorderMethyl.Rda")

# Delete the downloaded intermediate files of this data set:
intermediatefiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names = TRUE)
file.remove(intermediatefiles)

# Clear the workspace:
rm(list = ls())
gc()
