rm(list=ls()); gc()

# Data set ID (accession passed to getAE() and used in all folder names):

datasetid <- "E-GEOD-36194"


# Download processed data:

library("ArrayExpress")

# Folder that receives the downloaded files of this data set. It is created
# first (a no-op if it already exists), so that the download does not depend
# on the folder having been set up by hand.
downloaddir <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)
dir.create(downloaddir, recursive = TRUE, showWarnings = FALSE)

getAE(datasetid, path = downloaddir, type = "processed")




# Meta data:

# Sample annotation (SDRF) file that was downloaded for this data set:
sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/",
                   datasetid, "/", datasetid, ".sdrf.txt")

# First attempt: let read.AnnotatedDataFrame() parse the file.
adf <- try(read.AnnotatedDataFrame(sdrffile))

# inherits() is used instead of class(adf) != "try-error", because the
# condition of 'if' has to be a single TRUE/FALSE regardless of how many
# entries the class vector of 'adf' has.
if(!inherits(adf, "try-error")) {
  datainfo <- adf@data
} else {

  # Fallback: parse the tab-separated file by hand. The first line holds the
  # column names, every further line describes one sample.
  adf <- readLines(sdrffile)
  sdrffields <- strsplit(adf, split="\t")

  # Diagnostic: number of fields per line (printed explicitly, since values
  # inside a braced block are not auto-printed):
  print(table(lengths(sdrffields)))

  sdrfheader <- sdrffields[[1]]
  sdrfrows <- sdrffields[-1]

  # Every sample line has to provide exactly one value per column:
  stopifnot(all(lengths(sdrfrows) == length(sdrfheader)))

  # Build the data frame in one step instead of growing it row by row;
  # as.character() keeps this working for a file without sample lines.
  datainfo <- data.frame(matrix(as.character(unlist(sdrfrows)),
                                nrow=length(sdrfrows),
                                ncol=length(sdrfheader), byrow=TRUE),
                         stringsAsFactors=FALSE)
  names(datainfo) <- sdrfheader
  rownames(datainfo) <- datainfo$'Source Name'

}



# The signal turned out to be too strong, with the (measured) expressions
# being too disparate between men and women. Tissue by sex:

table(datainfo$`Characteristics[organism part]`, datainfo$`FactorValue [SEX]`)

# --> Keep only the frontal cortex measurements taken from men and use
#     dichotomized age as target variable.

isfrontalcortex <- datainfo$`Characteristics[organism part]` == "frontal cortex"
ismale <- datainfo$`FactorValue [SEX]` == "male"

# which() drops samples for which one of the two conditions is NA:
subsetind <- which(isfrontalcortex & ismale)

datainfo <- datainfo[subsetind, ]


# Target variable and batch sizes:

# Age is dichotomized at its median: 1 = at most the median, 2 = above it.
agevalues <- as.numeric(datainfo$`Characteristics[age (y)]`)
isabovemedian <- agevalues > median(agevalues)
y <- factor(as.numeric(isabovemedian) + 1)

# Batch sizes, overall and split by target class:
table(datainfo$`Characteristics[batch]`)
table(datainfo$`Characteristics[batch]`, y)

# --> Many batches of smaller sizes.




# The samples have to be in the same order in the pheno data as in the
# expression files that are read in below. Both are matched by sample name.

# NOTE: from here on the working directory is the home directory; all
# relative "./FAbatchPaper/..." paths used afterwards are resolved against it.
setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

# Sample names as used in the pheno data (row names with " 1" removed):
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names of the files: file name without folder and without the
# "_sample_table.txt" suffix. basename() is used instead of splitting on the
# folder path, because strsplit() would treat that path as a regular
# expression (and fail or mis-split on special characters in it).
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed=TRUE)

# Keep only the files of samples that are (still) part of the pheno data:
keepfiles <- filesnames %in% datainfonames
files <- files[keepfiles]
files.names2 <- files.names2[keepfiles]
filesnames <- filesnames[keepfiles]

# Is the pheno data simply in reversed file order?
length(filesnames)==length(datainfonames) && all(filesnames==rev(datainfonames))

# --> For this data set the ordering in the pheno data file was found to be
#     reversed.

# Rather than relying on that, the pheno data is brought into file order
# explicitly. If the check above is TRUE this is the same as reversing the
# rows; otherwise it still yields correctly aligned rows and drops samples
# for which no file exists.
stopifnot(length(files) > 0,
          anyDuplicated(filesnames) == 0,
          anyDuplicated(datainfonames) == 0)

datainfo <- datainfo[match(filesnames, datainfonames),]



# Batch and target variable, computed from the current row order of the
# pheno data:

# Batches are recoded to consecutive numbers 1, 2, ... (as factor levels):
batchlabels <- factor(datainfo$`Characteristics[batch]`)
batch <- factor(as.numeric(batchlabels))

# Age dichotomized at its median: 1 = at most the median, 2 = above it.
agevalues <- as.numeric(datainfo$`Characteristics[age (y)]`)
y <- factor(as.numeric(agevalues > median(agevalues)) + 1)




# Read in the arrays:

# Find out indices of NA-values for each array:

# Without any file there is nothing to read (and 'sampletemp', which is used
# below, would never be created), so stop early with a clear error:
stopifnot(length(files) > 0)

nit <- length(files)
nalist <- vector("list", nit)

for(i in seq_along(files)) {
  # All lines except the header line:
  sampletemp <- readLines(files[i])[-1]
  # The second tab-separated field of each line is taken as the measured
  # value; everything that cannot be converted to a number counts as missing.
  valuestemp <- vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1))
  nalist[[i]] <- which(is.na(as.numeric(valuestemp)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any missing value:
mean(lengths(nalist)==0)

# Distribution of the number of missing values per array:
table(lengths(nalist))


# Get indices of variables without missing values in any array.
# 'sampletemp' holds the lines of the last file read; this assumes that all
# files contain the same variables in the same order -- TODO confirm.

nonas <- setdiff(seq_along(sampletemp), unlist(nalist))

# Proportion of variables WITH a missing value in at least one array, i.e.
# the share that would be lost when keeping complete variables only:
(length(sampletemp) - length(nonas))/length(sampletemp)

# --> Too many variables would be lost when using only
#     those with complete measurements.



# Generate the covariate matrix (rows: arrays in the order of 'files',
# columns: variables in the order of the lines in the files):

stopifnot(length(files) > 0)

nit <- length(files)
datamat <- NULL

for(i in seq_along(files)) {
  # All lines except the header line; the second tab-separated field of each
  # line is the measured value (non-numeric entries become NA):
  sampletemp <- readLines(files[i])[-1]
  valuestemp <- as.numeric(vapply(strsplit(sampletemp, split="\t"), function(x) x[2], character(1)))

  # The number of columns is taken from the first file instead of from a
  # variable left over from earlier code:
  if(is.null(datamat))
    datamat <- matrix(nrow=nit, ncol=length(valuestemp))

  # A file with a different number of lines would otherwise either raise an
  # obscure error or be recycled silently into the row:
  if(length(valuestemp) != ncol(datamat))
    stop("File '", files[i], "' contains ", length(valuestemp),
         " values, but ", ncol(datamat), " were expected.")

  datamat[i,] <- valuestemp
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- datamat


# Treatment of missing values:

# The batch-wise steps below need one batch label per row of X:
stopifnot(length(batch) == nrow(X))

# The batches are taken from the data instead of assuming exactly 15 of them:
# with fewer batches the maximum below became NaN and the column filter
# produced NA columns, with more batches the additional ones were never
# imputed.
batchlevels <- levels(batch)

# Proportion of missing values per variable (rows) and batch (columns).
# drop=FALSE keeps batches consisting of a single array as one-row matrices.
missingpercentage <- matrix(NA_real_, nrow=ncol(X), ncol=length(batchlevels))
for(j in seq_along(batchlevels))
  missingpercentage[,j] <- colMeans(is.na(X[batch==batchlevels[j], , drop=FALSE]))
maxmiss <- apply(missingpercentage, 1, max)

# --> Exclude those variables where >=50% in at least one batch
#     are missing:

X <- X[, maxmiss < 0.5, drop=FALSE]


# Set missing values to the intra-batch median of the corresponding variable:

navars <- which(colSums(is.na(X)) > 0)

for(j in seq_along(batchlevels)) {
  batchrows <- which(batch==batchlevels[j])
  for(i in seq_along(navars)) {
    batchvalues <- X[batchrows, navars[i]]
    ismissing <- is.na(batchvalues)
    if(any(ismissing))
      X[batchrows[ismissing], navars[i]] <- median(batchvalues, na.rm=TRUE)
  }
}



# Save data set 'AgeDichotomTranscr'
##################################

# Stored objects: covariate matrix X, target variable y and batch variable.
save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/AgeDichotomTranscr.Rda")

# Remove the downloaded intermediate files of this data set:
downloadedfiles <- list.files(paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid), full.names=TRUE)
file.remove(downloadedfiles)

# Clear the workspace:
rm(list=ls());gc()
