# Start from an empty workspace and trigger a garbage collection:
rm(list = ls())
gc()

# Accession of the data set handled by this script:

datasetid <- "E-GEOD-36701"


# Download the processed data files into the data set's own sub-folder
# of the intermediate data directory:

library("ArrayExpress")
getAE(
  datasetid,
  path = paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid),
  type = "processed"
)



# Meta data:

# Path of the SDRF file that was downloaded for this data set:
sdrffile <- paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/",
                   datasetid, "/", datasetid, ".sdrf.txt")

# First attempt: let read.AnnotatedDataFrame() parse the file. The call is
# wrapped in try() so that a parsing error does not abort the script.
adf <- try(read.AnnotatedDataFrame(sdrffile))

if (!inherits(adf, "try-error")) {
  # Parsing worked --> use the data frame stored in the object:
  datainfo <- adf@data
} else {
  # Fallback: parse the tab-separated file by hand, line by line.
  adf <- readLines(sdrffile)

  # Diagnostic: number of tab-separated fields per line. print() is
  # required, because values are not auto-printed inside a braced block.
  print(table(sapply(adf, function(x) length(strsplit(x, split = "\t")[[1]]))))

  # The first line holds the column names:
  header <- strsplit(adf[1], split = "\t")[[1]]
  datainfo <- data.frame(matrix(nrow = 0, ncol = length(header)))
  names(datainfo) <- header

  # One row per remaining line. seq_along(adf[-1]) is empty if the file
  # contains nothing but the header (1:(length(adf)-1) would yield 1, 0).
  for (i in seq_along(adf[-1])) {
    datainfo[i, ] <- strsplit(adf[i + 1], split = "\t")[[1]]
  }
  rownames(datainfo) <- datainfo$'Source Name'
}




# Cross-tabulation of batch versus disease status:
table(datainfo$`Characteristics[batch]`, datainfo$`Characteristics[disease]`)

# --> Exclude the batches "2_A" and "2_B", since these are follow-ups of
#     patients contained in the other batches:

followupbatches <- c("2_A", "2_B")
isfollowup <- datainfo$`Characteristics[batch]` %in% followupbatches
datainfo <- datainfo[!isfollowup, ]


# For the majority of patients there are two arrays:

table(datainfo$`Characteristics[subject identifier]`)

# Look at one example subject:
isexample <- datainfo$`Characteristics[subject identifier]` == "SD52930"
datainfo2 <- datainfo[isexample, ]

# Show only those columns whose values are not all identical for this subject:
differs <- apply(datainfo2, 2, function(column) length(unique(column)) != 1)
datainfo2[, differs]

# --> Only the names differ; therefore one can assume that
#     the patients were simply measured twice.


# Look at the samples which were measured only once:

# Number of arrays per subject (computed once and reused):
subjectcounts <- table(datainfo$'Characteristics[subject identifier]')

onetimenames <- names(subjectcounts)[subjectcounts == 1]

datainfo1 <- datainfo[datainfo$'Characteristics[subject identifier]' %in% onetimenames, ]

# Show the columns that vary among these samples. The column selection has
# to be computed on datainfo1 itself (and not on the single-subject example
# datainfo2), otherwise the wrong columns are displayed.
datainfo1[, apply(datainfo1, 2, function(x) length(unique(x)) != 1), drop = FALSE]

# --> No peculiarities could be made out. --> Use the data of only one array
#     per patient:

# Start with all arrays belonging to subjects that were measured only once:
indchoose <- which(datainfo$`Characteristics[subject identifier]` %in% onetimenames)

# Identifiers of the subjects with exactly two arrays:
subjecttab <- table(datainfo$`Characteristics[subject identifier]`)
twotimenames <- names(subjecttab)[subjecttab == 2]

# For each of these subjects, the row indices of its arrays in datainfo:
subjectinds <- lapply(twotimenames, function(id) {
  which(datainfo$`Characteristics[subject identifier]` %in% id)
})

# For consistency, always the array with the smaller number is chosen.
# --> Extract the number from each row name: cut at " 1", then take what
#     follows "GSM".
arraynumber <- vapply(rownames(datainfo), function(rowname) {
  pieces <- strsplit(rowname, split = " 1")[[1]]
  as.numeric(strsplit(pieces, split = "GSM")[[1]][2])
}, numeric(1))

# Per subject, the row index of the array with the smallest number:
indslower <- unlist(lapply(subjectinds, function(inds) {
  inds[which.min(arraynumber[inds])]
}))

# Combine both sets of indices in ascending order:
indchoose <- sort(c(indchoose, indslower))

# Subset datainfo to the chosen arrays:

datainfo <- datainfo[indchoose, ]


# --> Batch versus disease status after the selection:

table(datainfo$`Characteristics[batch]`, datainfo$`Characteristics[disease]`)




# We have to make sure that the samples in the pheno data (datainfo) are in
# the same order as the gene data files that are read in below.

# NOTE: this switches the working directory to the home directory for the
# remainder of the script; all relative paths used later are resolved
# against it.
setwd("~/")
homedir <- getwd()

# All sample table files in the download folder of this data set:
datadir <- paste0(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid)
files <- grep("_sample_table.txt", list.files(datadir, full.names = TRUE),
              value = TRUE, fixed = TRUE)

# Sample names according to the pheno data (row names without the " 1"):
datainfonames <- gsub(" 1", "", rownames(datainfo))

# Sample names according to the files: file name without directory and
# without the "_sample_table.txt" suffix. basename() and a fixed-string
# sub() are used so that no part of the path is interpreted as a regular
# expression.
files.names2 <- basename(files)
filesnames <- sub("_sample_table.txt", "", files.names2, fixed = TRUE)

# Keep only the files of samples that are still contained in datainfo:
infileinfo <- filesnames %in% datainfonames
files <- files[infileinfo]
files.names2 <- files.names2[infileinfo]
filesnames <- filesnames[infileinfo]

# Diagnostic: is the ordering in the pheno data exactly the reverse of the
# file ordering?
identical(filesnames, rev(datainfonames))

# --> Bring datainfo into the order of the files. Matching by sample name
#     gives the reversed order whenever the check above is TRUE, and still
#     yields a correct alignment should the ordering ever differ.

sampleorder <- match(filesnames, datainfonames)
stopifnot(!anyNA(sampleorder), anyDuplicated(sampleorder) == 0)

datainfo <- datainfo[sampleorder, ]





# Read in the arrays:

# Find out the indices of NA values for each array. The value of interest is
# the second tab-separated field of every line; the first line of each file
# is skipped.

nalist <- vector("list", length(files))

nit <- length(files)
# seq_along() yields no iterations for an empty file list (1:length(files)
# would iterate over 1 and 0).
for (i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  secondfield <- vapply(strsplit(sampletemp, split = "\t"),
                        function(fields) fields[2], character(1))
  nalist[[i]] <- which(is.na(as.numeric(secondfield)))
  cat(paste("Sample", i, "of", nit), "\n")
}

# Proportion of arrays without any NA value:
mean(lengths(nalist) == 0)

# --> No missing values.


# --> Indices without missing values (all lines of the first array):

sampletemp <- readLines(files[1])[-1]
nonas <- seq_along(sampletemp)


# Generate the covariate matrix: one row per array (file), one column per
# line of the sample tables (the first line of each file is skipped).

nfeatures <- length(sampletemp)
datamat <- matrix(nrow = length(files), ncol = nfeatures)

nit <- nrow(datamat)
for (i in seq_along(files)) {
  sampletemp <- readLines(files[i])[-1]
  # The measurement is the second tab-separated field of each line:
  secondfield <- vapply(strsplit(sampletemp, split = "\t"),
                        function(fields) fields[2], character(1))
  datamat[i, ] <- as.numeric(secondfield)
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)




# Remove outliers:

# Principal component plot, coloured by batch:

library("bapred")
pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

# --> There are obviously outliers.

# --> Identify and remove outliers on the basis of the scores of the first
#     five principal components:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

# Samples with a score above 10000 on the second component:
outlierind <- which(xp[,2] > 10000)

# Only subset if something was found: indexing with -integer(0) would
# select NO rows at all and thereby empty the whole data set.
if (length(outlierind) > 0) {
  X <- X[-outlierind, , drop = FALSE]
  datainfo <- datainfo[-outlierind, , drop = FALSE]
}


# Further outliers:

pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]

# Samples with a score above 10000 on the first component:
outlierind <- which(xp[,1] > 10000)

# Same guard as above against an empty index vector:
if (length(outlierind) > 0) {
  X <- X[-outlierind, , drop = FALSE]
  datainfo <- datainfo[-outlierind, , drop = FALSE]
}


# No further outliers:

pcplot(x=X, batch=factor(as.numeric(factor(datainfo$'Characteristics[batch]'))))

# --> OK.



# Batch and target variable:

# Batches recoded as consecutive integers and stored as a factor:
batch <- factor(as.numeric(factor(datainfo$`Characteristics[batch]`)))

# Target: 1 for samples whose disease entry is "HV", 2 for all others:
nothv <- datainfo$`Characteristics[disease]` != "HV"
y <- factor(as.numeric(nothv) + 1)



# Save data set 'IBSTranscr'
############################

save(X, y, batch, file = "./FAbatchPaper/Datasets/ProcessedData/IBSTranscr.Rda")

# Delete the downloaded intermediate files of this data set:
downloaded <- list.files(
  paste0("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid),
  full.names = TRUE
)
file.remove(downloaded)

# Clear the workspace:
rm(list = ls())
gc()
