rm(list=ls()); gc()

# Data set ID:

datasetid <- "E-GEOD-36458"


# Download processed Data:

library("ArrayExpress")
getAE(datasetid, path = paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), type = "processed")



# Meta data:

adf <- try(read.AnnotatedDataFrame(paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep="")))

if(class(adf)!="try-error") {
  datainfo <- adf@data
} else {

adf <- readLines(paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid,"/", datasetid, ".sdrf.txt", sep=""))
table(sapply(adf, function(x) length(strsplit(x, split="\t")[[1]])))

datainfo <- data.frame(matrix(nrow=0, ncol=length(strsplit(adf[1], split="\t")[[1]])))
names(datainfo) <- strsplit(adf[1], split="\t")[[1]]

for(i in 1:(length(adf)-1))
  datainfo[i,] <- strsplit(adf[i+1], split="\t")[[1]]
rownames(datainfo) <- datainfo$'Source Name'
  
}



# Look at the frequencies of cancer types in the different batches:

(tabcancer <- table(datainfo$'Characteristics [batch]', datainfo$'Characteristics [cancer type]'))

# --> Exclude batches, which are comprised only of a specific cancer type:
tabcancercand <- tabcancer[apply(tabcancer, 1, function(x) sum(x!=0))>1,]
tabcancercand

# --> Use cancer types 'gastric' and 'non-malignant'
# and batches 'COXES', 'GHATS' and 'SULKY':

datainfoall <- datainfo

datainfo <- datainfo[(datainfo$'Characteristics [batch]' %in% c("COXES", "GHATS", "SULKY")) &
  (datainfo$'Characteristics [cancer type]' %in% c("gastric", "non-malignant")),]

table(datainfo$'Characteristics [batch]', datainfo$'Characteristics [cancer type]')





# We have to check, whether the ordering of the samples is the same in
# the pheno data file as it is in the gene data, we read in:

setwd("~/")
homedir <- getwd()

files <- grep("_sample_table.txt", list.files(paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE), value=TRUE)

datainfonames <- gsub(" 1", "", rownames(datainfo))

files.names2 <- sapply(files, function(x) strsplit(x, split=paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", sep=""))[[1]][2])
filesnames <- sapply(files.names2, function(x) strsplit(x, split="_sample_table.txt")[[1]])
names(filesnames) <- NULL


files <- files[filesnames %in% datainfonames]


files.names2 <- sapply(files, function(x) strsplit(x, split=paste(homedir, "/FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, "/", sep=""))[[1]][2])
filesnames <- sapply(files.names2, function(x) strsplit(x, split="_sample_table.txt")[[1]])
names(filesnames) <- NULL


all(filesnames==datainfonames[length(datainfonames):1])

# --> The ordering in the pheno data file is reversed.

# -->

datainfo <- datainfo[nrow(datainfo):1,]



# Batch and target variable:

batch <- factor(as.numeric(factor(datainfo$'Characteristics [batch]')))
y <- factor(as.numeric(datainfo$'Characteristics [cancer type]'=="gastric")+1)



# Read in the arrays:

# We have to make a subset of the variables, since there are 500,000.
# --> Choose randomly 50,000.

# Find out indices of NA-values for each array:

head(readLines(files[1]))

nalist <- list()

nit <- length(files)
for(i in 1:length(files)) {
  sampletemp <- readLines(files[i])[-1]
  nalist[[i]] <- which(is.na(sapply(sampletemp, function(x) as.numeric(strsplit(x, split="\t")[[1]][2]))))
  cat(paste("Sample", i, "of", nit), "\n")
}

table(sapply(nalist, length))

# --> Are the same variables missing in the blocks of observations
# with same numbers of missing values?:

for(i in 1:length(table(sapply(nalist, length)))) {
  nalisttemp <- nalist[sapply(nalist, length)==names(table(sapply(nalist, length)))[i]]
  tempmat <- matrix(nrow=length(nalisttemp), ncol=length(nalisttemp[[1]]))
  for(j in 1:length(nalisttemp)) {
    tempmat[j,] <- nalisttemp[[j]]
  }
  cat(all(apply(tempmat, 2, function(x) length(unique(x))==1)), "\n")
}

# --> Yes there are blocks of observations with missing values in the
#     same variables.

# The latter blocks correspond to the batches:

table(sapply(nalist, length), batch)


# --> Indices without missing values:

# Get indices of variables without missing values:

nonas <- 1:length(sampletemp)
for(i in 1:length(files))
  nonas <- setdiff(nonas, nalist[[i]])



# Randomly choose 50000 of the variables without missing values:  
  
set.seed(1234)
ranind <- sort(sample(nonas, size=50000))


# Generate the covariate matrix:

datamat <- matrix(nrow=length(files), ncol=50000)

nit <- nrow(datamat)
for(i in seq(along=files)) {
  sampletemp <- readLines(files[i])[-1]
  datamat[i,] <- sapply(sampletemp, function(x) as.numeric(strsplit(x, split="\t")[[1]][2]))[ranind]
  cat(paste("Sample", i, "of", nit), "\n")
}


# Bring to the used format:

X <- as.matrix(datamat)





# Remove outliers:

# Principal component plot:

library("bapred")
pcplot(x=X, batch=batch, y=y)

# --> One outlier. --> Identify and remove:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:2]

outlierind <- which(xp[,1] > 1000)

X <- X[-outlierind,]
batch <- batch[-outlierind]
y <- y[-outlierind]

pcplot(x=X, batch=batch, y=y)

# --> Another less sever outlier is seen. 
# --> Identify and remove:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:2]

outlierind <- which(xp[,2] < -300)

X <- X[-outlierind,]
batch <- batch[-outlierind]
y <- y[-outlierind]

# --> OK.


# Look at further PCs:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

# --> Another outlier is found. --> Remove:

outlierind <- which(xp[,3] > 200)

X <- X[-outlierind,]
batch <- batch[-outlierind]
y <- y[-outlierind]


# Look at further PCs:

xpr <- prcomp(X, scale. = FALSE)
xp <- predict(xpr)[,1:5]
pairs(xp)

# --> No severe outlier anymore.




# Save data set 'ColonGastricEsophagealcSNPArray'
#################################################

save(X, y, batch, file="./FAbatchPaper/Datasets/ProcessedData/ColonGastricEsophagealcSNPArray.Rda")

do.call(file.remove, list(list.files(paste("./FAbatchPaper/Datasets/DownloadedIntermediateData/", datasetid, sep=""), full.names=TRUE)))

# Clear the workspace:
rm(list=ls());gc()
