#PREPROCESSING # Set directory setwd("DIR RAW DATA/DATA") # Get data library(GEOquery) # Get supplemental files getGEOSuppFiles("GSE24112") # Download normalised data for pheno data data1 <- getGEO("GSE24112");data1=data1[[1]]; # Extract pheno data pheno1=pData(data1) setwd("DIR RAW DATA/DATA/GSE24112") write.table(pheno1,"GSE24112_pheno.txt", sep = '\t') ################ setwd("DIR RAW DATA/DATA/GSE24112") library(illuminaio) library(limma) # Read Annotation # Illumina provide probe annotation for expression arrays as either tab separated text files or as files with the extension .bgx. BGX files are infact just gzipped versions of the text files. This function reads such files and returns a list with two entries, the first containing the target-probe information and the second containing details of the control probes. bgxfile = dir(pattern="bgx") bgx <- readBGX(bgxfile) # Read Expression data # Be sure: had to manually add ".AVG_Signal" after each sample name..... x <- read.ilmn(files="GSE24112_non-normalized_GH.txt", probeid="ID_REF", other.columns=c("Detection Pval")) # Add annotation data m <- match(rownames(x), bgx$probes$Probe_Id) x$genes <- bgx$probes[m,] x.norm <- neqc(x, detection.p="Detection Pval") ############### #### Determine mean of each gene #### x.normE=x.norm$E SYMBOL=x.norm$genes$Symbol y.norm=aggregate(x.normE,by=list(SYMBOL),FUN="mean") SYMBOL=y.norm$Group.1 y.norm=y.norm[,2:12] # PCA plot jpeg('pca.jpg') color=c('Yellow','Yellow','Yellow','Yellow','Orange','Orange','Orange','Red','Red','Red','Red') data.PC = prcomp(t(y.norm),scale.=TRUE) plot(data.PC$x,col=color);dev.off() # scale individual expression data scaled=data.frame(scale(y.norm, center=TRUE, scale=TRUE)) # Analyse data # create sample source data samples=c("Control","Control","Control","Control","Early","Early","Early","Late","Late","Late","Late") # Set names for scaled values colnames(scaled) <- samples # convert into factors samples <- as.factor(samples) # set up the experimental design design <- model.matrix(~0 + samples) colnames(design) <- levels(samples) # fit the linear model to the filtered expression set fit <- lmFit(y.norm, design) # set up a contrast matrix to compare different groups contrast.matrix <- makeContrasts( Tumor = Late - Control, levels=design) # Now the contrast matrix is combined with the per-probeset linear model fit. tumor_fits <- contrasts.fit(fit, contrast.matrix) tumor_ebFit <- eBayes(tumor_fits) source("IBMT.R") tumor_ibmtFit = IBMT(tumor_ebFit) # IBMT correction tumor_dfFit = data.frame(tumor_ibmtFit) # data frame fdr.fun = function(x) p.adjust(x, method = 'fdr') # FDR Function adj.p = apply(tumor_dfFit[c("p.value")], 2, fdr.fun) # FDR apply on p-values colnames(adj.p) = paste('adj.', colnames(adj.p), sep = '') # rename adj.p.imbt = apply(tumor_dfFit[c("IBMT.p")], 2, fdr.fun) # FDR apply on IBMT p-values colnames(adj.p.imbt) = paste('adj.', colnames(adj.p.imbt), sep = '') tumor_dfFit_adj = cbind(tumor_dfFit, adj.p, adj.p.imbt) # bind columns resultsdf <- cbind(tumor_dfFit_adj,SYMBOL, scaled) setwd("DIR PROCESSED DATA/DATA/") write.table(resultsdf,"GSE24112_genemean.txt", sep = '\t')