#PREPROCESSING # Set directory setwd("DIR RAW DATA/DATA") # Get data library(GEOquery) # Get supplemental files getGEOSuppFiles("GSE63032") # Download normalised data for pheno data data1 <- getGEO("GSE63032");data1=data1[[1]]; # Extract pheno data pheno1=pData(data1) setwd("DIR RAW DATA/DATA/GSE63032") write.table(pheno1,"GSE63032_pheno.txt", sep = '\t') # -> Unpack data in windows explorer # Process data library(limma) library(affyPLM) library(RColorBrewer) setwd("DIR RAW DATA/DATA/GSE63032") # Read in data using remapped CDF affy.data <- ReadAffy(cdfname = 'mogene10stmmentrezg') # RMA normalization x.norm <- fitPLM(affy.data) # QC checks # set colour palette cols <- brewer.pal(8, "Set1") # plot a boxplot of unnormalised intensity values jpeg('boxplot1.jpg') boxplot(affy.data, col=cols);dev.off() # plot a boxplot of normalised intensity values, affyPLM is required to interrogate celfiles.gcrma jpeg('boxplot2.jpg') boxplot(x.norm, col=cols);dev.off() # the boxplots are somewhat skewed by the normalisation algorithm # and it is often more informative to look at density plots # Plot a density vs log intensity histogram for the unnormalised data jpeg('hist1.jpg') hist(affy.data, col=cols);dev.off() # Plot a density vs log intensity histogram for the normalised data jpeg('hist2.jpg') hist(pset2eset(x.norm), col=cols);dev.off() # Create image of arrays: # Weights jpeg('weights.jpg',width = 1200, height = 1200, units = "px", pointsize = 12,quality = 100) op = par(mfrow = c(4,3)) for (i in 1:6){image(x.norm,which=i)};dev.off() # Residuals jpeg('residuals.jpg',width = 1200, height = 1200, units = "px", pointsize = 12,quality = 100) op = par(mfrow = c(4,3)) for (i in 1:6){image(x.norm,type='resids',which=i)};dev.off() # Significance jpeg('significance.jpg',width = 1200, height = 1200, units = "px", pointsize = 12,quality = 100) op = par(mfrow = c(4,3)) for (i in 1:6){image(x.norm,type='sign.resids',which=i)};dev.off() # PCA plot jpeg('pca.jpg') op = par(mfrow= c(1,1)) data.matrix=exprs(pset2eset(x.norm)) color=c('blue','blue','blue', 'cyan','cyan','cyan', 'red','red','red', 'orange','orange','orange') data.PC = prcomp(t(data.matrix),scale.=TRUE) plot(data.PC$x,col=color);dev.off() # affyPLM also provides more informative boxplots # RLE (Relative Log Expression) plots should have # values close to zero. jpeg('RLE.jpg') RLE(x.norm, main="RLE");dev.off() # We can also use NUSE (Normalised Unscaled Standard Errors). # The median standard error should be 1 for most genes. jpeg('NUSE.jpg') NUSE(x.norm, main="NUSE");dev.off() # convert PLMset to eSet! x.norm <- pset2eset(x.norm) ###### MUSCLE DATA ###### # scale individual expression data scaled=data.frame(t(scale(data.frame(x.norm)[,1:22135]))) # Analyse data # create sample source data samples=c("GSE63032_C26_Muscle","GSE63032_C26_Muscle","GSE63032_C26_Muscle","GSE63032_Control_Muscle","GSE63032_Control_Muscle","GSE63032_Control_Muscle") # Set names for scaled values colnames(scaled) <- samples # convert into factors samples <- as.factor(samples) # set up the experimental design design <- model.matrix(~0 + samples) colnames(design) <- levels(samples) # fit the linear model to the filtered expression set fit <- lmFit(exprs(x.norm), design) # set up a contrast matrix to compare different groups contrast.matrix <- makeContrasts( C26_Muscle_Control_Muscle = GSE63032_C26_Muscle - GSE63032_Control_Muscle, levels=design) # Now the contrast matrix is combined with the per-probeset linear model fit. tumor_fits <- contrasts.fit(fit, contrast.matrix) tumor_ebFit <- eBayes(tumor_fits) source("IBMT.R") tumor_ibmtFit = IBMT(tumor_ebFit) # IBMT correctie tumor_dfFit = data.frame(tumor_ibmtFit) # Omzetten naar data frame fdr.fun = function(x) p.adjust(x, method = 'fdr') # FDR adj.p = apply(tumor_dfFit[9], 2, fdr.fun) # FDR apply on p-values colnames(adj.p) = paste('adj.', colnames(adj.p), sep = '') # rename adj.p.imbt = apply(tumor_dfFit[14], 2, fdr.fun) # FDR apply on IBMT p-values colnames(adj.p.imbt) = paste('adj.', colnames(adj.p.imbt), sep = '') tumor_dfFit_adj = cbind(tumor_dfFit, adj.p, adj.p.imbt) # bind columns # Annotate results library(mogene10stmmentrezg.db) anno.resultdf <- select(mogene10stmmentrezg.db, keys=rownames(tumor_dfFit_adj), columns=c("ENTREZID","SYMBOL","GENENAME","ALIAS"),keytype="PROBEID") anno.resultdf <- anno.resultdf[!duplicated(anno.resultdf[,1]),] # Get rid of duplicates; only keep 1st hit resultsdf <- cbind(tumor_dfFit_adj, anno.resultdf,scaled) setwd("DIR PROCESSED DATA/DATA/") write.table(resultsdf,"GSE63032_customcdf23.txt", sep = '\t')