# This script shows how data analyses were done in the paper by Khomich et al. (2021) # "Analysing microbiome intervention design studies: Comparison of alternative multivariate statistical methods" #Script to simulate microbiome data using experimental design from Moen et al. (2016) # #Simulation according to four scenarios, given by a design on # (1) the number of differentially abundant (DA) OTUs (few or many), and (2) the effect size (low or high fold change). # 100 data sets were generated for each scenario. # #For each data set the DA OTUs were assigned randomly (to filtered data). # Data were simulated as counts using the package metaSPARSim. # Counts, relative abundance and CLR-transformed data were saved for each simulation in separate sub-folders. # Simulated data were saved in separate sub-folders under the "datadir" directory, # and one sub-folder for each of the data types. # Each data set is named sim1.txt, sim2.txt etc. # # ### Workflow of the script ### # Load libraries # Set parameters: Location of the data, path where to save the data ... # Define functions for computing relative abundance, CLR and do filtering # Estimate parameters based on the Moen data. RData file with the data should be loaded (S3 File) # Make the simulation design. # Simulate Nsim data sets for each scenario. # The S3 File "Moen data with preprocessing and filtering.RData" is required for estimation of simulation parameters. ### INSTALLATION METASPARSIM ### #if metaSparSim is not installed, than run: #library(devtools) #install_gitlab("sysbiobig/metaSPARSim", build_opts = c("--no-resave-data","--no-manual"), build_vignettes = TRUE) library(plyr) library(reshape2) library(magrittr) library(tidyr) library(metaSPARSim) library(ggforce) rm(list = ls()) #-------------------------------------------------------------------------------------------------------------------# ### Path definitions ### #change here to run on your own computer #-------------------------------------------------------------------------------------------------------------------# #Working directory" setwd("C:/Users/name.surname/full_path_to_the_folder") #Main folder for saving simulated data setwd("C:/Users/name.surname/full_path_to_the_folder") path <- file.path(dirname(getwd()),"Simulation4Scenarios") dir.create(path) set.seed(8765) #To ensure reproducibility #Here Nsim is set to 2 for fast testing. Set Nsim to the desired number of simulations Nsim <- 2 #number of simulations ## Define functions #### #Make functions for operations done several times in the script. #Function to compute relative abundance (for simulated data all OTUs) ComputeRelAb <- function(SS){ #SS input data, output from MetaSPARSim simdata_norm <- sweep(SS$counts,2, colSums(SS$counts), FUN="/") #normalisation by dividing with colsums SimRelAb <- t(simdata_norm) #get samples in rows colnames(SimRelAb) <- colnames(ObsData) #same variable names as ObsData return(SimRelAb)} #Function for filtering and adding the "filtered_out" column FilterData <- function(RR,filter){ ##RR input relative abundances RRF <- as.data.frame(RR[,filter]) RRF$filtered_out <- 1 - rowSums(RRF) #to account for data filtered_out colnames(RRF[,-which(colnames(RRF) == "filtered_out")]) <- paste0("V",1:sum(filter)) return(RRF) } #function for CLR-transformation, also removes "filtered_out" column CLR.transform <- function(RR){ #RR input - relative abundance after filtering, includes column "filtered_out" C0 <- RR + offsetRelAb #use same offset as for observed data CLR <-as.data.frame(apply(C0, 1, function(x){log(x) - mean(log(x))})) CLR <- as.matrix(t(CLR)) # to get back to matrix samples x OTUs CLR <- CLR[,-which(colnames(CLR)=="filtered_out")] colnames(CLR) <- paste0("V",1:ncol(CLR)) return(CLR) } #---------------------------------------------------------------------------------------------------------------# ### LOAD PREPROCESSED DATA FROM Moen data CASE ### #---------------------------------------------------------------------------------------------------------------# # OTU data # do not include anything on taxonomy here as it is not relevant for the simulations # include OTUid to make sure different data/results merged correctly #prev saved RData compise # data, data_norm, design, filter, indpop, ObsCLRm ObsData, ObsRelab, ObsRElabF, offsetRelAb load(file = "Moen data with preprocessing and filtering.RData") otu_data <- data.frame(OTUid = colnames(ObsData), filter = filter, meanCount = apply(ObsData,2,mean)) otu_data$meanCLR <- NA #initialisation otu_data$meanCLR[filter] <- apply(ObsCLR,2,mean) #average value across all samples otu_data$meanRelAb <- apply(ObsRelAb,2,mean) otu_data$LogMeanRelAb <- log10(otu_data$meanRelAb + offsetRelAb) #---------------------------------------------------------------------------------------------------------------# ### Create null parameters (for no differences) ### #---------------------------------------------------------------------------------------------------------------# #estimate parameters from Moen data params<-estimate_parameter_from_data(data, data_norm, indpop) names(params)<-names(indpop) design$Group <- factor(design$Group, levels = sort(unique(design$Group)) ) #make all groups similar med.int <- apply(ldply(params, function(pp) pp$intensity, .id=NULL),2,median, na.rm=T) med.var <- apply(ldply(params, function(pp) pp$variability, .id=NULL),2,median, na.rm=T) ## Simulated data are zero for all samples on OTU nr 67 (after filtering) # solution: the med.int value for OTU 67 is 0 - replace with the min of positive values for those passing the filter #Replace the zero value with the min value (for those passing the filter) med.int[filter & med.int==0] <- min(med.int[med.int>0 & filter]) params0 <- lapply(params, function(pp) { pp$intensity <- med.int pp$variability <- med.var return(pp)}) #Simulate for null differences # set.seed(2138) #for repeat ability # simdata0 <- metaSPARSim(params0) # # #Relative abundance, filtering and CLR-transformation for the simulated data # SimRelAb0 <- ComputeRelAb(simdata0) # SimRelAbF0 <- FilterData(SimRelAb0,filter) # CLR.nulldiff <- CLR.transform(SimRelAbF0) #---------------------------------------------------------------------------------------------------------------# ### Make Design for simulation study ### #---------------------------------------------------------------------------------------------------------------# # 2 by 2 design # Factor A: percentage of OTUs significantly different, # Factor B: Foldchange (= effect size) #aim for design # low level both factors - on the border of detectable # high level both factors - non detection is failure #DA OTUs generated randomly among the 507 OTUs per simulation design.factors <- data.frame(pDA = c(2,50), FC = c("low","high"),lb = c(2^3,2^8), ub = c(2^4,2^9)) SimDesign <- expand.grid(pDA=design.factors[,"pDA"],FC = design.factors[,"FC"]) SimDesign$Simulation <- paste0("S",1:nrow(SimDesign)) SimDesign$Label <- paste(SimDesign$Simulation,SimDesign$pDA, SimDesign$FC,sep="_") SimDesign <- merge(SimDesign,design.factors[,-1]) SimDesign$FC2.low <- log2(SimDesign$lb) SimDesign$FC2.high <- log2(SimDesign$ub) SimDesign <- SimDesign[order(SimDesign$Simulation), c("Simulation","Label","pDA","FC","lb","ub","FC2.low","FC2.high")] SimDesign$FC<- factor(SimDesign$FC, levels = c("low","high")) #Initialisation for simulations params.all <- as.list(1:nrow(SimDesign)) #initialisation Notu <- ncol(ObsCLR) FoldChanges <- array(NA, dim = c(nrow(SimDesign),Notu), dimnames = list(Simulation = paste0("S",1:nrow(SimDesign)), OTU = paste0("V",1:Notu))) #---------------------------------------------------------------------------------------------------------------# ### Simulate data according to the simulation design ### #---------------------------------------------------------------------------------------------------------------# save(SimDesign, file = file.path(path,"SimulationDesign.RData")) #Initialisation pca.all <- as.list(1:nrow(SimDesign)) names(pca.all)<- SimDesign$Label ff.list <- pca.all for (i in 1:nrow(SimDesign)) { set.seed(1234) #repeat ability #sub-folders for each scenario and data type (counts, relab, CLR) resdir.i <- file.path(path,SimDesign$Label[i]) counts.i <- file.path(resdir.i,"counts") clr.i <- file.path(resdir.i,"CLR") RelAb.i <- file.path(resdir.i,"RelAb") res.i <- file.path(resdir.i,"results") tmp <- dir.create(resdir.i) dir.create(counts.i) dir.create(clr.i) dir.create(RelAb.i) dir.create(res.i) #parameter settings simulation i ps <- params0 pDA <- SimDesign[i,"pDA"] lb <- SimDesign[i,"lb"] #lower bound on log fold change this scenario ub <- SimDesign[i,"ub"] #upper bound on log fold change nDA <- round(pDA*Notu/100) #number Differential Abundant #Intialisation - #Store foldchanges - store true intensities - indx DA - per simulation (foldchanges != 0 tell which are DA) FCs <- matrix(0,nrow=Notu, ncol=Nsim) colnames(FCs) <- paste0("s",1:Nsim) FoldChanges <- cbind(data.frame(OTU = paste0("V",1:Notu)),FCs) TrueInt <- array(0,dim = c(Notu,Nsim,2),dimnames = list(OTU = rownames(FoldChanges), simulation = paste0("s",1:Nsim), group = c("IN","Other"))) idxDA.all <- matrix(0,nrow=nDA,ncol=Nsim) for (ss in 1:Nsim) { cat("Setting",i,"of",nrow(SimDesign),"Simulation",i,"of",Nsim) idxDA.all[,ss] <- idxDA <- sample(Notu,nDA) # Foldchange for DA OTUs DA_multiplier <- rep(0,Notu) DA_multiplier[idxDA] <- runif(n = nDA, min = log2(lb), max = log2(ub) ) log_foldchange_multiplier <- rep(0,ncol(ObsData)) #initialise with no change log_foldchange_multiplier[filter==T] <- DA_multiplier #manipulate on those passing the filter fold_change_multiplier <- 2^log_foldchange_multiplier FoldChanges[,ss+1] <- fold_change_multiplier[filter] #pure feed effect - no interaction with this setup #can add interaction by making two different fold_change_multipliers #pr interaction fold_change multipler so ps <- params0 ps$'0.05_IN'$intensity <- params0$'0.05_IN'$intensity*fold_change_multiplier ps$'0.15_IN'$intensity <- params0$'0.15_IN'$intensity*fold_change_multiplier #Parameters for simulation #it is only for IN that it is changed, and both are simulated in the same way, save for one of the IN groups #but keep one for simplicity TrueInt[,ss,"IN"] <- ps$`0.05_IN`$intensity[filter] TrueInt[,ss,"Other"] <- ps$`0.05_BSG`$intensity[filter] #Do simulation and preprocess simdata.ss <- metaSPARSim(ps) #indicate simulation ss relab.ss <- ComputeRelAb(simdata.ss) relab.ss.f <- FilterData(relab.ss,filter) CLR.ss <- CLR.transform(relab.ss.f) #Save simulated data to a file - different folders for counts (all), relative abundance (all) and CLR (filtered) write.table(simdata.ss$counts, file=file.path(counts.i,paste0("sim",ss,".txt")),sep="\t") write.table(CLR.ss, file=file.path(clr.i,paste0("sim",ss,".txt")),sep="\t") write.table(relab.ss, file=file.path(RelAb.i,paste0("sim",ss,".txt")),sep="\t") } #Save truth to the main folder for scenario i save(FoldChanges, TrueInt, file=file.path(resdir.i,"FoldChanges.RData")) write.table(TrueInt[,,"IN"], file=file.path(resdir.i,"TrueIntensities.txt"),sep="\t") write.table(FoldChanges, file=file.path(resdir.i,"FoldChanges.txt"),sep="\t") } ### The end of the script ###