### ### With this script, genotype and phenotype data of half-sib families are simulated and a genome-based association analysis is carried out. ### Note that an exact reproduction of the results of our simulation study is not possible ### because AlphaSimR relies on random number generation which cannot be initialized yet. ### ### PACKAGES REQUIRED library(AlphaSimR) library(snpStats) library(hscovar) library(magrittr) library(rlist) library(asreml) library(doParallel) library(fpc) ### GLOBAL SETTINGS alpha <- seq(0.01, 1, 0.01) thresh <- c(0.8, 0.7, 0.6, 0.5) nmothers.max <- 1000 p <- 300 h2 <- 0.3 repetitions <- 100 folder.fig <- 'figures_simstudy' dir.create(folder.fig, showWarnings = F) folder.dat <- 'genomic_data' dir.create(folder.dat, showWarnings = F) nclust <- 6 ### FUNCTION DEFINITIONS # at least one significant SNP in window around QTL = 1 TP (<= nqtl) fpr <- function(sign.snp, snp.pos, qtl.pos, win = 1e-2){ # window around true QTL in map distances (e.g. cM) len <- length(snp.pos) pos.a <- qtl.pos - win pos.e <- qtl.pos + win qtl.map <- data.frame(qtl.pos, pos.a, pos.e) # SNPs being next to causative variants qtlsnp <- found.qtl <- c() for (l in 1:length(qtl.pos)) { interval <- which((snp.pos >= qtl.map$pos.a[l]) & (snp.pos <= qtl.map$pos.e[l])) qtlsnp <- c(qtlsnp, interval) found.qtl[l] <- any(sign.snp %in% interval) } tp <- sum(found.qtl) fp <- sum(!(sign.snp %in% qtlsnp)) fn <- sum(!found.qtl) tn <- sum(setdiff(1:len, sign.snp) %in% setdiff(1:len, qtlsnp)) return(list(sens = tp / (tp + fn), spec = tn / (fp + tn), tp = tp, fp = fp, tn = tn, fn = fn, found.qtl = found.qtl)) } # recode loci according to major allele frequency recode.loci <- function(pop){ X <- pullSegSiteGeno(pop) freqs <- colSums(X) / (2 * nrow(X)) rec <- which(freqs < 0.5) return(rec) } recode.anything <- function(mat, loci, geno = F){ if(!geno){ # haplotypes for(l in loci){ eins <- mat[, l] == 1 mat[mat[, l] == 0, l] <- 1 mat[eins, l] <- 0 } } else{ # genotypes for(l in loci){ zwei <- mat[, l] == 2 mat[mat[, l] == 0, l] <- 2 mat[zwei, l] <- 0 } } return(mat) } ### MAIN PROGRAM cl <- makeCluster(nclust) registerDoParallel(cl) out <- foreach(nfathers = rep(c(10, 5, 1), each = 2), nqtl = rep(c(2, 5), times = 3), .packages = c('asreml', 'AlphaSimR', 'hscovar', 'snpStats', 'magrittr', 'rlist', 'fpc'), .combine = 'rbind') %dopar% { bins <- list() fail.ridge <- 0 sens.ridge.all <- sens.ridge.r2 <- sens.ridge.cor <- spec.ridge.all <- spec.ridge.r2 <- spec.ridge.cor <- matrix(nrow = repetitions, ncol = length(thresh), 0) nfam <- round(nmothers.max / nfathers) for(wdh in 1:repetitions){ if(wdh == 1) for(idx.thresh in 1:length(thresh)) bins[[idx.thresh]] <- matrix(nrow = repetitions, ncol = 7, NA) ### simulate founder population (only 1 trait at a time) ######################################## u <- 0 repeat{ founderPop <- runMacs2(nInd = 2 * nmothers.max, nChr = 1, segSites = p, Ne = 100, bp = 1e+08, genLen = 0.01, mutRate = 2.5e-08) assign("SP", SimParam$new(founderPop), envir = globalenv()) SP$setSexes("yes_sys") SP$addTraitA(nQtlPerChr = nqtl) trait <- SP$traits[[1]] # Change additive effects to ones trait@addEff <- rep(1, trait@nLoci) # Change intercept to zero trait@intercept <- 0 # Replace trait at position 1 SP$switchTrait(1, trait) if((SP$varA > 0) & (SP$varG > 0)){ SP$setVarE(h2 = h2) SP$rescaleTraits(mean = 0, var = 1, varEnv = 0, varGxE = 1e-06) pop <- newPop(founderPop) break } u <- u + 1 if (u == 10){ cat('ERROR no meaningful simulation in repetition', wdh, '\n') break } } rec <- recode.loci(pop) qtl.pos <- getQtlMap(trait = 1, simParam = SP)$pos * 100 snp.pos <- founderPop@genMap[[1]] * 100 qtl.snp <- SP$traits[[1]]@lociLoc qtl.eff <- SP$traits[[1]]@addEff if(!all(is.finite(qtl.eff))){ fail.ridge <- fail.ridge + 1 next } n <- nfam * nfathers ## determine recent population my_pop <- selectCross(pop = pop, nFemale = nmothers.max, nMale = nfathers, use = "pheno", nCrosses = n, trait = 1) ## family-wise centred genotype and phenotype matrix X <- pullSegSiteGeno(my_pop) X <- recode.anything(X, rec, geno = T) M <- apply(X, 2, function(y){tapply(y, my_pop@father, mean)}) if(nfathers == 1) { M <- matrix(M, nrow = nfathers, ncol = ncol(X)) rownames(M) <- my_pop@father[1] } y <- pheno(my_pop)[, 1] Xc <- matrix(ncol = ncol(X), nrow = nrow(X)) for(s in unique(my_pop@father)){ id <- which(my_pop@father == s) Xc[id, ] <- sapply(1:ncol(X), function(i){(X[id, i] - M[s, i])}) y[id] <- y[id] - mean(y[id]) } Xs <- apply(Xc, 2, function(x){if(sd(x) > 1e-6) x / sd(x) else x}) y <- y / sd(y) ## MEASURE OF LD in half-sibs -> covmat # maternal haplotypes of progeny H.mat <- pullSegSiteHaplo(my_pop, haplo = 1) H.mat <- recode.anything(H.mat, rec) # sire haplotypes sire <- pop@id[match(my_pop@father, pop@id)] rown <- paste(rep(unique(sire), each = 2), c(1, 2), sep = '_') H.sire <- pullSegSiteHaplo(pop)[rown, ] H.sire <- recode.anything(H.sire, rec)#[1:(2 * nfathers), ] # correlation matrix pos.chr <- list(founderPop@genMap[[1]]) # in Morgan linkDam <- LDdam(inMat = H.mat, pos.chr) covmat <- CovMat(linkDam, H.sire, nfam, pos.chr, corr = T) q <- ncol(covmat$R) fam.index <- my_pop@father %>% as.factor %>% as.numeric save(list = c('covmat', 'X', 'y', 'fam.index'), file = paste0('genomic_data/N_', nfathers, 'QTL_', nqtl, '_', wdh, '.RData'), compress = 'xz') ## MEASURE OF POPULATION LD -> LDmat colnames(X) <- 1:ncol(X) X1 <- new('SnpMatrix', X) ldmat <- ld(X1, stats = "R.squared", depth = ncol(X) - 1) popLD <- as.matrix(ldmat) for(idx.thresh in 1:length(thresh)){ ## TAGSNPS FROM CORRELATION MATRIX bin.cor <- tagSNP(covmat$R, threshold = thresh[idx.thresh]) ts.cor <- list.select(bin.cor, tagsnp) %>% unlist %>% as.numeric %>% sort len3.cor <- list.filter(bin.cor, length(snps) >= 3) %>% length grp.cor <- c() for(l in 1:length(bin.cor)) grp.cor[as.numeric(bin.cor[[l]]$snps)] <- l ch.cor <- calinhara(t(Xs[, covmat$valid.snps]), grp.cor[covmat$valid.snps]) ### TAGSNPS FROM R2-MATRIX bin.r2 <- tagSNP(popLD, threshold = thresh[idx.thresh]) ts.r2 <- list.select(bin.r2, tagsnp) %>% unlist %>% as.numeric %>% sort len3.r2 <- list.filter(bin.r2, length(snps) >= 3) %>% length grp.r2 <- c() for(l in 1:length(bin.r2)) grp.r2[as.numeric(bin.r2[[l]]$snps)] <- l ch.r2 <- calinhara(t(Xs), grp.r2) bins[[idx.thresh]][wdh, ] <- c(q, length(bin.cor), len3.cor, ch.cor, length(bin.r2), len3.r2, ch.r2) ## SNP-BLUP approach - all SNPs data <- data.frame(y, Xs) ridge.all <- asreml(fixed = y ~ 1, random = ~ grp(geno), group = list(geno = 2:ncol(data)), data = data, control = asreml.control(trace = F), na.method.X = "omit", na.method.Y = "omit", workspace = 1024e+06) ## SNP-BLUP approach - covmat data1 <- data.frame(y, Xs[, ts.cor]) ridge.cor <- asreml(fixed = y ~ 1, random = ~ grp(geno), group = list(geno = 2:ncol(data1)), data = data1, control = asreml.control(trace = F), na.method.X = "omit", na.method.Y = "omit", workspace = 1024e+06) ## SNP-BLUP approach - LDmat data2 <- data.frame(y, Xs[, ts.r2]) ridge.r2 <- asreml(fixed = y ~ 1, random = ~ grp(geno), group = list(geno = 2:ncol(data2)), data = data2, control = asreml.control(trace = F), na.method.X = "omit", na.method.Y = "omit", workspace = 1024e+06) ## testing - all SNPs ridge.all$Vu <- summary(ridge.all)$varcomp$component[1] ridge.all$u.var <- ridge.all$vcoef$random * ridge.all$sigma2 # var for u-uHat u.var <- ridge.all$Vu - ridge.all$u.var # var for uHat, Searle p. 272 id <- u.var > 1e-6 pval.ridge.all <- rep(1, ncol(Xs)) pval.ridge.all[id] <- 2 * pnorm(abs(ridge.all$coefficients$random[id]) / sqrt(u.var[id]), lower.tail = F) idx.ridge <- lapply(alpha, function(typeI){which(pval.ridge.all <= typeI)}) fpr.ridge <- lapply(idx.ridge, function(idx){fpr(idx, snp.pos, qtl.pos, win = 0.05)}) sens.ridge.all[, idx.thresh] <- sens.ridge.all[, idx.thresh] + list.select(fpr.ridge, sens) %>% unlist %>% as.numeric spec.ridge.all[, idx.thresh] <- spec.ridge.all[, idx.thresh] + list.select(fpr.ridge, spec) %>% unlist %>% as.numeric ## testing - covmat ridge.cor$Vu <- summary(ridge.cor)$varcomp$component[1] ridge.cor$u.var <- ridge.cor$vcoef$random * ridge.cor$sigma2 # var for u-uHat u.var <- ridge.cor$Vu - ridge.cor$u.var # var for uHat, Searle p. 272 id <- u.var > 1e-6 pval.ridge.cor <- rep(1, length(ts.cor)) pval.ridge.cor[id] <- 2 * pnorm(abs(ridge.cor$coefficients$random[id]) / sqrt(u.var[id]), lower.tail = F) idx.ridge <- lapply(alpha, function(typeI){which(pval.ridge.cor <= typeI)}) fpr.ridge <- lapply(idx.ridge, function(idx){fpr(idx, snp.pos[ts.cor], qtl.pos, win = 0.05)}) sens.ridge.cor[, idx.thresh] <- sens.ridge.cor[, idx.thresh] + list.select(fpr.ridge, sens) %>% unlist %>% as.numeric spec.ridge.cor[, idx.thresh] <- spec.ridge.cor[, idx.thresh] + list.select(fpr.ridge, spec) %>% unlist %>% as.numeric ## testing - LDmat ridge.r2$Vu <- summary(ridge.r2)$varcomp$component[1] ridge.r2$u.var <- ridge.r2$vcoef$random * ridge.r2$sigma2 # var for u-uHat u.var <- ridge.r2$Vu - ridge.r2$u.var # var for uHat, Searle p. 272 id <- u.var > 1e-6 pval.ridge.r2 <- rep(1, length(ts.r2)) pval.ridge.r2[id] <- 2 * pnorm(abs(ridge.r2$coefficients$random[id]) / sqrt(u.var[id]), lower.tail = F) idx.ridge <- lapply(alpha, function(typeI){which(pval.ridge.r2 <= typeI)}) fpr.ridge <- lapply(idx.ridge, function(idx){fpr(idx, snp.pos[ts.r2], qtl.pos, win = 0.05)}) sens.ridge.r2[, idx.thresh] <- sens.ridge.r2[, idx.thresh] + list.select(fpr.ridge, sens) %>% unlist %>% as.numeric spec.ridge.r2[, idx.thresh] <- spec.ridge.r2[, idx.thresh] + list.select(fpr.ridge, spec) %>% unlist %>% as.numeric } } ## sensitivity specificity sens.ridge.all <- sens.ridge.all / (repetitions - fail.ridge) sens.ridge.cor <- sens.ridge.cor / (repetitions - fail.ridge) sens.ridge.r2 <- sens.ridge.r2 / (repetitions - fail.ridge) spec.ridge.all <- spec.ridge.all / (repetitions - fail.ridge) spec.ridge.cor <- spec.ridge.cor / (repetitions - fail.ridge) spec.ridge.r2 <- spec.ridge.r2 / (repetitions - fail.ridge) save(file = paste0(folder.dat, '/Results_N', nfathers, '_', nqtl, 'QTL.Rdata'), list = c("sens.ridge.all", "sens.ridge.cor", "sens.ridge.r2", "spec.ridge.all"), "spec.ridge.cor", "spec.ridge.r2", "fail.ridge", "bins") b <- lapply(bins, function(z){apply(z, 2, function(v){mean(v, na.rm = T)})}) %>% list.rbind b <- cbind(nfathers, nqtl, thresh, b) cat('average number bins (all, cor, r2): \n') print(b) b } stopCluster(cl) colnames(out) <- c("nfathers", "nqtl", "threshold", "snps", "nbins.cor", "len3.cor", "ch.cor", "nbins.r2", "len3.r2", "ch.r2") # 'out' contains results of Table 1 and Supplemental file 7 save(list = 'out', file = 'output.Rdata') ### plot ROC curves nfathers <- 1 nqtl <- 2 pdf(paste0(folder.fig, '/ROC_N', nfathers, '_', nqtl, 'QTL.pdf'), width = 14, pointsize = 14) par(mfcol = c(1, 2), xpd = T) lab <- 0 for(idx.thresh in c(1, 4)){ lab <- lab + 1 load(paste0(folder.dat, '/Results_N', nfathers, '_', nqtl, 'QTL.Rdata')) plot(0, 0, xlab = '1 - specificity (FPR)', ylab = 'sensitivity (TPR)', xlim = c(0, 1), ylim = c(0,1), main = '', type = 'n') abline(0, 1, col = 8, xpd = F) points(1 - spec.ridge.all[, idx.thresh], sens.ridge.all[, idx.thresh]) points(1 - spec.ridge.cor[, idx.thresh], sens.ridge.cor[, idx.thresh], col = 'orange', pch = 20) points(1 - spec.ridge.r2[, idx.thresh], sens.ridge.r2[, idx.thresh], col = 'blue', pch = 20) legend('bottomright', legend = c('all SNPs', 'tagSNPs: corr', 'tagSNPs: popLD'), col = c('black', 'orange', 'blue'), pch = c(1, 20, 20)) legend('topleft', legend = letters[lab], inset = c(-0.25, -0.1), bty = 'n', cex = 1.5) } dev.off()