### ### Plots of correlation matrices and population-LD matrices are produced based on results with additional files 1--4. ### Running this script requires an installation of snpStats availabe at Bioconductor. ### ### PACKAGES REQUIRED library(ggplot2) library(grid) library(gridExtra) library(RColorBrewer) library(rlist) library(tidyr) library(hscovar) library(snpStats) library(fpc) ### GLOBAL SETTINGS hm.palette1 <- colorRampPalette(c(rev(brewer.pal(9, "Blues")), brewer.pal(8, "YlOrRd"))) hm.palette2 <- colorRampPalette(brewer.pal(8, "YlOrRd")) ### FUNCTION DEFINITIONS make_figure <- function(R, X, fam.index = NULL, file.fig = '', plotpdf = F, snpstat = T){ if(plotpdf & (file.fig == '')) file.fig <- 'corrplot.pdf' add_margin <- function(...) grid::convertUnit(theme_get()[["plot.margin"]] + margin(...), "cm") ### 1: matrix of dependencies ### 1a: correlation matrix R valid <- colnames(R) ## 1b: population-LD matrix if(ncol(X) == ncol(R)){ coln <- valid } else coln <- 1:ncol(X) colnames(X) <- coln if(snpstat){ X1 <- new('SnpMatrix', X) ldmat <- ld(X1, stats = "R.squared", depth = ncol(X) - 1) popLD <- as.matrix(ldmat) } else{ # 2 lines per individual, one line each for paternal and maternal alleles popLD <- cor(X, use = 'everything', method = 'spearman') ^ 2 } # center genotypes within family and scale if(!is.null(fam.index) & length(unique(fam.index)) != length(fam.index)){ cat('genotypes are centred within family \n') M <- apply(X, 2, function(z){tapply(z, fam.index, mean)}) if(length(unique(fam.index)) == 1) { M <- matrix(M, nrow = 1, ncol = ncol(X)) rownames(M) <- fam.index[1] } Xc <- matrix(ncol = ncol(X), nrow = nrow(X)) for(s in unique(fam.index)){ id <- which(fam.index == s) Xc[id, ] <- sapply(1:ncol(X), function(i){(X[id, i] - M[s, i])}) } Xs <- apply(Xc, 2, function(z){if(sd(z) > 1e-6) z / sd(z) else z}) } else Xs <- apply(X, 2, function(z){if(sd(z) > 1e-6) scale(z) else z}) ### 2: representative SNPs ## 2a: TAGSNPS FROM R2-MATRIX cat('*** population-LD approach *** \n') a1 <- Sys.time() bin.r2 <- tagSNP(popLD, threshold = 0.8) a2 <- Sys.time() ls.r2 <- list.filter(bin.r2, length(snps) >= 3) %>% list.select(., tagsnp) %>% unlist %>% as.integer grp.r2 <- c() for(l in 1:length(bin.r2)) grp.r2[as.numeric(bin.r2[[l]]$snps)] <- l ch.r2 <- calinhara(t(Xs), grp.r2[as.numeric(coln)]) cat(length(ls.r2), 'SNPs with bin size >=3 \n') cat('CH index', ch.r2, '\n') cat(a2-a1, 'sec. for grouping \n') ## 2b: TAGSNPS FROM CORRELATION MATRIX cat('*** family approach *** \n') a1 <- Sys.time() bin.cor <- tagSNP(R, threshold = 0.8) a2 <- Sys.time() ls.cor <- list.filter(bin.cor, length(snps) >= 3) %>% list.select(., tagsnp) %>% unlist %>% as.integer grp.cor <- c() for(l in 1:length(bin.cor)) grp.cor[as.numeric(bin.cor[[l]]$snps)] <- l ch.cor <- calinhara(t(Xs[, colnames(X) %in% valid]), grp.cor[as.numeric(valid)]) cat(length(ls.cor), 'SNPs with bin size >=3 \n') cat('CH index', ch.cor, '\n') cat(a2-a1, 'sec. for grouping \n') p <- max(as.integer(c(valid, coln))) ### 3: ggplots ## 3a: PLOT OF POPULATION LD xval <- unlist(lapply(1:p, function(x){rep(x, p - x)})) yval <- unlist(lapply(1:(p - 1), function(x){(x + 1):p})) zval <- c() for(i in 1:length(xval)){ zval[i] <- popLD[match(xval[i], coln), match(yval[i], coln)] } data <- data.frame(xval, yval, zval) pts <- data.frame(x = ls.r2, y = ls.r2, z = 1) gg.r2 <- ggplot(data, aes(xval, yval, fill = zval)) + geom_tile() + xlab("locus 1") + ylab("locus 2") + theme(panel.background = element_blank(), panel.grid.major = element_line(colour = "grey", size = 0.1), panel.grid.minor = element_line(colour = "grey")) + theme(text = element_text(size = 12), plot.margin = add_margin(r = 0.5, unit = "cm")) + scale_fill_gradientn(colours = hm.palette2(50), name = expression(r^2), limits = c(0, 1+1e-10), na.value = 'white') + theme(legend.position = c(0.8, 0.3)) + geom_point(data = pts, aes(x = x, y = y, fill = z), size = 2, shape = 21) ## 3b: PLOT OF CORRELATION AMONG FAMILIES zval <- c() for(i in 1:length(xval)){ zval[i] <- R[match(xval[i], valid), match(yval[i], valid)] } data <- data.frame(xval, yval, zval) pts <- data.frame(x = ls.cor, y = ls.cor, z = 1) gg.cor <- ggplot(data, aes(xval, yval, fill = zval)) + geom_tile() + xlab("locus 1") + ylab("locus 2") + theme(panel.background = element_blank(), panel.grid.major = element_line(colour = "grey", size = 0.1), panel.grid.minor = element_line(colour = "grey")) + theme(text = element_text(size = 12), plot.margin = add_margin(r = 0.5, unit = "cm")) + scale_fill_gradientn(colours = hm.palette1(100), name = "correlation", limits = c(-(1+1e-10), 1+1e-10), na.value = "white") + theme(legend.position = c(0.8, 0.3)) + geom_point(data = pts, aes(x = x, y = y, fill = z), size = 2, shape = 21) if(plotpdf){ myplot1 <- arrangeGrob(gg.cor, top = textGrob("a", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18))) myplot2 <- arrangeGrob(gg.r2, top = textGrob("b", x = unit(0, "npc"), y = unit(1, "npc"), just=c("left","top"), gp=gpar(col="black", fontsize=18))) pdf(file.fig, width = 13, height = 6) grid.arrange(myplot1, myplot2, nrow = 1) dev.off() } return(list(corplot = gg.cor, r2plot = gg.r2)) } ### 1: real mouse data (394 SNPs on chromosome 17), 141 full-sib families ## run script 'additional_file_2_prepare_mouse_data.R' load('mouse/covmat.RData') K <- covmat.pat$K + covmat.mat$K %>% as.matrix R <- cov2cor(K) out <- make_figure(R, X, fam.index, file.fig = 'corrplot_mouse.pdf', plotpdf = T) ### 2: real half-sib data (medium-density window 300 SNPs), 5 families # run script 'additional_file_3_prepare_cattle_data.R' load('cattle/covmat.RData') out <- make_figure(covmat$R, X, fam.index, file.fig = 'corrplot_cattle.pdf', plotpdf = T) out <- make_figure(covmat$R, X[, covmat$valid.snps], fam.index) ### 3: real maize data (high density chromosome with 1276 SNPs), 13 half-sib family ## run script 'additional_file_4_prepare_maize_data.R' load('maize/covmat.RData') out <- make_figure(covmat$R, X, fam.index, file.fig = 'corrplot_maize.pdf', plotpdf = T, snpstat = F) out <- make_figure(covmat$R, X[, covmat$valid.snps], fam.index, snpstat = F) ### 4: simulated data of multiple half-sib families (high-density window 300 SNPs) # run script 'additional_file_5_simstudy_gwas.R' load('genomic_data/N_10QTL_5_2.RData') out <- make_figure(covmat$R, X, fam.index, file.fig = 'corrplot_simstudy.pdf', plotpdf = T)