Skip to contents

In order to fulfill the demands of conducting heritability enrichment analysis across a large number of traits across various genomic annotations, we offer a function sHDL.reduct.dim within the sHDL package. This function speed up the analysis by pre-processing the GWAS summary statistics and annotations.

Here are the workflow of the process:

Step 01. Convert GWAS summary statistics to \(\mathbf{z}_r\)

#!/usr/bin/env Rscript
library(sHDL)

## function to read GWAS Z-scores and sample size N from GWAS summary statistics ##
readZ <- function(gwas.file, LD.path, all.snps){
  cat('Reading', gwas.file, '\n')
  gwas.df <- data.table::fread(gwas.file, header=TRUE)
  gwas.df <- sHDL:::format.gwas(gwas.df, LD.path)
  Z <- rep(0, length(all.snps))
  names(Z) <- all.snps
  Z[gwas.df$SNP] <- gwas.df$Z
  Z <- unname(Z)
  N <- median(gwas.df$N, na.rm=TRUE)
  return(c(N, Z))
}


## extract all snps in the LD reference panel ##
LD.path <- '/path/to/UKB_array_SVD_eigen90_extraction'
all.snps <- do.call(rbind, lapply(
  sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE, select=2
))
all.snps <- unname(unlist(all.snps))

## combine all GWAS Z-scores into a large matrix matZ ##
gwas.dir <- '/path/to/all/gwas/summary/statistics'
gwas.files <- list.files(gwas.dir, pattern='*.gz', full.names=TRUE, recursive=FALSE)
phes <- gsub('\\.gz', '', basename(gwas.files)) # extract phenotype name
phe2N <- rep(0, length(phes))
matZ <- matrix(0, ncol=length(phes), nrow=length(all.snps))
colnames(matZ) <- phes
rownames(matZ) <- all.snps

for(i in seq_along(gwas.files)){
  zz <- readZ(gwas.files[i], LD.path, all.snps)
  phe2N[i] <- zz[1]
  matZ[, i] <- zz[-1]
}

## Convert matZ to zr ##
lam.cut <- 1
nthreads <- 8
ref.data <- sHDL.reduct.dim(
  LD.path, z=Z, D=NULL, lam.cut=lam.cut, Dr.path=NULL,
  overwrite=FALSE, mode='memory', nthreads=nthreads
)
zr <- lapply(ref.data, function(x) x$zr)
rs <- unlist(lapply(zr, function(x) nrow(x)))

## convert zr to list by phes
zr <- do.call(rbind, zr)
zr <- lapply(1:ncol(zr), function(i, rs=NULL, phes=NULL){
    zz <- zr[, i]
    e <- cumsum(rs)
    s <- c(1, e[-length(e)]+1)
    zz <- sapply(1:length(s), function(j) zz[s[j]:e[j]])
    return(zz)
  }, rs=rs, phes=colnames(Z)
)
names(zr) <- colnames(Z)
save(zr, phe2N, file='zr-phe2N.rda')

Step 02. Convert genomic annotations to \(\mathbf{D}_r\)

#!/usr/bin/env Rscript

read.annot <- function(annot.name){
  ## write this function to return the D vector of given annot.name
}

library(sHDL)

overwrite <- FALSE
nthreads <- 4
lam.cut <- 1
out.dir <- 'Dr-lam-1'
LD.path <- '/path/to/UKB_imputed_SVD_eigen99_extraction'
annot.names <- c('all', 'your', 'annot', 'names')

for(annot.name in annot.names){
  Dr.path <- paste0(out.dir, '/', annot.name)
  D <- read.annot(annot.name)
  ref.data <- sHDL.reduct.dim(
    LD.path, z=NULL, D=D, lam.cut=lam.cut, Dr.path=Dr.path,
    overwrite=FALSE, mode='disk', nthreads=nthreads
  )
}

Step 03. Run sHDL in parallel

#!/usr/bin/env Rscript
library(sHDL)
library(parallel)

run.sHDL <- function(phe, ref.data, phe2N, zr, tsv.dir,
  overwrite=FALSE){
  tsv.file <- paste0(tsv.dir, '/', phe, '.tsv')
  log.file <- paste0(tsv.dir, '/', phe, '.log')
  if(file.exists(tsv.file) && !overwrite) return(NULL)
  # add zr to ref.data
  for(i in seq_along(ref.data)){
    ref.data[[i]]$zr <- zr[[phe]][[i]]
  }
  start.v <- c(1, 0.1, 1)
  res <- sHDL::sHDL.optim(ref.data, phe2N[phe], start.v=start.v,
    output.file=tsv.file, log.file=log.file, stepwise = TRUE, verbose=TRUE
  )
  return(res)
}

load.Drs <- function(Dr.path){
  Drs <- lapply(sHDL:::list.LD.ref.files(Dr.path), function(i){
    load(i)
    return(list(Dr=Dr, lam=lam, Md=Md, M=M))
  })
  return(Drs)
}

overwrite <- FALSE
nthreads <- 10
lam.cut <- 1
out.dir <- 'res-sHDL'
Dr.dir <- 'Dr-lam-1'

annot.names <- c('all', 'your', 'annot', 'names')
load('zr-phe2N.rda') # zr, phe2N
phes <- names(zr)


for(annot.name in annot.names){
  Dr.path <- paste0(Dr.dir, '/', annot.name)
  tsv.dir <- paste0(out.dir, '/', annot.name)
  ref.data <- load.Drs(Dr.path)
  if(!dir.exists(tsv.dir)) dir.create(tsv.dir, recursive=TRUE)

  cat('annotation:', annot.name, '\n')  
  tmp <- mclapply(phes, run.sHDL, zr=zr, phe2N=phe2N, ref.data=ref.data,
    tsv.dir=tsv.dir, overwrite=overwrite, mc.cores=nthreads)
}