Run sHDL across a large number of traits

In order to fulfill the demands of conducting heritability enrichment analysis across a large number of traits across various genomic annotations, we offer a function sHDL.reduct.dim within the sHDL package. This function speed up the analysis by pre-processing the GWAS summary statistics and annotations.

Here are the workflow of the process:

Step 01. Convert GWAS summary statistics to \(\mathbf{z}_r\)

#!/usr/bin/env Rscript
library(sHDL)

## function to read GWAS Z-scores and sample size N from GWAS summary statistics ##
read.gwas <- function(matZ, phe, gwas.dir=NULL, Ns=NULL, suffix='.sumstats.gz'){
  gwas.file <- paste0(gwas.dir, '/', phe, suffix)
  cat('Reading ', gwas.file, '\n')
  dfi <- data.table::fread(gwas.file, sep='\t', header=TRUE)
  snps <- dfi$SNP
  N <- median(dfi$N, na.rm=TRUE)
  Z <- matrix(dfi$Z, ncol=1)
  ## add N to first row
  Z <- rbind(N, Z)
  row.names(Z) <- c('N', snps)

  colnames(Z) <- phe
  if(is.null(matZ)){
    matZ <- Z
  } else {
    matZ <- cbind(matZ, Z)
  }
  return(matZ)
}

convertZr.batch <- function(
  phes, gwas.dir, zr.dir, LD.path, step=500, gwas.suffix='.sumstats.gz',
  lam.cut=0.1, mc.cores=8, pattern='.*chr(\\d{1,2})\\.(\\d{1,2})[_\\.].*', overwrite=FALSE){

  if(length(phes) == 0) return(NULL)
  for(s in seq(1, length(phes), by=step)){
    e <- min(s+step-1, length(phes))
    Z <- reduce(phes[s:e], .f=read.gwas, gwas.dir=gwas.dir, suffix=gwas.suffix, .init=NULL)
    N <- Z[1, ]
    Z <- Z[-1, ]
    N <- as.numeric(unlist(N))
    names(N) <- colnames(Z)
    ref.data <- sHDL::sHDL.reduct.dim(LD.path, z=Z, D=NULL, lam.cut=lam.cut, Dr.path=NULL,
      overwrite=FALSE, mode='disk', mc.cores=mc.cores, pattern=pattern)
    zr <- lapply(ref.data, function(x) x$zr)
    rs <- unlist(lapply(zr, function(x) nrow(x)))

    ## convert zr to list by phes
    zr <- do.call(rbind, zr)
    zr <- lapply(1:ncol(zr), function(i, rs=NULL, phes=NULL){
      zz <- zr[, i]
      e <- cumsum(rs)
      s <- c(1, e[-length(e)]+1)
      zz <- sapply(1:length(s), function(j) zz[s[j]:e[j]])
      return(zz)
    }, rs=rs, phes=colnames(Z))
    names(zr) <- colnames(Z)
    for(phe in names(zr)){
      cur.zr <- list()
      cur.zr$zr <- zr[[phe]]
      cur.zr$N <- unname(unlist(N[phe]))
      saveRDS(cur.zr, file=paste0(zr.dir, '/', phe, '.rds'))
    }
  }
}

overwrite <- FALSE
LD.path <- '/path/to/UKB_array_SVD_eigen90_extraction'
lam.cut <- 1
mc.cores <- 8
step <- 500
gwas.dir <- '/path/to/gwas/sumstats'
gwas.suffix <- '.sumstats.gz'
zr.dir <- '/path/to/store/zr'
pattern <- '.*chr(\\d{1,2})\\.(\\d{1,2})[_\\.].*'

phes <- gsub(gwas.suffix, '', list.files(gwas.dir, pattern=gwas.suffix))
if(!dir.exists(zr.dir)) dir.create(zr.dir, recursive=TRUE)
exist.phes <- gsub('\\.rds', '', list.files(zr.dir, pattern='*.rds'))
if(!overwrite) phes <- setdiff(phes, exist.phes)
cat('Converting', length(phes), 'traits...\n')

#### convert z to zr ####
convertZr.batch(phes, gwas.dir, zr.dir, LD.path, step=step, gwas.suffix=gwas.suffix,
                lam.cut=lam.cut, mc.cores=mc.cores, pattern=pattern, overwrite=overwrite)

Step 02. Convert genomic annotations to \(\mathbf{D}_r\)

#!/usr/bin/env Rscript

read.annot <- function(ann.name, annot.dir){
  ## write this function to return the D matrix of given ann.name
  return(D)
}

create.D <- function(
  annot.names, LD.path, annot.dir,
  pattern='.*chr(\\d{1,2})\\.(\\d{1,2})[_\\.].*'){

  D <- NULL
  for(ann.name in annot.names){
    dD <- read.annot(ann.name, annot.dir)
    dD <- sHDL:::normD(dD, LD.path, norm.method = 'none', pattern = pattern)
    D <- cbind(D, dD)
  }
  return(D)
}

annot.dir <- '/path/to/directory/of/annotations'
lam.cut <- 1
Dr.path <- './sHDL_Dr_lam_1'
overwrite <- FALSE
mc.cores <- 8
pattern <- '.*chr(\\d{1,2})\\.(\\d{1,2})[_\\.].*'
norm.method <- 'minmax'
annot.names <- c('all', 'your', 'annot', 'names')

## read D matrix
D <- create.D(annot.names, LD.path, annot.dir, pattern)

## convert D to Dr
ref.data <- sHDL::sHDL.reduct.dim(
  LD.path, z=NULL, D=D, lam.cut=lam.cut, Dr.path=Dr.path, overwrite=overwrite,
  mode='disk', mc.cores=mc.cores, pattern=pattern, norm.method=norm.method
)

Step 03. Run sHDL in parallel

run.sHDL <- function(phe, zr.dir, res.dir, ref.data, overwrite=FALSE){
  tsv.file <- paste0(res.dir, '/', phe, '.tsv')
  log.file <- paste0(res.dir, '/', phe, '.log')
  if(file.exists(tsv.file) && !overwrite) return(NULL)
  if(file.exists(log.file)) file.remove(log.file)

  zr.N <- readRDS(paste0(zr.dir, '/', phe, '.rds'))
  N <- zr.N$N
  for(i in seq_len(length(ref.data))){
    rank <- length(ref.data[[i]]$lam)
    ref.data[[i]]$zr <- zr.N$zr[[i]][seq_len(rank)]
  }
  res <- sHDL::sHDL.optim(ref.data, N, output.file=tsv.file,
    log.file=log.file, stepwise = TRUE, verbose=TRUE
  )
  return(res)
}

overwrite <- FALSE
zr.dir <- '/path/to/store/zr'
Dr.path <- './sHDL_Dr_lam_1'
out.dir <- './res-sHDL'
mc.cores <- 8
mode <- 'memory'
lam.cut <- 1
pattern <- '.*chr(\\d{1,2})\\.(\\d{1,2})[_\\.].*'

annot.names <- list.dirs(Dr.path, recursive = FALSE, full.names=FALSE)
phes <- list.files(zr.dir, '*.rds$', recursive = FALSE, full.names=FALSE)
phes <- gsub('\\.rds$', '', phes)

for(ann.name in annot.names){
  res.dir <- paste0(out.dir, '/', ann.name)
  if(!dir.exists(res.dir)) dir.create(res.dir, recursive = TRUE)
  exist.phes <- gsub('\\.tsv$', '', list.files(res.dir, pattern='*.tsvs'))
  if(!overwrite) phes <- setdiff(phes, exist.phes)
  if(length(phes) < 1) next

  ref.data <- sHDL:::load.Dr(Dr.path, ann.name, lam.cut=lam.cut,
    pattern=pattern, mc.cores=mc.cores, mode=mode)
  tmp <- parallel::mclapply(
    phes, run.sHDL, zr.dir, res.dir, ref.data, mc.cores=mc.cores
  )
}

2025-05-06

Step 01. Convert GWAS summary statistics to \(\mathbf{z}_r\)

Step 02. Convert genomic annotations to \(\mathbf{D}_r\)

Step 03. Run sHDL in parallel