Skip to contents

Inroduction

It marks a significant breakthrough that sHDL integrates an interpretation framework for both binary and continuous genomic annotations.

This tutorial will illustrate the process of estimating heritability enrichment fold following the acquisition of genomic features.

!!!Caution!!!: When weights of different annotations aren’t comparable, direct cross-annotation comparisons of enrichment folds should be avoided. In such cases, comparing the p-values of enrichment fold estimation is more appropriate.

Data

Suppose you have gone through this tutorial and

  • prepared the LD reference panel: UKB_array_SVD_eigen90_extraction,

  • downloaded the gene locations information from GeneBreak package: ens.gene.ann.hg19.rda,

  • installed the sHDL and HDL packages.

Additionally, we download the gene specificity data of brain cell types (Skene et al. 2017) from the hjerling leffler lab.

wget -c http://www.hjerling-leffler-lab.org/data/scz_singlecell/ctdFiles.zip

unzip ctdFiles.zip

Data contents for this tutorial:

tree ./                                
# ./
# ├── ctd_AIBS.rda
# ├── ctd_allKI.rda
# ├── ctd_DRONC_human.rda
# ├── ctd_DRONC_mouse.rda
# ├── ens.gene.ann.hg19.rda  # gene locations
# └── UKB_array_SVD_eigen90_extraction  # LD reference panel

We will use the gene specificity of ASC cell type (Astrocytes) annotation in the ctd_DRONC_human.rda file for illustration.

load('ctd_DRONC_human.rda')
print(ctd[[1]]$specificity[1:5, 1:5])
#                  ASC       END        exCA        exDG       exPFC
# A1BG     0.065780027 0.0000000 0.134620574 0.108029999 0.073845381
# A1BG-AS1 0.021622098 0.1625062 0.168150754 0.113631342 0.197219791
# A1CF     0.064952275 0.2440825 0.053170623 0.000000000 0.000000000
# A2M      0.003421627 0.8803978 0.006920069 0.005024324 0.004067106
# A2M-AS1  0.035128297 0.3960231 0.028756397 0.069229097 0.083800251

gene.weights <- ctd[[1]]$specificity[, 'ASC']

summary(gene.weights)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00000 0.02942 0.05097 0.06333 0.07369 1.00000

Binary annotation

  • Step 01. we create a vector D containing annotated SNPs located in the 10% genes with the highest specificity scores.
LD.path <- 'UKB_array_SVD_eigen90_extraction'

## extract the top 10% genes ##
q90 <- quantile(gene.weights, 0.9)
top10.genes <- names(gene.weights)[gene.weights > q90]

## match gene locations ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
idx <- ens.gene.ann.hg19$Gene %in% top10.genes
gene.pos <- unique(ens.gene.ann.hg19[idx, c(3:5)])
colnames(gene.pos) <- c('chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.pos <- gene.pos[gene.pos$chrom %in% autosomes, ] # keep autosomes only
gene.pos$chrom <- as.numeric(gene.pos$chrom)

## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
  sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')

## match annotated variants ##
gene2snp <- function(i, j){
  cur.chrom <- gene.pos[j, 'chrom']
  s <- gene.pos[j, 'start']
  e <- gene.pos[j, 'end']
  bim <- bim[bim$chrom == cur.chrom, ]
  bim <- bim[bim$pos >= s & bim$pos <= e, ]
  snps <- unique(c(i, bim$SNP))
  nsnps <- length(snps)
  if(j %% 1000 == 0) cat(sprintf(
    '# of processed genes %d: # of annotated %d variants (%.2f%%) \n',
    j, nsnps, nsnps/M*100
  ))
  return(snps)
}
M <- nrow(bim)
snps <- Reduce(gene2snp, 1:nrow(gene.pos), init=c())

## create the annotation weights for sHDL ##
D <- rep(1, length(snps))
names(D) <- snps
  • Step 02. Run sHDL on this binary annotation with the birth weight GWAS summary statistics from HDL.
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
  D, gwas1.example, LD.path, nthreads=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL
)

str(res)
# List of 13
#  $ time         : num 78.8
#  $ fold         : num 1.3
#  $ h2           : num 0.164
#  $ intercept    : num 0.963
#  $ fold.se      : num 0.105
#  $ h2.se        : num 0.00546
#  $ intercept.se : num 0.0107
#  $ fold.p       : num 0.00374
#  $ h2.p         : num 2.69e-198
#  $ intercept.p  : num 0
#  $ stepwise     : logi TRUE
#  $ converged    : logi TRUE
#  $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"

Continuous annotation

In the context of continuous annotation, a normalization process should be employed to ensure that the annotation fits with the sHDL model.

  • Step 01. Extract the gene locations (hg19) and weights.

    ## extract gene infomation ##
    load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
    gene.df <- unique(ens.gene.ann.hg19[, c(1, 3:5)])
    colnames(gene.df) <- c('gene', 'chrom', 'start', 'end')
    autosomes <- as.character(1:22)
    gene.df <- gene.df[gene.df$chrom %in% autosomes, ] # keep autosomes only
    gene.df$chrom <- as.numeric(gene.df$chrom)
    gene.df <- gene.df[!duplicated(gene.df$gene), ] # remove duplicated genes
    gene.df$weight <- gene.weights[gene.df$gene]
    gene.df <- gene.df[!is.na(gene.df$weight), ] # remove genes without weight
    
    head(gene.df, 5)
    #            gene chrom  start    end     weight
    # 27828 LINC00115     1 761586 762902 0.13639639
    # 12129    SAMD11     1 860260 879955 0.52717077
    # 12571     NOC2L     1 879584 894689 0.05715007
    # 12699    KLHL17     1 895967 901095 0.07142173
    # 13025      HES4     1 934342 935552 0.09547616
  • Step 02. Map gene weights to SNP weights.

    LD.path <- 'UKB_array_SVD_eigen90_extraction'
    
    ## extract variants position from LD reference panel ##
    bim <- do.call(rbind, lapply(
      sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
    ))
    bim <- bim[, c(2, 1, 4)]
    colnames(bim) <- c('SNP', 'chrom', 'pos')
    
    ## map weights ##
    mapw <- function(i, j){
      cur.chrom <- gene.df[j, 'chrom']
      s <- gene.df[j, 'start']
      e <- gene.df[j, 'end']
      w <- gene.df[j, 'weight']
      bim <- bim[bim$chrom == cur.chrom, ]
      bim <- bim[bim$pos >= s & bim$pos <= e, ]
    
      snps <- unique(c(names(i), bim$SNP))
      nsnps <- length(snps)
      snps.w <- rep(0, nsnps)
      names(snps.w) <- snps
      snps.w[names(i)] <- snps.w[names(i)] + i
      snps.w[bim$SNP] <- snps.w[bim$SNP] + w
      if(j %% 1000 == 0) cat(sprintf(
        '# of processed genes %d: # of annotated %d variants (%.2f%%) \n',
        j, nsnps, nsnps/M*100
      ))
      return(snps.w)
    }
    M <- nrow(bim)
    snps.w <- Reduce(mapw, 1:nrow(gene.df), init=c())
  • Step 03. Check the weights of variants.

    summary(snps.w)
    #    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    # 0.00000 0.02038 0.04425 0.06551 0.07747 1.65767 

Min-max normalization

One straightforward normalization method involves min-max normalization, where the normalized D value signifies the degree to which the selected variant pertains to the annotated category.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.minmax <- sHDL:::normD(snps.w, LD.path, norm.method='minmax')

# Applied `minmax` weight nomalization on 121235 (39.424%) annotated variants.
# The theoretical upper boundary for enrichment fold is 61.419.

summary(D.minmax)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00000 0.00000 0.00000 0.01628 0.02030 1.00000 

Run sHDL with minmax normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
  snps.w, gwas1.example, LD.path, nthreads=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='minmax'
)

str(res)
# List of 13
#  $ time         : num 64
#  $ fold         : num 8.74
#  $ h2           : num 0.164
#  $ intercept    : num 0.963
#  $ fold.se      : num 0.91
#  $ h2.se        : num 0.00546
#  $ intercept.se : num 0.0107
#  $ fold.p       : num 1.72e-17
#  $ h2.p         : num 2.69e-198
#  $ intercept.p  : num 0
#  $ stepwise     : logi TRUE
#  $ converged    : logi TRUE
#  $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"

Scaled normalization

Alternatively, another approach is to scale sum(D) to the number of annotated variants.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.scaled <- sHDL:::normD(snps.w, LD.path, norm.method='scaled')

# Applied `scaled` weight nomalization on 121235 (39.424%) annotated variants.
# The theoretical upper boundary for enrichment fold is 2.537.

summary(D.scaled)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  0.0000  0.0000  0.0000  0.3942  0.4916 24.2134 

Run sHDL with scaled normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
  snps.w, gwas1.example, LD.path, nthreads=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='scaled'
)

str(res)
# List of 13
#  $ time         : num 73
#  $ fold         : num 1.2
#  $ h2           : num 0.164
#  $ intercept    : num 0.963
#  $ fold.se      : num 0.0231
#  $ h2.se        : num 0.00546
#  $ intercept.se : num 0.0107
#  $ fold.p       : num 1.73e-17
#  $ h2.p         : num 2.69e-198
#  $ intercept.p  : num 0
#  $ stepwise     : logi TRUE
#  $ converged    : logi TRUE
#  $ optim.message: chr "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"

No normalization

One can also choose to keep the original annotation weights without normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.none <- sHDL:::normD(snps.w, LD.path, norm.method='none')

# No normalization applied on 121235 (39.424%) annotated variants. The theoretical upper boundary for enrichment fold is M / Md = 37.051.

summary(D.none)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00000 0.00000 0.00000 0.02699 0.03366 1.65767 

Run sHDL without annotation weights normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
  snps.w, gwas1.example, LD.path, nthreads=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='none'
)

str(res)
# List of 13
#  $ time         : num 70.8
#  $ fold         : num 5.62
#  $ h2           : num 0.164
#  $ intercept    : num 0.963
#  $ fold.se      : num 0.543
#  $ h2.se        : num 0.00546
#  $ intercept.se : num 0.0107
#  $ fold.p       : num 1.73e-17
#  $ h2.p         : num 2.69e-198
#  $ intercept.p  : num 0
#  $ stepwise     : logi TRUE
#  $ converged    : logi TRUE
#  $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"