Genomic annotations

Inroduction

It marks a significant breakthrough that sHDL integrates an interpretation framework for both binary and continuous genomic annotations.

This tutorial will illustrate the process of estimating heritability enrichment fold following the acquisition of genomic features.

!!!Caution!!!: When weights of different annotations aren’t comparable, direct cross-annotation comparisons of enrichment folds should be avoided. In such cases, comparing the p-values of enrichment fold estimation is more appropriate.

Data

Suppose you have gone through this tutorial and

prepared the LD reference panel: UKB_array_SVD_eigen90_extraction,
downloaded the gene locations information from GeneBreak package: ens.gene.ann.hg19.rda,
installed the sHDL and HDL packages.

Additionally, we download the gene specificity data of brain cell types (Skene et al. 2017) from the hjerling leffler lab.

wget -c http://www.hjerling-leffler-lab.org/data/scz_singlecell/ctdFiles.zip

unzip ctdFiles.zip

Data contents for this tutorial:

tree ./                                
# ./
# ├── ctd_AIBS.rda
# ├── ctd_allKI.rda
# ├── ctd_DRONC_human.rda
# ├── ctd_DRONC_mouse.rda
# ├── ens.gene.ann.hg19.rda  # gene locations
# └── UKB_array_SVD_eigen90_extraction  # LD reference panel

We will use the gene specificity of ASC cell type (Astrocytes) annotation in the ctd_DRONC_human.rda file for illustration.

load('ctd_DRONC_human.rda')
print(ctd[[1]]$specificity[1:5, 1:5])
#                  ASC       END        exCA        exDG       exPFC
# A1BG     0.065780027 0.0000000 0.134620574 0.108029999 0.073845381
# A1BG-AS1 0.021622098 0.1625062 0.168150754 0.113631342 0.197219791
# A1CF     0.064952275 0.2440825 0.053170623 0.000000000 0.000000000
# A2M      0.003421627 0.8803978 0.006920069 0.005024324 0.004067106
# A2M-AS1  0.035128297 0.3960231 0.028756397 0.069229097 0.083800251

gene.weights <- ctd[[1]]$specificity[, 'ASC']

summary(gene.weights)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.00000 0.02942 0.05097 0.06333 0.07369 1.00000

Binary annotation

Step 01. we create a vector D containing annotated SNPs located in the 10% genes with the highest specificity scores.

LD.path <- 'UKB_array_SVD_eigen90_extraction'

## extract the top 10% genes ##
q90 <- quantile(gene.weights, 0.9)
top10.genes <- names(gene.weights)[gene.weights > q90]

## match gene locations ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
idx <- ens.gene.ann.hg19$Gene %in% top10.genes
gene.pos <- unique(ens.gene.ann.hg19[idx, c(3:5)])
colnames(gene.pos) <- c('chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.pos <- gene.pos[gene.pos$chrom %in% autosomes, ] # keep autosomes only
gene.pos$chrom <- as.numeric(gene.pos$chrom)

## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
  sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')

## match annotated variants ##
gene2snp <- function(gene.pos, bim, mc.cores=4){
  map.gene <- function(i, gene.pos.chr, bim.chr){
    s <- gene.pos.chr[i, 'start']
    e <- gene.pos.chr[i, 'end']
    return(bim.chr[bim.chr$pos >= s & bim.chr$pos <= e, ]$SNP)
  }
  chroms <- unique(bim$chrom)
  all.snps <- c()
  for(cur.chrom in chroms){
    gene.pos.chr <- gene.pos[gene.pos$chrom == cur.chrom, ]
    bim.chr <- bim[bim$chrom == cur.chrom, ]
    snps <- parallel::mclapply(
      seq_len(nrow(gene.pos.chr)), map.gene,
      gene.pos.chr=gene.pos.chr, bim.chr=bim.chr,
      mc.cores=mc.cores
    )
    snps <- unique(unlist(snps))
    all.snps <- c(all.snps, snps)
  }
  return(all.snps)
}
snps <- gene2snp(gene.pos, bim, 4)

## create the annotation weights for sHDL ##
D <- matrix(1, nrow=length(snps), ncol=1, dimnames=list(snps, 'Top10.genes'))

Step 02. Run sHDL on this binary annotation with the birth weight GWAS summary statistics from HDL.

data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
  D, gwas1.example, LD.path, mc.cores=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL
)

# print(knitr::kable(res))

# |item             | estimation|        se|         p|note                                             |
# |:----------------|----------:|---------:|---------:|:------------------------------------------------|
# |time             | 60.7460063|        NA|        NA|seconds                                          |
# |h2               |  0.1639165| 0.0054560| 0.0000000|total heritability                               |
# |intercept        |  0.9632359| 0.0106590| 0.0000000|intercept                                        |
# |fold.Top10.genes |  1.3037835| 0.1047777| 0.0037398|enrichment fold                                  |
# |converged        |         NA|        NA|        NA|TRUE                                             |
# |message          |         NA|        NA|        NA|CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL |

Continuous annotation

In the context of continuous annotation, a normalization process should be employed to ensure that the annotation fits with the sHDL model.

Step 01. Extract the gene locations (hg19) and weights.

## extract gene infomation ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
gene.df <- unique(ens.gene.ann.hg19[, c(1, 3:5)])
colnames(gene.df) <- c('gene', 'chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.df <- gene.df[gene.df$chrom %in% autosomes, ] # keep autosomes only
gene.df$chrom <- as.numeric(gene.df$chrom)
gene.df <- gene.df[!duplicated(gene.df$gene), ] # remove duplicated genes
gene.df$weight <- gene.weights[gene.df$gene]
gene.df <- gene.df[!is.na(gene.df$weight), ] # remove genes without weight

head(gene.df, 5)
#            gene chrom  start    end     weight
# 27828 LINC00115     1 761586 762902 0.13639639
# 12129    SAMD11     1 860260 879955 0.52717077
# 12571     NOC2L     1 879584 894689 0.05715007
# 12699    KLHL17     1 895967 901095 0.07142173
# 13025      HES4     1 934342 935552 0.09547616

Step 02. Map gene weights to SNP weights.

LD.path <- 'UKB_array_SVD_eigen90_extraction'

## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
  sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')

## map weights ##
mapw <- function(i, j){
  cur.chrom <- gene.df[j, 'chrom']
  s <- gene.df[j, 'start']
  e <- gene.df[j, 'end']
  w <- gene.df[j, 'weight']
  bim <- bim[bim$chrom == cur.chrom, ]
  bim <- bim[bim$pos >= s & bim$pos <= e, ]

  snps <- unique(c(names(i), bim$SNP))
  nsnps <- length(snps)
  snps.w <- rep(0, nsnps)
  names(snps.w) <- snps
  snps.w[names(i)] <- snps.w[names(i)] + i
  snps.w[bim$SNP] <- snps.w[bim$SNP] + w
  if(j %% 1000 == 0) cat(sprintf(
    '# of processed genes %d: # of annotated %d variants (%.2f%%) \n',
    j, nsnps, nsnps/M*100
  ))
  return(snps.w)
}
M <- nrow(bim)
snps.w <- Reduce(mapw, 1:nrow(gene.df), init=c())
D <- matrix(snps.w, ncol=1, dimnames = list(names(snps.w), 'raw.W'))
saveRDS(D, 'raw.W.rds')

Step 03. Check the weights of variants.

summary(D)
 #     raw.W        
 # Min.   :0.00000  
 # 1st Qu.:0.02038  
 # Median :0.04425  
 # Mean   :0.06551  
 # 3rd Qu.:0.07747  
 # Max.   :1.65767

Min-max normalization

One straightforward normalization method involves min-max normalization, where the normalized D value signifies the degree to which the selected variant pertains to the annotated category.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.minmax'
D.minmax <- sHDL:::normD(D, LD.path, norm.method='minmax')

# Annotation name(s): [raw.W.minmax].
# Applied `minmax` weight nomalization on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [61.419].

summary(D.minmax)
 #  raw.W.minmax    
 # Min.   :0.00000
 # 1st Qu.:0.00000
 # Median :0.00000
 # Mean   :0.01628
 # 3rd Qu.:0.02030
 # Max.   :1.00000

Run sHDL with minmax normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.minmax'
res <- sHDL::sHDL(
  D, gwas1.example, LD.path, mc.cores=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='minmax'
)

# print(str(res))
# 'data.frame':   6 obs. of  5 variables:
#  $ item      : chr  "time" "h2" "intercept" "fold.raw.W.minmax" ...
#  $ estimation: num  56.354 0.164 0.963 8.741 NA ...
#  $ se        : num  NA 0.00546 0.01066 0.90955 NA ...
#  $ p         : num  NA 2.69e-198 0.00 1.72e-17 NA ...
#  $ note      : chr  "seconds" "total heritability" "intercept" "enrichment fold" ...

Scaled normalization

Alternatively, another approach is to scale sum(D) to the number of annotated variants.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.scaled'
D.scaled <- sHDL:::normD(D, LD.path, norm.method='scaled')

# Annotation name(s): [raw.W.scaled].
# Applied `scaled` weight nomalization on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [2.537].

summary(D.scaled)
 #  raw.W.scaled    
 # Min.   : 0.0000  
 # 1st Qu.: 0.0000  
 # Median : 0.0000  
 # Mean   : 0.3942  
 # 3rd Qu.: 0.4916  
 # Max.   :24.2134

Run sHDL with scaled normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.scaled'
res <- sHDL::sHDL(
  D, gwas1.example, LD.path, mc.cores=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='scaled'
)

# print(str(res))
# 'data.frame':   6 obs. of  5 variables:
#  $ item      : chr  "time" "h2" "intercept" "fold.raw.W.scaled" ...
#  $ estimation: num  64.204 0.164 0.963 1.197 NA ...
#  $ se        : num  NA 0.00546 0.01066 0.02313 NA ...
#  $ p         : num  NA 2.69e-198 0.00 1.73e-17 NA ...
#  $ note      : chr  "seconds" "total heritability" "intercept" "enrichment fold" ...

No normalization

One can also choose to keep the original annotation weights without normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
D.none <- sHDL:::normD(D, LD.path, norm.method='none')

# Annotation name(s): [raw.W].
# No normalization applied on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [37.051].

summary(D.none)
 #     raw.W        
 # Min.   :0.00000  
 # 1st Qu.:0.00000  
 # Median :0.00000  
 # Mean   :0.02699  
 # 3rd Qu.:0.03366  
 # Max.   :1.65767

Run sHDL without annotation weights normalization.

LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
res <- sHDL::sHDL(
  D, gwas1.example, LD.path, mc.cores=4,
  stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='none'
)

# print(str(res))
# 'data.frame':   6 obs. of  5 variables:
#  $ item      : chr  "time" "h2" "intercept" "fold.raw.W" ...
#  $ estimation: num  63.831 0.164 0.963 5.619 NA ...
#  $ se        : num  NA 0.00546 0.01066 0.54271 NA ...
#  $ p         : num  NA 2.69e-198 0.00 1.73e-17 NA ...
#  $ note      : chr  "seconds" "total heritability" "intercept" "enrichment fold" ...