Inroduction
It marks a significant breakthrough that sHDL
integrates
an interpretation framework for both binary and continuous genomic
annotations.
This tutorial will illustrate the process of estimating heritability enrichment fold following the acquisition of genomic features.
!!!Caution!!!: When weights of different annotations aren’t comparable, direct cross-annotation comparisons of enrichment folds should be avoided. In such cases, comparing the p-values of enrichment fold estimation is more appropriate.
Data
Suppose you have gone through this tutorial and
prepared the LD reference panel:
UKB_array_SVD_eigen90_extraction
,downloaded the gene locations information from
GeneBreak
package:ens.gene.ann.hg19.rda
,installed the
sHDL
andHDL
packages.
Additionally, we download the gene specificity data of brain cell types (Skene et al. 2017) from the hjerling leffler lab.
Data contents for this tutorial:
tree ./
# ./
# ├── ctd_AIBS.rda
# ├── ctd_allKI.rda
# ├── ctd_DRONC_human.rda
# ├── ctd_DRONC_mouse.rda
# ├── ens.gene.ann.hg19.rda # gene locations
# └── UKB_array_SVD_eigen90_extraction # LD reference panel
We will use the gene specificity of ASC
cell type
(Astrocytes) annotation in the ctd_DRONC_human.rda
file for
illustration.
load('ctd_DRONC_human.rda')
print(ctd[[1]]$specificity[1:5, 1:5])
# ASC END exCA exDG exPFC
# A1BG 0.065780027 0.0000000 0.134620574 0.108029999 0.073845381
# A1BG-AS1 0.021622098 0.1625062 0.168150754 0.113631342 0.197219791
# A1CF 0.064952275 0.2440825 0.053170623 0.000000000 0.000000000
# A2M 0.003421627 0.8803978 0.006920069 0.005024324 0.004067106
# A2M-AS1 0.035128297 0.3960231 0.028756397 0.069229097 0.083800251
gene.weights <- ctd[[1]]$specificity[, 'ASC']
summary(gene.weights)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.02942 0.05097 0.06333 0.07369 1.00000
Binary annotation
- Step 01. we create a vector
D
containing annotated SNPs located in the 10% genes with the highest specificity scores.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
## extract the top 10% genes ##
q90 <- quantile(gene.weights, 0.9)
top10.genes <- names(gene.weights)[gene.weights > q90]
## match gene locations ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
idx <- ens.gene.ann.hg19$Gene %in% top10.genes
gene.pos <- unique(ens.gene.ann.hg19[idx, c(3:5)])
colnames(gene.pos) <- c('chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.pos <- gene.pos[gene.pos$chrom %in% autosomes, ] # keep autosomes only
gene.pos$chrom <- as.numeric(gene.pos$chrom)
## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')
## match annotated variants ##
gene2snp <- function(gene.pos, bim, mc.cores=4){
map.gene <- function(i, gene.pos.chr, bim.chr){
s <- gene.pos.chr[i, 'start']
e <- gene.pos.chr[i, 'end']
return(bim.chr[bim.chr$pos >= s & bim.chr$pos <= e, ]$SNP)
}
chroms <- unique(bim$chrom)
all.snps <- c()
for(cur.chrom in chroms){
gene.pos.chr <- gene.pos[gene.pos$chrom == cur.chrom, ]
bim.chr <- bim[bim$chrom == cur.chrom, ]
snps <- parallel::mclapply(
seq_len(nrow(gene.pos.chr)), map.gene,
gene.pos.chr=gene.pos.chr, bim.chr=bim.chr,
mc.cores=mc.cores
)
snps <- unique(unlist(snps))
all.snps <- c(all.snps, snps)
}
return(all.snps)
}
snps <- gene2snp(gene.pos, bim, 4)
## create the annotation weights for sHDL ##
D <- matrix(1, nrow=length(snps), ncol=1, dimnames=list(snps, 'Top10.genes'))
- Step 02. Run
sHDL
on this binary annotation with the birth weight GWAS summary statistics fromHDL
.
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
D, gwas1.example, LD.path, mc.cores=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL
)
# print(knitr::kable(res))
# |item | estimation| se| p|note |
# |:----------------|----------:|---------:|---------:|:------------------------------------------------|
# |time | 60.7460063| NA| NA|seconds |
# |h2 | 0.1639165| 0.0054560| 0.0000000|total heritability |
# |intercept | 0.9632359| 0.0106590| 0.0000000|intercept |
# |fold.Top10.genes | 1.3037835| 0.1047777| 0.0037398|enrichment fold |
# |converged | NA| NA| NA|TRUE |
# |message | NA| NA| NA|CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL |
Continuous annotation
In the context of continuous annotation, a normalization process should be employed to ensure that the annotation fits with the sHDL model.
-
Step 01. Extract the gene locations (hg19) and weights.
## extract gene infomation ## load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19 gene.df <- unique(ens.gene.ann.hg19[, c(1, 3:5)]) colnames(gene.df) <- c('gene', 'chrom', 'start', 'end') autosomes <- as.character(1:22) gene.df <- gene.df[gene.df$chrom %in% autosomes, ] # keep autosomes only gene.df$chrom <- as.numeric(gene.df$chrom) gene.df <- gene.df[!duplicated(gene.df$gene), ] # remove duplicated genes gene.df$weight <- gene.weights[gene.df$gene] gene.df <- gene.df[!is.na(gene.df$weight), ] # remove genes without weight head(gene.df, 5) # gene chrom start end weight # 27828 LINC00115 1 761586 762902 0.13639639 # 12129 SAMD11 1 860260 879955 0.52717077 # 12571 NOC2L 1 879584 894689 0.05715007 # 12699 KLHL17 1 895967 901095 0.07142173 # 13025 HES4 1 934342 935552 0.09547616
-
Step 02. Map gene weights to SNP weights.
LD.path <- 'UKB_array_SVD_eigen90_extraction' ## extract variants position from LD reference panel ## bim <- do.call(rbind, lapply( sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE )) bim <- bim[, c(2, 1, 4)] colnames(bim) <- c('SNP', 'chrom', 'pos') ## map weights ## mapw <- function(i, j){ cur.chrom <- gene.df[j, 'chrom'] s <- gene.df[j, 'start'] e <- gene.df[j, 'end'] w <- gene.df[j, 'weight'] bim <- bim[bim$chrom == cur.chrom, ] bim <- bim[bim$pos >= s & bim$pos <= e, ] snps <- unique(c(names(i), bim$SNP)) nsnps <- length(snps) snps.w <- rep(0, nsnps) names(snps.w) <- snps snps.w[names(i)] <- snps.w[names(i)] + i snps.w[bim$SNP] <- snps.w[bim$SNP] + w if(j %% 1000 == 0) cat(sprintf( '# of processed genes %d: # of annotated %d variants (%.2f%%) \n', j, nsnps, nsnps/M*100 )) return(snps.w) } M <- nrow(bim) snps.w <- Reduce(mapw, 1:nrow(gene.df), init=c()) D <- matrix(snps.w, ncol=1, dimnames = list(names(snps.w), 'raw.W')) saveRDS(D, 'raw.W.rds')
-
Step 03. Check the weights of variants.
summary(D) # raw.W # Min. :0.00000 # 1st Qu.:0.02038 # Median :0.04425 # Mean :0.06551 # 3rd Qu.:0.07747 # Max. :1.65767
Min-max normalization
One straightforward normalization method involves min-max
normalization, where the normalized D
value signifies the
degree to which the selected variant pertains to the annotated
category.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.minmax'
D.minmax <- sHDL:::normD(D, LD.path, norm.method='minmax')
# Annotation name(s): [raw.W.minmax].
# Applied `minmax` weight nomalization on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [61.419].
summary(D.minmax)
# raw.W.minmax
# Min. :0.00000
# 1st Qu.:0.00000
# Median :0.00000
# Mean :0.01628
# 3rd Qu.:0.02030
# Max. :1.00000
Run sHDL
with minmax
normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.minmax'
res <- sHDL::sHDL(
D, gwas1.example, LD.path, mc.cores=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='minmax'
)
# print(str(res))
# 'data.frame': 6 obs. of 5 variables:
# $ item : chr "time" "h2" "intercept" "fold.raw.W.minmax" ...
# $ estimation: num 56.354 0.164 0.963 8.741 NA ...
# $ se : num NA 0.00546 0.01066 0.90955 NA ...
# $ p : num NA 2.69e-198 0.00 1.72e-17 NA ...
# $ note : chr "seconds" "total heritability" "intercept" "enrichment fold" ...
Scaled normalization
Alternatively, another approach is to scale sum(D)
to
the number of annotated variants.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.scaled'
D.scaled <- sHDL:::normD(D, LD.path, norm.method='scaled')
# Annotation name(s): [raw.W.scaled].
# Applied `scaled` weight nomalization on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [2.537].
summary(D.scaled)
# raw.W.scaled
# Min. : 0.0000
# 1st Qu.: 0.0000
# Median : 0.0000
# Mean : 0.3942
# 3rd Qu.: 0.4916
# Max. :24.2134
Run sHDL
with scaled
normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
colnames(D) <- 'raw.W.scaled'
res <- sHDL::sHDL(
D, gwas1.example, LD.path, mc.cores=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='scaled'
)
# print(str(res))
# 'data.frame': 6 obs. of 5 variables:
# $ item : chr "time" "h2" "intercept" "fold.raw.W.scaled" ...
# $ estimation: num 64.204 0.164 0.963 1.197 NA ...
# $ se : num NA 0.00546 0.01066 0.02313 NA ...
# $ p : num NA 2.69e-198 0.00 1.73e-17 NA ...
# $ note : chr "seconds" "total heritability" "intercept" "enrichment fold" ...
No normalization
One can also choose to keep the original annotation weights without normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D <- readRDS('raw.W.rds')
D.none <- sHDL:::normD(D, LD.path, norm.method='none')
# Annotation name(s): [raw.W].
# No normalization applied on [121235 (39.424%)] annotated variants. The theoretical upper boundary for enrichment fold is M / Md = [37.051].
summary(D.none)
# raw.W
# Min. :0.00000
# 1st Qu.:0.00000
# Median :0.00000
# Mean :0.02699
# 3rd Qu.:0.03366
# Max. :1.65767
Run sHDL
without annotation weights normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
D <- readRDS('raw.W.rds')
res <- sHDL::sHDL(
D, gwas1.example, LD.path, mc.cores=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='none'
)
# print(str(res))
# 'data.frame': 6 obs. of 5 variables:
# $ item : chr "time" "h2" "intercept" "fold.raw.W" ...
# $ estimation: num 63.831 0.164 0.963 5.619 NA ...
# $ se : num NA 0.00546 0.01066 0.54271 NA ...
# $ p : num NA 2.69e-198 0.00 1.73e-17 NA ...
# $ note : chr "seconds" "total heritability" "intercept" "enrichment fold" ...