Introduction
sHDL integrates an interpretation framework for both binary and continuous genomic annotations.
This tutorial illustrates how to estimate the heritability enrichment fold once genomic features have been obtained.
!!!Caution!!!: When the weights of different annotations are not comparable, direct cross-annotation comparisons of enrichment folds should be avoided. In such cases, comparing the p-values of the enrichment fold estimates is more appropriate.
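For illustration, here is a minimal sketch of such a comparison; res.A and res.B are hypothetical result lists returned by sHDL::sHDL for two annotations whose weights are not on the same scale.
## hypothetical: res.A and res.B are sHDL results for two annotations
## whose weights are not on the same scale
compare.enrichment <- function(res.A, res.B){
  data.frame(
    annotation = c('A', 'B'),
    fold       = c(res.A$fold, res.B$fold),     # not directly comparable
    fold.p     = c(res.A$fold.p, res.B$fold.p)  # compare these instead
  )
}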
Data
Suppose you have gone through this tutorial and prepared the LD reference panel UKB_array_SVD_eigen90_extraction, downloaded the gene location information ens.gene.ann.hg19.rda from the GeneBreak package, and installed the sHDL and HDL packages.
Additionally, download the gene specificity data of brain cell types (Skene et al. 2017) from the Hjerling-Leffler lab.
Data contents for this tutorial:
tree ./
# ./
# ├── ctd_AIBS.rda
# ├── ctd_allKI.rda
# ├── ctd_DRONC_human.rda
# ├── ctd_DRONC_mouse.rda
# ├── ens.gene.ann.hg19.rda # gene locations
# └── UKB_array_SVD_eigen90_extraction # LD reference panel
We will use the gene specificity of the ASC cell type (astrocytes) annotation in the ctd_DRONC_human.rda file for illustration.
load('ctd_DRONC_human.rda')
print(ctd[[1]]$specificity[1:5, 1:5])
# ASC END exCA exDG exPFC
# A1BG 0.065780027 0.0000000 0.134620574 0.108029999 0.073845381
# A1BG-AS1 0.021622098 0.1625062 0.168150754 0.113631342 0.197219791
# A1CF 0.064952275 0.2440825 0.053170623 0.000000000 0.000000000
# A2M 0.003421627 0.8803978 0.006920069 0.005024324 0.004067106
# A2M-AS1 0.035128297 0.3960231 0.028756397 0.069229097 0.083800251
gene.weights <- ctd[[1]]$specificity[, 'ASC']
summary(gene.weights)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.02942 0.05097 0.06333 0.07369 1.00000
Binary annotation
- Step 01. Create a vector D containing the annotated SNPs, i.e., those located in the top 10% of genes with the highest specificity scores.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
## extract the top 10% genes ##
q90 <- quantile(gene.weights, 0.9)
top10.genes <- names(gene.weights)[gene.weights > q90]
## match gene locations ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
idx <- ens.gene.ann.hg19$Gene %in% top10.genes
gene.pos <- unique(ens.gene.ann.hg19[idx, c(3:5)])
colnames(gene.pos) <- c('chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.pos <- gene.pos[gene.pos$chrom %in% autosomes, ] # keep autosomes only
gene.pos$chrom <- as.numeric(gene.pos$chrom)
## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')
## match annotated variants ##
gene2snp <- function(i, j){
cur.chrom <- gene.pos[j, 'chrom']
s <- gene.pos[j, 'start']
e <- gene.pos[j, 'end']
bim <- bim[bim$chrom == cur.chrom, ]
bim <- bim[bim$pos >= s & bim$pos <= e, ]
snps <- unique(c(i, bim$SNP))
nsnps <- length(snps)
if(j %% 1000 == 0) cat(sprintf(
'# of processed genes %d: # of annotated %d variants (%.2f%%) \n',
j, nsnps, nsnps/M*100
))
return(snps)
}
M <- nrow(bim)
snps <- Reduce(gene2snp, 1:nrow(gene.pos), init=c())
## create the annotation weights for sHDL ##
D <- rep(1, length(snps))
names(D) <- snps
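As a quick optional check, you can inspect the size and coverage of the binary annotation before running sHDL; this only uses the objects D and M created above.
## optional sanity check on the binary annotation
length(D)           # number of annotated variants
length(D) / M * 100 # percentage of reference-panel variants annotated
all(D == 1)         # binary annotation: all weights equal 1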
- Step 02. Run sHDL on this binary annotation with the birth weight GWAS summary statistics from HDL.
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
D, gwas1.example, LD.path, nthreads=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL
)
str(res)
# List of 13
# $ time : num 78.8
# $ fold : num 1.3
# $ h2 : num 0.164
# $ intercept : num 0.963
# $ fold.se : num 0.105
# $ h2.se : num 0.00546
# $ intercept.se : num 0.0107
# $ fold.p : num 0.00374
# $ h2.p : num 2.69e-198
# $ intercept.p : num 0
# $ stepwise : logi TRUE
# $ converged : logi TRUE
# $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"
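If an interval estimate is useful, an approximate 95% confidence interval for the enrichment fold can be derived from the reported standard error, assuming approximate normality of the estimator.
## approximate 95% confidence interval for the enrichment fold,
## assuming the estimator is approximately normal
round(res$fold + c(-1, 1) * qnorm(0.975) * res$fold.se, 3)
# roughly 1.09 to 1.51, given the rounded estimates above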
Continuous annotation
For a continuous annotation, the weights should be normalized so that the annotation fits the sHDL model.
- Step 01. Extract the gene locations (hg19) and weights.
## extract gene information ##
load('ens.gene.ann.hg19.rda') # load ens.gene.ann.hg19
gene.df <- unique(ens.gene.ann.hg19[, c(1, 3:5)])
colnames(gene.df) <- c('gene', 'chrom', 'start', 'end')
autosomes <- as.character(1:22)
gene.df <- gene.df[gene.df$chrom %in% autosomes, ] # keep autosomes only
gene.df$chrom <- as.numeric(gene.df$chrom)
gene.df <- gene.df[!duplicated(gene.df$gene), ] # remove duplicated genes
gene.df$weight <- gene.weights[gene.df$gene]
gene.df <- gene.df[!is.na(gene.df$weight), ] # remove genes without weight
head(gene.df, 5)
#            gene chrom  start    end     weight
# 27828 LINC00115     1 761586 762902 0.13639639
# 12129    SAMD11     1 860260 879955 0.52717077
# 12571     NOC2L     1 879584 894689 0.05715007
# 12699    KLHL17     1 895967 901095 0.07142173
# 13025      HES4     1 934342 935552 0.09547616
- Step 02. Map gene weights to SNP weights.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
## extract variants position from LD reference panel ##
bim <- do.call(rbind, lapply(
  sHDL:::list.LD.ref.files(LD.path, suffix='.bim'), data.table::fread, header=FALSE
))
bim <- bim[, c(2, 1, 4)]
colnames(bim) <- c('SNP', 'chrom', 'pos')
## map weights ##
mapw <- function(i, j){
  cur.chrom <- gene.df[j, 'chrom']
  s <- gene.df[j, 'start']
  e <- gene.df[j, 'end']
  w <- gene.df[j, 'weight']
  bim <- bim[bim$chrom == cur.chrom, ]
  bim <- bim[bim$pos >= s & bim$pos <= e, ]
  snps <- unique(c(names(i), bim$SNP))
  nsnps <- length(snps)
  snps.w <- rep(0, nsnps)
  names(snps.w) <- snps
  snps.w[names(i)] <- snps.w[names(i)] + i
  snps.w[bim$SNP] <- snps.w[bim$SNP] + w
  if(j %% 1000 == 0) cat(sprintf(
    '# of processed genes %d: # of annotated %d variants (%.2f%%) \n',
    j, nsnps, nsnps/M*100
  ))
  return(snps.w)
}
M <- nrow(bim)
snps.w <- Reduce(mapw, 1:nrow(gene.df), init=c())
- Step 03. Check the weights of variants.
summary(snps.w)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
# 0.00000 0.02038 0.04425 0.06551 0.07747 1.65767
Min-max normalization
A straightforward option is min-max normalization, where the normalized D value indicates the degree to which a variant belongs to the annotated category.
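Conceptually, min-max normalization rescales the weights to the [0, 1] interval. The sketch below illustrates the idea only; it is not the internal implementation of sHDL:::normD, which also accounts for unannotated reference-panel variants.
## conceptual sketch of min-max normalization: rescale weights to [0, 1]
minmax <- function(w) (w - min(w)) / (max(w) - min(w))
summary(minmax(snps.w))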
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.minmax <- sHDL:::normD(snps.w, LD.path, norm.method='minmax')
# Applied `minmax` weight nomalization on 121235 (39.424%) annotated variants.
# The theoretical upper boundary for enrichment fold is 61.419.
summary(D.minmax)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.01628 0.02030 1.00000
Run sHDL with minmax normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
snps.w, gwas1.example, LD.path, nthreads=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='minmax'
)
str(res)
# List of 13
# $ time : num 64
# $ fold : num 8.74
# $ h2 : num 0.164
# $ intercept : num 0.963
# $ fold.se : num 0.91
# $ h2.se : num 0.00546
# $ intercept.se : num 0.0107
# $ fold.p : num 1.72e-17
# $ h2.p : num 2.69e-198
# $ intercept.p : num 0
# $ stepwise : logi TRUE
# $ converged : logi TRUE
# $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"
Scaled normalization
Alternatively, sum(D) can be scaled to the number of annotated variants.
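In other words, the weights are rescaled so that their sum equals the number of annotated variants. The sketch below illustrates the idea only; it is not the internal implementation of sHDL:::normD.
## conceptual sketch of scaled normalization:
## rescale so that the weights sum to the number of annotated variants
scaled <- function(w) w * length(w) / sum(w)
sum(scaled(snps.w)) # equals length(snps.w)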
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.scaled <- sHDL:::normD(snps.w, LD.path, norm.method='scaled')
# Applied `scaled` weight nomalization on 121235 (39.424%) annotated variants.
# The theoretical upper boundary for enrichment fold is 2.537.
summary(D.scaled)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.0000 0.0000 0.3942 0.4916 24.2134
Run sHDL with scaled normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
snps.w, gwas1.example, LD.path, nthreads=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='scaled'
)
str(res)
# List of 13
# $ time : num 73
# $ fold : num 1.2
# $ h2 : num 0.164
# $ intercept : num 0.963
# $ fold.se : num 0.0231
# $ h2.se : num 0.00546
# $ intercept.se : num 0.0107
# $ fold.p : num 1.73e-17
# $ h2.p : num 2.69e-198
# $ intercept.p : num 0
# $ stepwise : logi TRUE
# $ converged : logi TRUE
# $ optim.message: chr "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
No normalization
One can also choose to keep the original annotation weights without normalization.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
D.none <- sHDL:::normD(snps.w, LD.path, norm.method='none')
# No normalization applied on 121235 (39.424%) annotated variants. The theoretical upper boundary for enrichment fold is M / Md = 37.051.
summary(D.none)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00000 0.00000 0.00000 0.02699 0.03366 1.65767
Run sHDL without normalizing the annotation weights.
LD.path <- 'UKB_array_SVD_eigen90_extraction'
data(gwas1.example, package='HDL')
res <- sHDL::sHDL(
snps.w, gwas1.example, LD.path, nthreads=4,
stepwise=TRUE, lam.cut=1, mode='memory', Dr.path=NULL, norm.method='none'
)
str(res)
# List of 13
# $ time : num 70.8
# $ fold : num 5.62
# $ h2 : num 0.164
# $ intercept : num 0.963
# $ fold.se : num 0.543
# $ h2.se : num 0.00546
# $ intercept.se : num 0.0107
# $ fold.p : num 1.73e-17
# $ h2.p : num 2.69e-198
# $ intercept.p : num 0
# $ stepwise : logi TRUE
# $ converged : logi TRUE
# $ optim.message: chr "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL"
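The three normalization schemes yield different enrichment folds for the same annotation, which again illustrates the caution above: the fold estimates are not directly comparable across normalizations, whereas the p-values are. The helper below is a small sketch for tabulating such results; it assumes each run above was saved under its own name (e.g. res.minmax, res.scaled, res.none, hypothetical names since the examples overwrite res).
## hypothetical helper: tabulate enrichment folds and p-values from a named list
## of sHDL results, one per normalization method
summarise.norms <- function(res.list){
  data.frame(
    norm.method = names(res.list),
    fold        = sapply(res.list, `[[`, 'fold'),   # not directly comparable
    fold.p      = sapply(res.list, `[[`, 'fold.p')  # comparable
  )
}
## usage (assuming results were saved under separate names):
# summarise.norms(list(minmax = res.minmax, scaled = res.scaled, none = res.none))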