vignettes/PONG2-R-api.Rmd
PONG2-R-api.RmdThis vignette demonstrates the PONG2 R API using the built-in example
dataset. PONG2 is primarily a command-line tool
(pong2 impute, pong2 train) — see the other
vignettes for CLI usage. The R API demonstrated here is the underlying
interface used by the CLI.
# From CRAN
install.packages("PONG2")
# From GitHub
remotes::install_github("NormanLabUCD/PONG2")
library(PONG2)
#> PONG2: Please add '/home/suraju/.local/bin' to your PATH:
#> export PATH="/home/suraju/.local/bin:$PATH"
#> ** initializing environment
#> PONG2 (Genotype Imputation with Attribute Bagging): v2.0.0
#> Supported by Streaming SIMD Extensions 2 (SSE2)
packageVersion("PONG2")
#> [1] '1.0.1'PONG2 includes a built-in example dataset with 50 samples and 200 SNPs in the KIR region (chr19), along with a pre-trained KIR3DL1 model.
# Load built-in example data
data(PONG2_example)
# SNP genotype data
cat("SNP genotype data:\n")
#> SNP genotype data:
cat(" Samples:", ncol(example_snp$genotype), "\n")
#> Samples: 50
cat(" SNPs: ", nrow(example_snp$genotype), "\n")
#> SNPs: 200
cat(" Assembly:", example_snp$assembly, "\n")
#> Assembly: hg19
# KIR allele table
cat("\nKIR allele table:\n")
#>
#> KIR allele table:
cat(" Locus: ", example_kir$locus, "\n")
#> Locus: KIR3DL1
cat(" Alleles:", length(example_kir$allele), "\n")
#> Alleles: 0
cat(" Samples:", length(example_kir$value$sample.id), "\n")
#> Samples: 50
head(example_kir$value, 3)
#> sample.id allele1 allele2
#> 1 S1 KIR3DL1*015 KIR3DL1*002
#> 2 S2 KIR3DL1*007 KIR3DL1*007
#> 3 S3 KIR3DL1*015 KIR3DL1*002
# Load the pre-trained example model
model <- hlaModelFromObj(example_mobj)
cat("Model summary:\n")
#> Model summary:
cat(" KIR locus: ", model$hla.locus, "\n")
#> KIR locus: KIR3DL1
cat(" Classifiers: ", length(example_mobj$classifiers), "\n")
#> Classifiers: 10
cat(" Alleles: ", length(model$hla.allele), "\n")
#> Alleles: 5
# Predict KIR genotypes on example SNP data
pred <- kirPredict(
object = model,
snp = example_snp,
type = "response+prob",
verbose = FALSE
)
# View predictions
cat("\nFirst 5 predictions:\n")
#>
#> First 5 predictions:
head(pred$value, 5)
#> sample.id allele1 allele2 prob
#> 1 S1 KIR3DL1*002 KIR3DL1*015 0.4767310
#> 2 S2 KIR3DL1*007 KIR3DL1*007 0.4504548
#> 3 S3 KIR3DL1*002 KIR3DL1*015 0.6123777
#> 4 S4 KIR3DL1*002 KIR3DL1*015 0.6277634
#> 5 S5 KIR3DL1*002 KIR3DL1*005 0.6243871
# Call rate
called <- sum(!is.na(pred$value$allele1))
call_rate <- round(called / nrow(pred$value) * 100, 1)
cat("\nCall rate:", call_rate, "%\n")
#>
#> Call rate: 100 %
# Clean up model handle
hlaClose(model)
# Set up parallel cluster
cl <- parallel::makeCluster(2)
# Train a small KIR3DL1 model using example data
model_trained <- kirParallelAttrBagging(
cl = cl,
hla = example_kir,
snp = example_snp,
nclassifier = 5,
verbose = FALSE
)
parallel::stopCluster(cl)
# View trained model summary
print(model_trained)
#> Gene: KIR3DL1
#> Training dataset: 50 samples X 200 SNPs
#> # of KIR3DL1/S1 alleles: 5
#> # of individual classifiers: 5
#> total # of SNPs used: 43
#> average # of SNPs in an individual classifier: 9.60, sd: 2.70, min: 6, max: 13
#> average # of haplotypes in an individual classifier: 122.20, sd: 73.74, min: 45, max: 225
#> average out-of-bag accuracy: 63.84%, sd: 4.48%, min: 59.52%, max: 68.75%
#> Genome assembly: hg19
# Save model as R object
mobj_trained <- hlaModelToObj(model_trained)
cat("\nModel saved with", length(mobj_trained$classifiers),
"classifiers\n")
#>
#> Model saved with 5 classifiers
# Clean up
hlaClose(model_trained)
# Split example data into train and test
set.seed(42)
n <- length(example_kir$value$sample.id)
idx <- sample(1:n, floor(n * 0.7))
train_samples <- example_kir$value$sample.id[idx]
test_samples <- example_kir$value$sample.id[-idx]
# Subset KIR and SNP data
train_kir <- hlaAlleleSubset(example_kir,
match(train_samples, example_kir$value$sample.id))
test_kir <- hlaAlleleSubset(example_kir,
match(test_samples, example_kir$value$sample.id))
# Train model on training set
cl <- parallel::makeCluster(2)
model_eval <- kirParallelAttrBagging(
cl = cl,
hla = train_kir,
snp = example_snp,
nclassifier = 5,
verbose = FALSE
)
parallel::stopCluster(cl)
# Subset SNP data to test samples
test_snp_idx <- match(test_samples, example_snp$sample.id)
test_snp <- hlaGenoSubset(example_snp, samp.sel = test_snp_idx)
# Predict on test set
pred_eval <- kirPredict(
object = model_eval,
snp = test_snp,
type = "response+prob",
verbose = FALSE
)
# Compare predictions to truth
comp <- hlaCompareAllele(
TrueHLA = test_kir,
PredHLA = pred_eval,
call.threshold = 0.5,
verbose = FALSE
)
cat("Evaluation results:\n")
#> Evaluation results:
cat(" Haplotype accuracy:",
round(comp$overall$acc.haplo * 100, 1), "%\n")
#> Haplotype accuracy: NaN %
cat(" Genotype accuracy: ",
round(comp$overall$acc.ind * 100, 1), "%\n")
#> Genotype accuracy: NaN %
cat(" Call rate: ",
round(comp$overall$call.rate * 100, 1), "%\n")
#> Call rate: 0 %
# Clean up
hlaClose(model_eval)For the complete command-line workflow used in the manuscript, see:
vignette("PONG2-basics") — installation and CLI quick
startvignette("PONG2-imputation") — full imputation
workflowvignette("PONG2-training") — custom model trainingOr run directly from the terminal:
# These commands run from the terminal after adding pong2 to PATH
system("pong2 impute -i example/chr19 -o output -l KIR3DL1 -a hg19")
system("pong2 train --bfile example/chr19 --kfile example/kir_call.csv \
--output output --locus KIR3DL1 --assembly hg19")
sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.5 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
#>
#> Random number generation:
#> RNG: L'Ecuyer-CMRG
#> Normal: Inversion
#> Sample: Rejection
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] PONG2_1.0.1
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.39 desc_1.4.3 R6_2.6.1 fastmap_1.2.0
#> [5] xfun_0.54 cachem_1.1.0 parallel_4.5.2 knitr_1.51
#> [9] htmltools_0.5.9 rmarkdown_2.31 lifecycle_1.0.5 cli_3.6.6
#> [13] sass_0.4.10 pkgdown_2.2.0 textshaping_1.0.5 jquerylib_0.1.4
#> [17] systemfonts_1.3.2 compiler_4.5.2 rstudioapi_0.17.1 tools_4.5.2
#> [21] ragg_1.4.0 bslib_0.10.0 evaluate_1.0.5 yaml_2.3.12
#> [25] otel_0.2.0 jsonlite_2.0.0 htmlwidgets_1.6.4 rlang_1.1.7
#> [29] fs_1.6.6