Overview

This vignette demonstrates the PONG2 R API using the built-in example dataset. PONG2 is primarily a command-line tool (pong2 impute, pong2 train) — see the other vignettes for CLI usage. The R API demonstrated here is the underlying interface used by the CLI.


1. Installation and Setup

# From CRAN
install.packages("PONG2")

# From GitHub
remotes::install_github("NormanLabUCD/PONG2")
library(PONG2)
#> PONG2: Please add '/home/suraju/.local/bin' to your PATH:
#>   export PATH="/home/suraju/.local/bin:$PATH"
#> ** initializing environment
#> PONG2 (Genotype Imputation with Attribute Bagging): v2.0.0
#> Supported by Streaming SIMD Extensions 2 (SSE2)
packageVersion("PONG2")
#> [1] '1.0.1'

2. Example Data

PONG2 includes a built-in example dataset with 50 samples and 200 SNPs in the KIR region (chr19), along with a pre-trained KIR3DL1 model.

# Load built-in example data
data(PONG2_example)

# SNP genotype data
cat("SNP genotype data:\n")
#> SNP genotype data:
cat("  Samples:", ncol(example_snp$genotype), "\n")
#>   Samples: 50
cat("  SNPs:   ", nrow(example_snp$genotype), "\n")
#>   SNPs:    200
cat("  Assembly:", example_snp$assembly, "\n")
#>   Assembly: hg19

# KIR allele table
cat("\nKIR allele table:\n")
#> 
#> KIR allele table:
cat("  Locus:  ", example_kir$locus, "\n")
#>   Locus:   KIR3DL1
cat("  Alleles:", length(example_kir$allele), "\n")
#>   Alleles: 0
cat("  Samples:", length(example_kir$value$sample.id), "\n")
#>   Samples: 50
head(example_kir$value, 3)
#>   sample.id     allele1     allele2
#> 1        S1 KIR3DL1*015 KIR3DL1*002
#> 2        S2 KIR3DL1*007 KIR3DL1*007
#> 3        S3 KIR3DL1*015 KIR3DL1*002

3. KIR Genotype Prediction

# Load the pre-trained example model
model <- hlaModelFromObj(example_mobj)

cat("Model summary:\n")
#> Model summary:
cat("  KIR locus:   ", model$hla.locus, "\n")
#>   KIR locus:    KIR3DL1
cat("  Classifiers: ", length(example_mobj$classifiers), "\n")
#>   Classifiers:  10
cat("  Alleles:     ", length(model$hla.allele), "\n")
#>   Alleles:      5

# Predict KIR genotypes on example SNP data
pred <- kirPredict(
  object  = model,
  snp     = example_snp,
  type    = "response+prob",
  verbose = FALSE
)

# View predictions
cat("\nFirst 5 predictions:\n")
#> 
#> First 5 predictions:
head(pred$value, 5)
#>   sample.id     allele1     allele2      prob
#> 1        S1 KIR3DL1*002 KIR3DL1*015 0.4767310
#> 2        S2 KIR3DL1*007 KIR3DL1*007 0.4504548
#> 3        S3 KIR3DL1*002 KIR3DL1*015 0.6123777
#> 4        S4 KIR3DL1*002 KIR3DL1*015 0.6277634
#> 5        S5 KIR3DL1*002 KIR3DL1*005 0.6243871

# Call rate
called    <- sum(!is.na(pred$value$allele1))
call_rate <- round(called / nrow(pred$value) * 100, 1)
cat("\nCall rate:", call_rate, "%\n")
#> 
#> Call rate: 100 %

# Clean up model handle
hlaClose(model)

4. Model Training

# Set up parallel cluster
cl <- parallel::makeCluster(2)

# Train a small KIR3DL1 model using example data
model_trained <- kirParallelAttrBagging(
  cl          = cl,
  hla         = example_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)

parallel::stopCluster(cl)

# View trained model summary
print(model_trained)
#> Gene: KIR3DL1
#> Training dataset: 50 samples X 200 SNPs
#>  # of KIR3DL1/S1 alleles: 5
#>  # of individual classifiers: 5
#>  total # of SNPs used: 43
#>  average # of SNPs in an individual classifier: 9.60, sd: 2.70, min: 6, max: 13
#>  average # of haplotypes in an individual classifier: 122.20, sd: 73.74, min: 45, max: 225
#>  average out-of-bag accuracy: 63.84%, sd: 4.48%, min: 59.52%, max: 68.75%
#> Genome assembly: hg19

# Save model as R object
mobj_trained <- hlaModelToObj(model_trained)
cat("\nModel saved with", length(mobj_trained$classifiers), 
    "classifiers\n")
#> 
#> Model saved with 5 classifiers

# Clean up
hlaClose(model_trained)

5. Model Evaluation

# Split example data into train and test
set.seed(42)
n       <- length(example_kir$value$sample.id)
idx     <- sample(1:n, floor(n * 0.7))
train_samples <- example_kir$value$sample.id[idx]
test_samples  <- example_kir$value$sample.id[-idx]

# Subset KIR and SNP data
train_kir <- hlaAlleleSubset(example_kir,
               match(train_samples, example_kir$value$sample.id))
test_kir  <- hlaAlleleSubset(example_kir,
               match(test_samples, example_kir$value$sample.id))

# Train model on training set
cl <- parallel::makeCluster(2)
model_eval <- kirParallelAttrBagging(
  cl          = cl,
  hla         = train_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)
parallel::stopCluster(cl)

# Subset SNP data to test samples
test_snp_idx <- match(test_samples, example_snp$sample.id)
test_snp     <- hlaGenoSubset(example_snp, samp.sel = test_snp_idx)

# Predict on test set
pred_eval <- kirPredict(
  object  = model_eval,
  snp     = test_snp,
  type    = "response+prob",
  verbose = FALSE
)

# Compare predictions to truth
comp <- hlaCompareAllele(
  TrueHLA        = test_kir,
  PredHLA        = pred_eval,
  call.threshold = 0.5,
  verbose        = FALSE
)

cat("Evaluation results:\n")
#> Evaluation results:
cat("  Haplotype accuracy:", 
    round(comp$overall$acc.haplo * 100, 1), "%\n")
#>   Haplotype accuracy: NaN %
cat("  Genotype accuracy: ", 
    round(comp$overall$acc.ind * 100, 1), "%\n")
#>   Genotype accuracy:  NaN %
cat("  Call rate:         ", 
    round(comp$overall$call.rate * 100, 1), "%\n")
#>   Call rate:          0 %

# Clean up
hlaClose(model_eval)

6. CLI Usage

For the complete command-line workflow used in the manuscript, see:

Or run directly from the terminal:

# These commands run from the terminal after adding pong2 to PATH
system("pong2 impute -i example/chr19 -o output -l KIR3DL1 -a hg19")
system("pong2 train --bfile example/chr19 --kfile example/kir_call.csv \
        --output output --locus KIR3DL1 --assembly hg19")

Session Info

sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
#> 
#> Random number generation:
#>  RNG:     L'Ecuyer-CMRG 
#>  Normal:  Inversion 
#>  Sample:  Rejection 
#>  
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] PONG2_1.0.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.39     desc_1.4.3        R6_2.6.1          fastmap_1.2.0    
#>  [5] xfun_0.54         cachem_1.1.0      parallel_4.5.2    knitr_1.51       
#>  [9] htmltools_0.5.9   rmarkdown_2.31    lifecycle_1.0.5   cli_3.6.6        
#> [13] sass_0.4.10       pkgdown_2.2.0     textshaping_1.0.5 jquerylib_0.1.4  
#> [17] systemfonts_1.3.2 compiler_4.5.2    rstudioapi_0.17.1 tools_4.5.2      
#> [21] ragg_1.4.0        bslib_0.10.0      evaluate_1.0.5    yaml_2.3.12      
#> [25] otel_0.2.0        jsonlite_2.0.0    htmlwidgets_1.6.4 rlang_1.1.7      
#> [29] fs_1.6.6