use std::collections::BTreeSet;
use std::path::Path;
use vareffect::{FastaReader, TranscriptStore, VarEffect};
fn load_store() -> TranscriptStore {
let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
let workspace_root = manifest_dir
.parent()
.expect("could not determine workspace root from CARGO_MANIFEST_DIR");
let path = workspace_root.join("data/vareffect/transcript_models.bin");
TranscriptStore::load_from_path(&path).unwrap_or_else(|e| {
panic!(
"failed to load transcript store from {}: {}. \
Run `cargo run -p vareffect-cli -- build` first.",
path.display(),
e,
)
})
}
fn load_fasta() -> FastaReader {
let path = std::env::var("FASTA_PATH")
.expect("FASTA_PATH env var must point to a GRCh38 genome binary");
FastaReader::open_with_patch_aliases(
Path::new(&path),
Some(
Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("workspace root")
.join("data/vareffect/patch_chrom_aliases.csv")
.as_ref(),
),
)
.unwrap_or_else(|e| panic!("failed to open FASTA at {path}: {e}"))
}
#[allow(dead_code)]
struct ExpectedAnnotation {
label: &'static str,
chrom: &'static str,
pos: u64,
ref_allele: &'static [u8],
alt_allele: &'static [u8],
transcript: &'static str,
expected_hgvs_c: Option<&'static str>,
expected_hgvs_p: Option<&'static str>,
expected_consequences: Option<&'static [&'static str]>,
assert_not_intergenic: bool,
expected_nmd: Option<bool>,
category: &'static str,
}
const VARIANTS: &[ExpectedAnnotation] = &[
ExpectedAnnotation {
label: "BRCA1 c.5266dup (3' shift + NMD)",
chrom: "chr17",
pos: 43_057_062,
ref_allele: b"G",
alt_allele: b"GG",
transcript: "NM_007294.4",
expected_hgvs_c: Some("NM_007294.4:c.5266dup"),
expected_hgvs_p: None,
expected_consequences: Some(&["frameshift_variant"]),
assert_not_intergenic: false,
expected_nmd: Some(true),
category: "A: 3' norm (dup, minus)",
},
ExpectedAnnotation {
label: "ERBB2 c.2313_2324dup 12bp (3' shifted ins→dup)",
chrom: "chr17",
pos: 39_724_729,
ref_allele: b"C",
alt_allele: b"CATACGTGATGGC",
transcript: "NM_004448.4",
expected_hgvs_c: Some("NM_004448.4:c.2313_2324dup"),
expected_hgvs_p: Some("p.Tyr772_Ala775dup"),
expected_consequences: None,
assert_not_intergenic: false,
expected_nmd: None,
category: "A: 3' norm (ins→dup, plus)",
},
ExpectedAnnotation {
label: "BRCA2 c.1813del single-base (plus)",
chrom: "chr13",
pos: 32_333_289,
ref_allele: b"AA",
alt_allele: b"A",
transcript: "NM_000059.4",
expected_hgvs_c: Some("NM_000059.4:c.1813del"),
expected_hgvs_p: None,
expected_consequences: None,
assert_not_intergenic: false,
expected_nmd: None,
category: "B: 3' norm (del, mono-A)",
},
ExpectedAnnotation {
label: "CFTR deltaF508 c.1521_1523del (no shift, regression)",
chrom: "chr7",
pos: 117_559_590,
ref_allele: b"TCTT",
alt_allele: b"T",
transcript: "NM_000492.4",
expected_hgvs_c: Some("NM_000492.4:c.1521_1523del"),
expected_hgvs_p: Some("p.Phe508del"),
expected_consequences: None,
assert_not_intergenic: false,
expected_nmd: None,
category: "B: 3' norm (del, no shift, regression)",
},
ExpectedAnnotation {
label: "BRCA2 c.5946_5947insAA non-dup (plus)",
chrom: "chr13",
pos: 32_340_300,
ref_allele: b"T",
alt_allele: b"TAA",
transcript: "NM_000059.4",
expected_hgvs_c: Some("NM_000059.4:c.5946_5947insAA"),
expected_hgvs_p: None,
expected_consequences: None,
assert_not_intergenic: false,
expected_nmd: None,
category: "C: 3' norm (ins, non-dup)",
},
ExpectedAnnotation {
label: "EGFR exon20 3bp ins (non-dup, codon-boundary regression)",
chrom: "chr7",
pos: 55_181_318,
ref_allele: b"C",
alt_allele: b"CGGT",
transcript: "NM_005228.5",
expected_hgvs_c: Some("NM_005228.5:c.2310_2311insGGT"),
expected_hgvs_p: Some("p.Asp770_Asn771insGly"),
expected_consequences: None,
assert_not_intergenic: false,
expected_nmd: None,
category: "C: 3' norm (ins, non-repeat, regression)",
},
ExpectedAnnotation {
label: "Gene desert chr5:67.5M (intergenic)",
chrom: "chr5",
pos: 67_500_000,
ref_allele: b"T",
alt_allele: b"A",
transcript: "",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["intergenic_variant"]),
assert_not_intergenic: false,
expected_nmd: None,
category: "D: intergenic",
},
ExpectedAnnotation {
label: "Gene desert chr13:19M (intergenic)",
chrom: "chr13",
pos: 19_000_000,
ref_allele: b"C",
alt_allele: b"T",
transcript: "",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["intergenic_variant"]),
assert_not_intergenic: false,
expected_nmd: None,
category: "D: intergenic",
},
ExpectedAnnotation {
label: "TP53 R248W — NOT intergenic (negative control)",
chrom: "chr17",
pos: 7_674_220,
ref_allele: b"G",
alt_allele: b"A",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["missense_variant"]),
assert_not_intergenic: true,
expected_nmd: None,
category: "D: NOT intergenic (negative control)",
},
ExpectedAnnotation {
label: "TP53 R196X stop_gained (NMD true, early CDS)",
chrom: "chr17",
pos: 7_674_944,
ref_allele: b"G",
alt_allele: b"A",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["stop_gained"]),
assert_not_intergenic: false,
expected_nmd: Some(true),
category: "E: NMD (stop_gained, early, true)",
},
ExpectedAnnotation {
label: "APC R1450X stop_gained (NMD false, last exon)",
chrom: "chr5",
pos: 112_839_941,
ref_allele: b"C",
alt_allele: b"T",
transcript: "NM_000038.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["stop_gained"]),
assert_not_intergenic: false,
expected_nmd: Some(false),
category: "E: NMD (stop_gained, last exon, false)",
},
ExpectedAnnotation {
label: "TP53 R342X stop_gained (NMD true, penultimate, dist=77)",
chrom: "chr17",
pos: 7_670_684,
ref_allele: b"G",
alt_allele: b"A",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["stop_gained"]),
assert_not_intergenic: false,
expected_nmd: Some(true),
category: "E: NMD (stop_gained, penultimate, true)",
},
ExpectedAnnotation {
label: "BRCA1 c.68_69del frameshift (NMD true, early CDS)",
chrom: "chr17",
pos: 43_124_026,
ref_allele: b"ACT",
alt_allele: b"A",
transcript: "NM_007294.4",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["frameshift_variant"]),
assert_not_intergenic: false,
expected_nmd: Some(true),
category: "E: NMD (frameshift, early, true)",
},
ExpectedAnnotation {
label: "APC c.3927_3931del frameshift (NMD false, last exon)",
chrom: "chr5",
pos: 112_839_519,
ref_allele: b"AAAAGA",
alt_allele: b"A",
transcript: "NM_000038.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["frameshift_variant"]),
assert_not_intergenic: false,
expected_nmd: Some(false),
category: "E: NMD (frameshift, last exon, false)",
},
ExpectedAnnotation {
label: "PRNP Q160X stop_gained (NMD false, single-exon gene)",
chrom: "chr20",
pos: 4_699_697,
ref_allele: b"C",
alt_allele: b"T",
transcript: "NM_000311.5",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["stop_gained"]),
assert_not_intergenic: false,
expected_nmd: Some(false),
category: "E: NMD (single-exon, false)",
},
ExpectedAnnotation {
label: "TP53 c.1051A>T Lys351Ter (NMD false, dist=50, boundary)",
chrom: "chr17",
pos: 7_670_657,
ref_allele: b"T",
alt_allele: b"A",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["stop_gained"]),
assert_not_intergenic: false,
expected_nmd: Some(false),
category: "E: NMD (boundary, dist=50, false)",
},
ExpectedAnnotation {
label: "TP53 c.1050del frameshift (NMD true, dist=51, boundary)",
chrom: "chr17",
pos: 7_670_657,
ref_allele: b"TG",
alt_allele: b"T",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["frameshift_variant"]),
assert_not_intergenic: false,
expected_nmd: Some(true),
category: "E: NMD (boundary, dist=51, true)",
},
ExpectedAnnotation {
label: "TP53 R248R synonymous (NMD false, no PTC)",
chrom: "chr17",
pos: 7_674_220,
ref_allele: b"G",
alt_allele: b"T",
transcript: "NM_000546.6",
expected_hgvs_c: None,
expected_hgvs_p: None,
expected_consequences: Some(&["synonymous_variant"]),
assert_not_intergenic: false,
expected_nmd: Some(false),
category: "F: synonymous (no NMD)",
},
];
fn check_variant(ve: &VarEffect, exp: &ExpectedAnnotation) -> Result<Vec<String>, String> {
let results = ve
.annotate(exp.chrom, exp.pos, exp.ref_allele, exp.alt_allele)
.map_err(|e| format!("annotate error: {e}"))?;
let result = if exp.transcript.is_empty() {
results
.iter()
.find(|r| r.transcript.is_empty())
.ok_or_else(|| {
let detail: Vec<String> = results
.iter()
.map(|r| {
let csqs: Vec<&str> = r.consequences.iter().map(|c| c.as_str()).collect();
format!("{}:{csqs:?}", r.transcript)
})
.collect();
format!("no intergenic result; found: {detail:?}")
})?
} else {
results
.iter()
.find(|r| r.transcript == exp.transcript)
.ok_or_else(|| {
let found: Vec<&str> = results.iter().map(|r| r.transcript.as_str()).collect();
format!(
"transcript {} not found in results (found: {found:?})",
exp.transcript,
)
})?
};
let mut mismatches: Vec<String> = Vec::new();
if let Some(expected) = exp.expected_hgvs_c
&& result.hgvs_c.as_deref() != Some(expected)
{
mismatches.push(format!(
"hgvs_c: expected {:?}, got {:?}",
expected, result.hgvs_c,
));
}
if let Some(expected) = exp.expected_hgvs_p
&& result.hgvs_p.as_deref() != Some(expected)
{
mismatches.push(format!(
"hgvs_p: expected {:?}, got {:?}",
expected, result.hgvs_p,
));
}
if let Some(expected_csq) = exp.expected_consequences {
let actual: BTreeSet<&str> = result.consequences.iter().map(|c| c.as_str()).collect();
let missing: Vec<&&str> = expected_csq
.iter()
.filter(|c| !actual.contains(**c))
.collect();
if !missing.is_empty() {
mismatches.push(format!(
"consequences: missing {missing:?} from actual {actual:?}",
));
}
}
if exp.assert_not_intergenic
&& result
.consequences
.iter()
.any(|c| c.as_str() == "intergenic_variant")
{
mismatches.push("intergenic_variant found but should NOT be present".into());
}
if let Some(expected_nmd) = exp.expected_nmd
&& result.predicts_nmd != expected_nmd
{
mismatches.push(format!(
"predicts_nmd: expected {expected_nmd}, got {}",
result.predicts_nmd,
));
}
Ok(mismatches)
}
#[test]
#[ignore]
fn vep_concordance_normalization_18() {
let ve = VarEffect::new(load_store(), load_fasta());
let total = VARIANTS.len();
let mut pass = 0u32;
let mut fail = 0u32;
let mut skip = 0u32;
let mut failures: Vec<String> = Vec::new();
for (i, exp) in VARIANTS.iter().enumerate() {
let num = i + 1;
match check_variant(&ve, exp) {
Err(msg) if msg.contains("not found") => {
eprintln!(
" [{num:>2}/{total}] SKIP {} [{}]: {msg}",
exp.label, exp.category,
);
skip += 1;
}
Err(msg) => {
eprintln!(
" [{num:>2}/{total}] FAIL {} [{}]: {msg}",
exp.label, exp.category,
);
failures.push(format!("#{num} {}: {msg}", exp.label));
fail += 1;
}
Ok(ref mismatches) if mismatches.is_empty() => {
eprintln!(
" [{num:>2}/{total}] PASS {} [{}]",
exp.label, exp.category,
);
pass += 1;
}
Ok(mismatches) => {
eprintln!(
" [{num:>2}/{total}] FAIL {} [{}]",
exp.label, exp.category,
);
let detail = mismatches.join("\n ");
failures.push(format!("#{num} {}:\n {detail}", exp.label));
fail += 1;
}
}
}
eprintln!(
"\n=== Normalization concordance: {pass}/{} passed, {skip} skipped ===",
pass + fail,
);
if !failures.is_empty() {
eprintln!("\nFailures:");
for f in &failures {
eprintln!(" {f}");
}
}
assert_eq!(
fail,
0,
"{fail} of {} variants failed normalization concordance check",
pass + fail,
);
}