use crate::codon::{complement, reverse_complement};
use crate::error::VarEffectError;
use crate::fasta::FastaReader;
use crate::locate::LocateIndex;
use crate::transcript::TranscriptStore;
use crate::types::{Biotype, Strand, TranscriptModel};
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GenomicVariant {
pub chrom: String,
pub pos: u64,
pub ref_allele: Vec<u8>,
pub alt_allele: Vec<u8>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResolvedHgvsC {
pub variant: GenomicVariant,
pub resolved_accession: String,
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct HgvsCPosition {
base: i64,
is_3utr: bool,
intronic_offset: Option<i64>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
enum HgvsCChange {
Substitution { ref_base: u8, alt_base: u8 },
Deletion,
Insertion { bases: Vec<u8> },
Duplication,
Delins { bases: Vec<u8> },
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct ParsedHgvsC {
accession: String,
start: HgvsCPosition,
end: Option<HgvsCPosition>,
change: HgvsCChange,
}
fn parse_hgvs_c(input: &str) -> Result<ParsedHgvsC, VarEffectError> {
let err = |msg: &str| VarEffectError::HgvsParseError(format!("{msg}: \"{input}\""));
let (accession, desc) = input
.split_once(":c.")
.ok_or_else(|| err("missing ':c.' separator"))?;
if accession.is_empty() {
return Err(err("empty accession"));
}
if let Some(idx) = desc.find("delins") {
let pos_str = &desc[..idx];
let inserted_str = &desc[idx + 6..]; let inserted = parse_seq(inserted_str, input)?;
let (start, end) = parse_position_range(pos_str, input)?;
Ok(ParsedHgvsC {
accession: accession.to_string(),
start,
end,
change: HgvsCChange::Delins { bases: inserted },
})
} else if let Some(ins_idx) = desc.find("ins")
&& desc.contains('_')
{
let pos_str = &desc[..ins_idx];
let inserted_str = &desc[ins_idx + 3..]; let inserted = parse_seq(inserted_str, input)?;
let (start, end_opt) = parse_position_range(pos_str, input)?;
let end = end_opt.ok_or_else(|| {
VarEffectError::HgvsParseError(format!(
"insertion requires two flanking positions: \"{input}\""
))
})?;
Ok(ParsedHgvsC {
accession: accession.to_string(),
start,
end: Some(end),
change: HgvsCChange::Insertion { bases: inserted },
})
} else if let Some(idx) = desc.find("del") {
let pos_str = &desc[..idx];
let (start, end) = parse_position_range(pos_str, input)?;
Ok(ParsedHgvsC {
accession: accession.to_string(),
start,
end,
change: HgvsCChange::Deletion,
})
} else if let Some(idx) = desc.find("dup") {
let pos_str = &desc[..idx];
let (start, end) = parse_position_range(pos_str, input)?;
Ok(ParsedHgvsC {
accession: accession.to_string(),
start,
end,
change: HgvsCChange::Duplication,
})
} else if let Some(gt_idx) = desc.find('>') {
if gt_idx == 0 {
return Err(err("substitution missing REF base"));
}
let ref_base = desc.as_bytes()[gt_idx - 1];
let pos_str = &desc[..gt_idx - 1];
let alt_str = &desc[gt_idx + 1..];
if alt_str.len() != 1 {
return Err(err("substitution ALT must be a single base"));
}
let alt_base = alt_str.as_bytes()[0];
validate_base(ref_base, input)?;
validate_base(alt_base, input)?;
let start = parse_single_position(pos_str, input)?;
Ok(ParsedHgvsC {
accession: accession.to_string(),
start,
end: None,
change: HgvsCChange::Substitution { ref_base, alt_base },
})
} else {
Err(err(
"unrecognized change type (expected del/dup/ins/delins or >)",
))
}
}
fn parse_position_range(
s: &str,
input: &str,
) -> Result<(HgvsCPosition, Option<HgvsCPosition>), VarEffectError> {
if let Some(sep_idx) = find_range_separator(s) {
let left = parse_single_position(&s[..sep_idx], input)?;
let right = parse_single_position(&s[sep_idx + 1..], input)?;
Ok((left, Some(right)))
} else {
let pos = parse_single_position(s, input)?;
Ok((pos, None))
}
}
fn find_range_separator(s: &str) -> Option<usize> {
let bytes = s.as_bytes();
let mut i = 0;
if i < bytes.len() && (bytes[i] == b'*' || bytes[i] == b'-') {
i += 1;
}
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
i += 1;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
}
if i < bytes.len() && bytes[i] == b'_' {
Some(i)
} else {
None
}
}
fn parse_single_position(s: &str, input: &str) -> Result<HgvsCPosition, VarEffectError> {
let err = |msg: &str| {
VarEffectError::HgvsParseError(format!("{msg} in position \"{s}\": \"{input}\""))
};
if s.is_empty() {
return Err(err("empty position"));
}
let bytes = s.as_bytes();
let mut i = 0;
let is_3utr = bytes[i] == b'*';
if is_3utr {
i += 1;
}
let is_negative = !is_3utr && i < bytes.len() && bytes[i] == b'-';
if is_negative {
i += 1;
}
let digit_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == digit_start {
return Err(err("no digits in base position"));
}
let base_val: i64 = s[digit_start..i]
.parse()
.map_err(|_| err("base position integer overflow"))?;
if base_val == 0 {
return Err(err("c.0 is invalid (HGVS is 1-based)"));
}
let base = if is_negative { -base_val } else { base_val };
let intronic_offset = if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
let sign_positive = bytes[i] == b'+';
i += 1;
let offset_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == offset_start {
return Err(err("no digits in intronic offset"));
}
let offset_val: i64 = s[offset_start..i]
.parse()
.map_err(|_| err("intronic offset integer overflow"))?;
Some(if sign_positive {
offset_val
} else {
-offset_val
})
} else {
None
};
if i != bytes.len() {
return Err(err("unexpected trailing characters"));
}
Ok(HgvsCPosition {
base,
is_3utr,
intronic_offset,
})
}
fn parse_seq(s: &str, input: &str) -> Result<Vec<u8>, VarEffectError> {
if s.is_empty() {
return Err(VarEffectError::HgvsParseError(format!(
"empty nucleotide sequence: \"{input}\""
)));
}
let mut bases = Vec::with_capacity(s.len());
for &b in s.as_bytes() {
let upper = b.to_ascii_uppercase();
validate_base(upper, input)?;
bases.push(upper);
}
Ok(bases)
}
fn validate_base(b: u8, input: &str) -> Result<(), VarEffectError> {
match b.to_ascii_uppercase() {
b'A' | b'C' | b'G' | b'T' => Ok(()),
_ => Err(VarEffectError::HgvsParseError(format!(
"invalid nucleotide '{}': \"{input}\"",
b as char,
))),
}
}
fn resolve_position(
pos: &HgvsCPosition,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<u64, VarEffectError> {
let accession = &transcript.accession;
let anchor = if pos.is_3utr {
resolve_3utr(pos.base, transcript, index)?
} else if pos.base < 0 {
resolve_5utr(pos.base, transcript, index)?
} else {
resolve_cds(pos.base, transcript, index)?
};
match pos.intronic_offset {
None => Ok(anchor),
Some(offset) => {
let genomic = match transcript.strand {
Strand::Plus => {
if offset >= 0 {
anchor.checked_add(offset as u64).ok_or_else(|| {
VarEffectError::PositionOutOfRange {
accession: accession.clone(),
detail: format!("intronic offset {offset} overflows"),
}
})?
} else {
anchor.checked_sub(offset.unsigned_abs()).ok_or_else(|| {
VarEffectError::PositionOutOfRange {
accession: accession.clone(),
detail: format!("intronic offset {offset} underflows"),
}
})?
}
}
Strand::Minus => {
if offset >= 0 {
anchor.checked_sub(offset as u64).ok_or_else(|| {
VarEffectError::PositionOutOfRange {
accession: accession.clone(),
detail: format!("intronic offset {offset} underflows"),
}
})?
} else {
anchor.checked_add(offset.unsigned_abs()).ok_or_else(|| {
VarEffectError::PositionOutOfRange {
accession: accession.clone(),
detail: format!("intronic offset {offset} overflows"),
}
})?
}
}
};
Ok(genomic)
}
}
}
fn resolve_cds(
base: i64,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<u64, VarEffectError> {
debug_assert!(base > 0);
let cds_offset = (base - 1) as u32;
let cumulative = index.cumulative_cds();
if cds_offset >= index.total_cds_length() {
return Err(VarEffectError::PositionOutOfRange {
accession: transcript.accession.clone(),
detail: format!("c.{base} exceeds CDS length {}", index.total_cds_length()),
});
}
let seg_idx = cumulative.partition_point(|&c| c <= cds_offset) - 1;
let seg = &transcript.cds_segments[seg_idx];
let local = cds_offset - cumulative[seg_idx];
let genomic = match transcript.strand {
Strand::Plus => seg.genomic_start + local as u64,
Strand::Minus => seg.genomic_end - 1 - local as u64,
};
Ok(genomic)
}
fn resolve_5utr(
base: i64,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<u64, VarEffectError> {
debug_assert!(base < 0);
let mut remaining = base.unsigned_abs();
let exons = &transcript.exons;
let cds_start_exon = index.cds_start_exon_idx().ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 5'UTR position but no cds_start_exon_idx",
transcript.accession,
))
})?;
match transcript.strand {
Strand::Plus => {
let cds_start = transcript.cds_genomic_start.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 5'UTR position but no cds_genomic_start",
transcript.accession,
))
})?;
let available = cds_start - exons[cds_start_exon].genomic_start;
if remaining <= available {
return Ok(cds_start - remaining);
}
remaining -= available;
for i in (0..cds_start_exon).rev() {
let exon_len = exons[i].genomic_end - exons[i].genomic_start;
if remaining <= exon_len {
return Ok(exons[i].genomic_end - remaining);
}
remaining -= exon_len;
}
}
Strand::Minus => {
let cds_boundary = transcript.cds_genomic_end.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 5'UTR position but no cds_genomic_end",
transcript.accession,
))
})?;
let available = exons[cds_start_exon].genomic_end - cds_boundary;
if remaining <= available {
return Ok(cds_boundary + remaining - 1);
}
remaining -= available;
for i in (0..cds_start_exon).rev() {
let exon_len = exons[i].genomic_end - exons[i].genomic_start;
if remaining <= exon_len {
return Ok(exons[i].genomic_start + remaining - 1);
}
remaining -= exon_len;
}
}
}
Err(VarEffectError::PositionOutOfRange {
accession: transcript.accession.clone(),
detail: format!("c.{base} exceeds 5'UTR exonic extent"),
})
}
fn resolve_3utr(
base: i64,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<u64, VarEffectError> {
debug_assert!(base > 0);
let mut remaining = base as u64;
let exons = &transcript.exons;
let cds_end_exon = index.cds_end_exon_idx().ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 3'UTR position but no cds_end_exon_idx",
transcript.accession,
))
})?;
match transcript.strand {
Strand::Plus => {
let cds_end = transcript.cds_genomic_end.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 3'UTR position but no cds_genomic_end",
transcript.accession,
))
})?;
let available = exons[cds_end_exon].genomic_end - cds_end;
if remaining <= available {
return Ok(cds_end + remaining - 1);
}
remaining -= available;
for exon in &exons[(cds_end_exon + 1)..] {
let exon_len = exon.genomic_end - exon.genomic_start;
if remaining <= exon_len {
return Ok(exon.genomic_start + remaining - 1);
}
remaining -= exon_len;
}
}
Strand::Minus => {
let cds_boundary = transcript.cds_genomic_start.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: 3'UTR position but no cds_genomic_start",
transcript.accession,
))
})?;
let available = cds_boundary - exons[cds_end_exon].genomic_start;
if remaining <= available {
return Ok(cds_boundary - remaining);
}
remaining -= available;
for exon in &exons[(cds_end_exon + 1)..] {
let exon_len = exon.genomic_end - exon.genomic_start;
if remaining <= exon_len {
return Ok(exon.genomic_end - remaining);
}
remaining -= exon_len;
}
}
}
Err(VarEffectError::PositionOutOfRange {
accession: transcript.accession.clone(),
detail: format!("c.*{base} exceeds 3'UTR exonic extent"),
})
}
fn to_plus_strand_base(base: u8, strand: Strand) -> u8 {
match strand {
Strand::Plus => base.to_ascii_uppercase(),
Strand::Minus => complement(base).to_ascii_uppercase(),
}
}
fn to_plus_strand_seq(bases: &[u8], strand: Strand) -> Vec<u8> {
match strand {
Strand::Plus => bases.iter().map(|b| b.to_ascii_uppercase()).collect(),
Strand::Minus => {
let mut rc = reverse_complement(bases);
rc.make_ascii_uppercase();
rc
}
}
}
fn build_substitution(
parsed: &ParsedHgvsC,
transcript: &TranscriptModel,
index: &LocateIndex,
fasta: &FastaReader,
hgvs: &str,
) -> Result<GenomicVariant, VarEffectError> {
let HgvsCChange::Substitution { ref_base, alt_base } = parsed.change else {
unreachable!("build_substitution is only called for Substitution variants")
};
let genomic_pos = resolve_position(&parsed.start, transcript, index)?;
let chrom = &transcript.chrom;
let plus_ref = to_plus_strand_base(ref_base, transcript.strand);
let plus_alt = to_plus_strand_base(alt_base, transcript.strand);
let fasta_ref = fasta.fetch_base(chrom, genomic_pos)?;
if fasta_ref.to_ascii_uppercase() != plus_ref {
return Err(VarEffectError::HgvsRefMismatch {
hgvs: hgvs.to_string(),
chrom: chrom.clone(),
pos: genomic_pos,
expected: String::from(fasta_ref as char),
got: String::from(plus_ref as char),
});
}
Ok(GenomicVariant {
chrom: chrom.clone(),
pos: genomic_pos,
ref_allele: vec![plus_ref],
alt_allele: vec![plus_alt],
})
}
fn resolve_range(
start: &HgvsCPosition,
end: Option<&HgvsCPosition>,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<(u64, u64), VarEffectError> {
let g_start = resolve_position(start, transcript, index)?;
let g_end = match end {
Some(e) => resolve_position(e, transcript, index)?,
None => g_start,
};
Ok((g_start.min(g_end), g_start.max(g_end)))
}
fn build_deletion(
parsed: &ParsedHgvsC,
transcript: &TranscriptModel,
index: &LocateIndex,
fasta: &FastaReader,
) -> Result<GenomicVariant, VarEffectError> {
let (gstart, gend) = resolve_range(&parsed.start, parsed.end.as_ref(), transcript, index)?;
let chrom = &transcript.chrom;
let anchor_pos = gstart
.checked_sub(1)
.ok_or_else(|| VarEffectError::PositionOutOfRange {
accession: transcript.accession.clone(),
detail: "deletion at position 0 has no anchor base".into(),
})?;
let anchor_base = fasta.fetch_base(chrom, anchor_pos)?;
let deleted = fasta.fetch_sequence(chrom, gstart, gend + 1)?;
let mut ref_allele = Vec::with_capacity(1 + deleted.len());
ref_allele.push(anchor_base);
ref_allele.extend_from_slice(&deleted);
Ok(GenomicVariant {
chrom: chrom.clone(),
pos: anchor_pos,
ref_allele,
alt_allele: vec![anchor_base],
})
}
fn build_duplication(
parsed: &ParsedHgvsC,
transcript: &TranscriptModel,
index: &LocateIndex,
fasta: &FastaReader,
) -> Result<GenomicVariant, VarEffectError> {
let (gstart, gend) = resolve_range(&parsed.start, parsed.end.as_ref(), transcript, index)?;
let chrom = &transcript.chrom;
let dup_seq = fasta.fetch_sequence(chrom, gstart, gend + 1)?;
let anchor_base = fasta.fetch_base(chrom, gend)?;
let mut alt_allele = Vec::with_capacity(1 + dup_seq.len());
alt_allele.push(anchor_base);
alt_allele.extend_from_slice(&dup_seq);
Ok(GenomicVariant {
chrom: chrom.clone(),
pos: gend,
ref_allele: vec![anchor_base],
alt_allele,
})
}
fn build_insertion(
parsed: &ParsedHgvsC,
transcript: &TranscriptModel,
index: &LocateIndex,
fasta: &FastaReader,
) -> Result<GenomicVariant, VarEffectError> {
let HgvsCChange::Insertion { bases } = &parsed.change else {
unreachable!("build_insertion is only called for Insertion variants")
};
let end_pos = parsed
.end
.as_ref()
.expect("insertion must have end position");
let g_left = resolve_position(&parsed.start, transcript, index)?;
let g_right = resolve_position(end_pos, transcript, index)?;
let chrom = &transcript.chrom;
let anchor_pos = g_left.min(g_right);
let anchor_base = fasta.fetch_base(chrom, anchor_pos)?;
let plus_inserted = to_plus_strand_seq(bases, transcript.strand);
let mut alt_allele = Vec::with_capacity(1 + plus_inserted.len());
alt_allele.push(anchor_base);
alt_allele.extend_from_slice(&plus_inserted);
Ok(GenomicVariant {
chrom: chrom.clone(),
pos: anchor_pos,
ref_allele: vec![anchor_base],
alt_allele,
})
}
fn build_delins(
parsed: &ParsedHgvsC,
transcript: &TranscriptModel,
index: &LocateIndex,
fasta: &FastaReader,
) -> Result<GenomicVariant, VarEffectError> {
let HgvsCChange::Delins { bases } = &parsed.change else {
unreachable!("build_delins is only called for Delins variants")
};
let (gstart, gend) = resolve_range(&parsed.start, parsed.end.as_ref(), transcript, index)?;
let chrom = &transcript.chrom;
let ref_allele = fasta.fetch_sequence(chrom, gstart, gend + 1)?;
let alt_allele = to_plus_strand_seq(bases, transcript.strand);
Ok(GenomicVariant {
chrom: chrom.clone(),
pos: gstart,
ref_allele,
alt_allele,
})
}
fn lookup_transcript<'a>(
accession: &str,
store: &'a TranscriptStore,
) -> Result<(&'a TranscriptModel, &'a LocateIndex), VarEffectError> {
if let Some(pair) = store.get_by_accession(accession) {
return Ok(pair);
}
let base = accession.split_once('.').map_or(accession, |(b, _)| b);
let prefix = format!("{base}.");
let mut best: Option<(&TranscriptModel, u32)> = None;
for tx in store.transcripts() {
if let Some(suffix) = tx.accession.strip_prefix(&prefix)
&& let Ok(ver) = suffix.parse::<u32>()
&& best.is_none_or(|(_, v)| ver > v)
{
best = Some((tx, ver));
}
}
if let Some((tx, _)) = best
&& let Some(pair) = store.get_by_accession(&tx.accession)
{
return Ok(pair);
}
Err(VarEffectError::TranscriptNotFound {
accession: accession.to_string(),
})
}
pub(crate) fn resolve_hgvs_c_with_meta(
hgvs: &str,
store: &TranscriptStore,
fasta: &FastaReader,
) -> Result<ResolvedHgvsC, VarEffectError> {
let parsed = parse_hgvs_c(hgvs)?;
let (transcript, index) = lookup_transcript(&parsed.accession, store)?;
if !matches!(transcript.biotype, Biotype::ProteinCoding) {
return Err(VarEffectError::HgvsParseError(format!(
"c. notation requires a protein-coding transcript, but {} is {:?}",
transcript.accession, transcript.biotype,
)));
}
let resolved_accession = transcript.accession.clone();
let variant = match &parsed.change {
HgvsCChange::Substitution { .. } => {
build_substitution(&parsed, transcript, index, fasta, hgvs)
}
HgvsCChange::Deletion => build_deletion(&parsed, transcript, index, fasta),
HgvsCChange::Duplication => build_duplication(&parsed, transcript, index, fasta),
HgvsCChange::Insertion { .. } => build_insertion(&parsed, transcript, index, fasta),
HgvsCChange::Delins { .. } => build_delins(&parsed, transcript, index, fasta),
}?;
Ok(ResolvedHgvsC {
variant,
resolved_accession,
})
}
pub(crate) fn resolve_hgvs_c(
hgvs: &str,
store: &TranscriptStore,
fasta: &FastaReader,
) -> Result<GenomicVariant, VarEffectError> {
resolve_hgvs_c_with_meta(hgvs, store, fasta).map(|r| r.variant)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fasta::write_genome_binary;
use crate::test_fixtures::{minus_strand_coding, plus_strand_coding};
use tempfile::TempDir;
#[test]
fn parse_cds_substitution() {
let p = parse_hgvs_c("NM_000546.6:c.742C>T").unwrap();
assert_eq!(p.accession, "NM_000546.6");
assert_eq!(
p.start,
HgvsCPosition {
base: 742,
is_3utr: false,
intronic_offset: None
}
);
assert!(p.end.is_none());
assert_eq!(
p.change,
HgvsCChange::Substitution {
ref_base: b'C',
alt_base: b'T'
}
);
}
#[test]
fn parse_5utr_substitution() {
let p = parse_hgvs_c("NM_000546.6:c.-14G>A").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: -14,
is_3utr: false,
intronic_offset: None
}
);
assert_eq!(
p.change,
HgvsCChange::Substitution {
ref_base: b'G',
alt_base: b'A'
}
);
}
#[test]
fn parse_3utr_substitution() {
let p = parse_hgvs_c("NM_000546.6:c.*32G>A").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: 32,
is_3utr: true,
intronic_offset: None
}
);
}
#[test]
fn parse_intronic_donor() {
let p = parse_hgvs_c("NM_000546.6:c.672+1G>A").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: 672,
is_3utr: false,
intronic_offset: Some(1)
}
);
}
#[test]
fn parse_intronic_acceptor() {
let p = parse_hgvs_c("NM_000546.6:c.673-2A>G").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: 673,
is_3utr: false,
intronic_offset: Some(-2)
}
);
}
#[test]
fn parse_intronic_5utr() {
let p = parse_hgvs_c("NM_XXX.1:c.-15+1G>A").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: -15,
is_3utr: false,
intronic_offset: Some(1)
}
);
}
#[test]
fn parse_intronic_3utr() {
let p = parse_hgvs_c("NM_XXX.1:c.*37+1G>A").unwrap();
assert_eq!(
p.start,
HgvsCPosition {
base: 37,
is_3utr: true,
intronic_offset: Some(1)
}
);
}
#[test]
fn parse_single_del() {
let p = parse_hgvs_c("NM_000546.6:c.19del").unwrap();
assert_eq!(p.start.base, 19);
assert!(p.end.is_none());
assert_eq!(p.change, HgvsCChange::Deletion);
}
#[test]
fn parse_range_del() {
let p = parse_hgvs_c("NM_000546.6:c.19_21del").unwrap();
assert_eq!(p.start.base, 19);
assert_eq!(p.end.as_ref().unwrap().base, 21);
assert_eq!(p.change, HgvsCChange::Deletion);
}
#[test]
fn parse_single_dup() {
let p = parse_hgvs_c("NM_000546.6:c.20dup").unwrap();
assert_eq!(p.start.base, 20);
assert!(p.end.is_none());
assert_eq!(p.change, HgvsCChange::Duplication);
}
#[test]
fn parse_range_dup() {
let p = parse_hgvs_c("NM_000546.6:c.20_23dup").unwrap();
assert_eq!(p.start.base, 20);
assert_eq!(p.end.as_ref().unwrap().base, 23);
}
#[test]
fn parse_insertion() {
let p = parse_hgvs_c("NM_000546.6:c.76_77insT").unwrap();
assert_eq!(p.start.base, 76);
assert_eq!(p.end.as_ref().unwrap().base, 77);
assert_eq!(p.change, HgvsCChange::Insertion { bases: vec![b'T'] });
}
#[test]
fn parse_delins_range() {
let p = parse_hgvs_c("NM_000546.6:c.112_117delinsTG").unwrap();
assert_eq!(p.start.base, 112);
assert_eq!(p.end.as_ref().unwrap().base, 117);
assert_eq!(
p.change,
HgvsCChange::Delins {
bases: vec![b'T', b'G']
}
);
}
#[test]
fn parse_delins_single() {
let p = parse_hgvs_c("NM_000546.6:c.113delinsTACT").unwrap();
assert_eq!(p.start.base, 113);
assert!(p.end.is_none());
assert_eq!(
p.change,
HgvsCChange::Delins {
bases: vec![b'T', b'A', b'C', b'T']
}
);
}
#[test]
fn parse_invalid_missing_c_dot() {
assert!(parse_hgvs_c("NM_000546.6:742C>T").is_err());
}
#[test]
fn parse_invalid_c_zero() {
assert!(parse_hgvs_c("NM_000546.6:c.0A>G").is_err());
}
#[test]
fn parse_invalid_empty_accession() {
assert!(parse_hgvs_c(":c.742C>T").is_err());
}
#[test]
fn parse_invalid_bad_base() {
assert!(parse_hgvs_c("NM_000546.6:c.742X>T").is_err());
}
#[test]
fn parse_invalid_insertion_single_pos() {
assert!(parse_hgvs_c("NM_000546.6:c.76insT").is_err());
}
fn single_tx_store(tx: TranscriptModel) -> TranscriptStore {
TranscriptStore::from_transcripts(vec![tx])
}
#[test]
fn resolve_cds_first_base_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 1,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 1500);
}
#[test]
fn resolve_cds_first_base_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_MINUS.1").unwrap();
let pos = HgvsCPosition {
base: 1,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 19499);
}
#[test]
fn resolve_cds_cross_segment_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 501,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 3000);
}
#[test]
fn resolve_cds_last_base_of_segment_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 500,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 1999);
}
#[test]
fn resolve_5utr_same_exon_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: -1,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 1499);
}
#[test]
fn resolve_5utr_same_exon_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_MINUS.1").unwrap();
let pos = HgvsCPosition {
base: -1,
is_3utr: false,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 19500);
}
#[test]
fn resolve_3utr_same_exon_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 1,
is_3utr: true,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 4500);
}
#[test]
fn resolve_3utr_same_exon_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_MINUS.1").unwrap();
let pos = HgvsCPosition {
base: 1,
is_3utr: true,
intronic_offset: None,
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 10999);
}
#[test]
fn resolve_intronic_donor_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 500,
is_3utr: false,
intronic_offset: Some(5),
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 2004);
}
#[test]
fn resolve_intronic_acceptor_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 501,
is_3utr: false,
intronic_offset: Some(-4),
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 2996);
}
#[test]
fn resolve_intronic_donor_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_MINUS.1").unwrap();
let pos = HgvsCPosition {
base: 1500,
is_3utr: false,
intronic_offset: Some(1),
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 17999);
}
#[test]
fn resolve_intronic_acceptor_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_MINUS.1").unwrap();
let pos = HgvsCPosition {
base: 1501,
is_3utr: false,
intronic_offset: Some(-2),
};
assert_eq!(resolve_position(&pos, tx, idx).unwrap(), 16001);
}
#[test]
fn resolve_out_of_range() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (tx, idx) = store.get_by_accession("NM_TEST_PLUS.1").unwrap();
let pos = HgvsCPosition {
base: 9999,
is_3utr: false,
intronic_offset: None,
};
assert!(matches!(
resolve_position(&pos, tx, idx),
Err(VarEffectError::PositionOutOfRange { .. })
));
}
fn build_test_fasta() -> (TempDir, FastaReader) {
let chr1 = build_repeating_seq(6000);
let chr17 = build_repeating_seq(21000);
let contigs: Vec<(&str, &[u8])> = vec![("chr1", &chr1), ("chr17", &chr17)];
let tmp = TempDir::new().unwrap();
let bin_path = tmp.path().join("test.bin");
let idx_path = tmp.path().join("test.bin.idx");
write_genome_binary(&contigs, "test", &bin_path, &idx_path).unwrap();
let reader = FastaReader::open(&bin_path).unwrap();
(tmp, reader)
}
fn build_repeating_seq(len: usize) -> Vec<u8> {
let pattern = b"ACGT";
(0..len).map(|i| pattern[i % 4]).collect()
}
#[test]
fn vcf_substitution_plus() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let hgvs = "NM_TEST_PLUS.1:c.1C>T";
let err = resolve_hgvs_c(hgvs, &store, &fasta).expect_err("expected mismatch");
let VarEffectError::HgvsRefMismatch {
hgvs: err_hgvs,
chrom,
pos,
expected,
got,
} = &err
else {
panic!("expected HgvsRefMismatch, got: {err:?}");
};
assert_eq!(err_hgvs, hgvs);
assert_eq!(chrom, "chr1");
assert_eq!(*pos, 1500);
assert_eq!(expected, "A");
assert_eq!(got, "C");
assert_eq!(
format!("{err}"),
"ref allele mismatch at position 1500 for 'NM_TEST_PLUS.1:c.1C>T': \
genome has 'A', HGVS states 'C'",
);
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.1A>T", &store, &fasta).unwrap();
assert_eq!(result.chrom, "chr1");
assert_eq!(result.pos, 1500);
assert_eq!(result.ref_allele, vec![b'A']);
assert_eq!(result.alt_allele, vec![b'T']);
}
#[test]
fn hgvs_ref_mismatch_minus_strand_projects_to_plus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let hgvs = "NM_TEST_MINUS.1:c.1C>G";
let err = resolve_hgvs_c(hgvs, &store, &fasta).expect_err("expected mismatch");
let VarEffectError::HgvsRefMismatch {
hgvs: err_hgvs,
chrom,
pos,
expected,
got,
} = &err
else {
panic!("expected HgvsRefMismatch, got: {err:?}");
};
assert_eq!(err_hgvs, hgvs);
assert_eq!(chrom, "chr17");
assert_eq!(*pos, 19499);
assert_eq!(expected, "T"); assert_eq!(got, "G"); }
#[test]
fn vcf_substitution_minus() {
let tx = minus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_MINUS.1:c.1A>G", &store, &fasta).unwrap();
assert_eq!(result.chrom, "chr17");
assert_eq!(result.pos, 19499);
assert_eq!(result.ref_allele, vec![b'T']); assert_eq!(result.alt_allele, vec![b'C']); }
#[test]
fn vcf_deletion_anchor_base() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.2del", &store, &fasta).unwrap();
assert_eq!(result.pos, 1500); assert_eq!(result.ref_allele, vec![b'A', b'C']); assert_eq!(result.alt_allele, vec![b'A']); }
#[test]
fn vcf_range_deletion() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.2_4del", &store, &fasta).unwrap();
assert_eq!(result.pos, 1500);
assert_eq!(result.ref_allele, vec![b'A', b'C', b'G', b'T']);
assert_eq!(result.alt_allele, vec![b'A']);
}
#[test]
fn vcf_insertion_anchor_base() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.1_2insG", &store, &fasta).unwrap();
assert_eq!(result.pos, 1500);
assert_eq!(result.ref_allele, vec![b'A']);
assert_eq!(result.alt_allele, vec![b'A', b'G']);
}
#[test]
fn vcf_duplication() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.1dup", &store, &fasta).unwrap();
assert_eq!(result.pos, 1500);
assert_eq!(result.ref_allele, vec![b'A']);
assert_eq!(result.alt_allele, vec![b'A', b'A']);
}
#[test]
fn vcf_range_duplication() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.1_3dup", &store, &fasta).unwrap();
assert_eq!(result.pos, 1502);
assert_eq!(result.ref_allele, vec![b'G']);
assert_eq!(result.alt_allele, vec![b'G', b'A', b'C', b'G']);
}
#[test]
fn vcf_delins() {
let tx = plus_strand_coding();
let store = single_tx_store(tx);
let (_tmp, fasta) = build_test_fasta();
let result = resolve_hgvs_c("NM_TEST_PLUS.1:c.2delinsTT", &store, &fasta).unwrap();
assert_eq!(result.pos, 1501);
assert_eq!(result.ref_allele, vec![b'C']);
assert_eq!(result.alt_allele, vec![b'T', b'T']);
}
#[test]
fn lookup_versioned() {
let store = single_tx_store(plus_strand_coding());
let (tx, _) = lookup_transcript("NM_TEST_PLUS.1", &store).unwrap();
assert_eq!(tx.accession, "NM_TEST_PLUS.1");
}
#[test]
fn lookup_not_found() {
let store = single_tx_store(plus_strand_coding());
assert!(matches!(
lookup_transcript("NM_NONEXISTENT.1", &store),
Err(VarEffectError::TranscriptNotFound { .. })
));
}
#[test]
fn lookup_unversioned() {
let store = single_tx_store(plus_strand_coding());
let (tx, _) = lookup_transcript("NM_TEST_PLUS", &store).unwrap();
assert_eq!(tx.accession, "NM_TEST_PLUS.1");
}
#[test]
fn lookup_versioned_but_missing_falls_back_to_available() {
let store = single_tx_store(plus_strand_coding());
let (tx, _) = lookup_transcript("NM_TEST_PLUS.2", &store).unwrap();
assert_eq!(tx.accession, "NM_TEST_PLUS.1");
}
#[test]
fn lookup_versioned_missing_picks_highest_available() {
let mut v1 = plus_strand_coding();
v1.accession = "NM_TEST_PLUS.1".into();
let mut v5 = plus_strand_coding();
v5.accession = "NM_TEST_PLUS.5".into();
let store = TranscriptStore::from_transcripts(vec![v1, v5]);
let (tx, _) = lookup_transcript("NM_TEST_PLUS.99", &store).unwrap();
assert_eq!(tx.accession, "NM_TEST_PLUS.5");
}
#[test]
fn lookup_versioned_missing_unknown_base_errors() {
let store = single_tx_store(plus_strand_coding());
assert!(matches!(
lookup_transcript("NM_DIFFERENT.1", &store),
Err(VarEffectError::TranscriptNotFound { .. })
));
}
#[test]
fn resolve_hgvs_c_tolerates_missing_version() {
let store = single_tx_store(plus_strand_coding());
let (_tmp, fasta) = build_test_fasta();
let drifted = resolve_hgvs_c("NM_TEST_PLUS.9:c.1A>T", &store, &fasta).unwrap();
let exact = resolve_hgvs_c("NM_TEST_PLUS.1:c.1A>T", &store, &fasta).unwrap();
assert_eq!(drifted.chrom, exact.chrom);
assert_eq!(drifted.pos, exact.pos);
assert_eq!(drifted.ref_allele, exact.ref_allele);
assert_eq!(drifted.alt_allele, exact.alt_allele);
}
#[test]
fn resolve_hgvs_c_with_meta_reports_resolved_version() {
let store = single_tx_store(plus_strand_coding());
let (_tmp, fasta) = build_test_fasta();
let exact = resolve_hgvs_c_with_meta("NM_TEST_PLUS.1:c.1A>T", &store, &fasta).unwrap();
assert_eq!(exact.resolved_accession, "NM_TEST_PLUS.1");
let drifted = resolve_hgvs_c_with_meta("NM_TEST_PLUS.9:c.1A>T", &store, &fasta).unwrap();
assert_eq!(drifted.resolved_accession, "NM_TEST_PLUS.1");
assert_eq!(drifted.variant, exact.variant);
}
#[test]
#[ignore]
fn reverse_map_tp53_missense() {
let store = load_store();
let fasta = load_fasta();
let result = resolve_hgvs_c("NM_000546.6:c.742C>T", &store, &fasta).unwrap();
assert_eq!(result.chrom, "chr17");
assert!(!result.ref_allele.is_empty());
assert!(!result.alt_allele.is_empty());
let hits = store.query_overlap(&result.chrom, result.pos, result.pos + 1);
let nm546 = hits
.iter()
.find(|(tx, _)| tx.accession == "NM_000546.6")
.expect("NM_000546.6 should overlap");
let csq = crate::annotate_snv(
&result.chrom,
result.pos,
result.ref_allele[0],
result.alt_allele[0],
nm546.0,
nm546.1,
&fasta,
)
.unwrap();
assert!(
csq.consequences
.iter()
.any(|c| c.as_str() == "missense_variant"),
"expected missense_variant, got {:?}",
csq.consequences,
);
}
#[test]
#[ignore]
fn reverse_map_tp53_splice() {
let store = load_store();
let fasta = load_fasta();
let result = resolve_hgvs_c("NM_000546.6:c.672+1G>A", &store, &fasta).unwrap();
assert_eq!(result.chrom, "chr17");
let hits = store.query_overlap(&result.chrom, result.pos, result.pos + 1);
let nm546 = hits
.iter()
.find(|(tx, _)| tx.accession == "NM_000546.6")
.expect("NM_000546.6 should overlap");
let csq = crate::annotate_snv(
&result.chrom,
result.pos,
result.ref_allele[0],
result.alt_allele[0],
nm546.0,
nm546.1,
&fasta,
)
.unwrap();
assert!(
csq.consequences
.iter()
.any(|c| c.as_str() == "splice_donor_variant"),
"expected splice_donor_variant, got {:?}",
csq.consequences,
);
}
#[cfg(test)]
fn load_store() -> TranscriptStore {
let manifest_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
let workspace_root = manifest_dir
.parent()
.and_then(|p| p.parent())
.expect("workspace root");
let path = workspace_root.join("data/vareffect/transcript_models.bin");
TranscriptStore::load_from_path(&path).unwrap_or_else(|e| {
panic!(
"failed to load transcript store from {}: {}",
path.display(),
e,
)
})
}
#[cfg(test)]
fn load_fasta() -> FastaReader {
let path = std::env::var("FASTA_PATH")
.expect("FASTA_PATH env var must point to a GRCh38 genome binary");
FastaReader::open_with_patch_aliases(
std::path::Path::new(&path),
Some(
std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
.parent()
.and_then(|p| p.parent())
.expect("workspace root")
.join("data/vareffect/patch_chrom_aliases.csv")
.as_ref(),
),
)
.unwrap_or_else(|e| panic!("failed to open FASTA at {path}: {e}"))
}
}