use super::SpdiVariant;
use crate::convert::CoordinateMapper;
use crate::coords::{OneBasedPos, ZeroBasedPos};
use crate::error::FerroError;
use crate::hgvs::edit::{InsertedSequence, NaEdit, RepeatCount, Sequence};
use crate::hgvs::interval::Interval;
use crate::hgvs::location::{CdsPos, GenomePos, RnaPos, TxPos};
use crate::hgvs::parser::accession::parse_accession;
use crate::hgvs::variant::{
Accession, CdsVariant, GenomeVariant, HgvsVariant, LocEdit, MtVariant, RnaVariant, TxVariant,
};
use crate::reference::provider::ReferenceProvider;
use crate::reference::transcript::Transcript;
use crate::sequence::reverse_complement;
/// Upper bound, in bases, on the inserted string produced when a repeat edit
/// is expanded (`unit length x repeat count`). Repeat edits whose expansion
/// would exceed this are rejected with `ConversionError::UnsupportedEditType`.
const MAX_REPEAT_EXPANSION_BASES: usize = 100_000;
/// Reasons an HGVS <-> SPDI conversion in this module can fail.
///
/// Every variant carries free-form text that is rendered by the `Display`
/// implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ConversionError {
/// The HGVS variant kind itself has no SPDI form (for example protein variants).
UnsupportedVariantType {
description: String,
},
/// Data needed for the conversion was not supplied or could not be fetched
/// (e.g. an implicit deleted sequence, or a transcript that failed to load).
MissingReferenceData {
description: String,
},
/// The edit kind (copy number, conversion, uncertain repeats, ...) cannot be
/// written as a single SPDI.
UnsupportedEditType {
description: String,
},
/// A position, edit, or sequence value was unknown or invalid.
InvalidPosition {
description: String,
},
/// The SPDI sequence identifier could not be parsed as an accession.
InvalidAccession {
description: String,
},
/// The conversion is only possible through the provider-backed entry point.
ProviderRequired {
// Coordinate-system letter of the variant; rendered as `<variant_type>.` in the message.
variant_type: String,
// Explanation of why a provider is needed.
reason: String,
},
}
impl std::fmt::Display for ConversionError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ConversionError::UnsupportedVariantType { description } => {
write!(
f,
"unsupported variant type for conversion: {}",
description
)
}
ConversionError::MissingReferenceData { description } => {
write!(f, "missing reference data: {}", description)
}
ConversionError::UnsupportedEditType { description } => {
write!(f, "unsupported edit type for conversion: {}", description)
}
ConversionError::InvalidPosition { description } => {
write!(f, "invalid position: {}", description)
}
ConversionError::InvalidAccession { description } => {
write!(f, "invalid accession: {}", description)
}
ConversionError::ProviderRequired {
variant_type,
reason,
} => {
write!(
f,
"reference provider required to convert {}. variant: {}",
variant_type, reason
)
}
}
}
}
// Empty impl: only the default methods of `std::error::Error` are used.
impl std::error::Error for ConversionError {}
impl From<ConversionError> for FerroError {
fn from(err: ConversionError) -> Self {
FerroError::ConversionError {
msg: err.to_string(),
}
}
}
/// Returns the textual form of `seq`, as produced by its `to_string`.
fn sequence_to_string(seq: &Sequence) -> String {
seq.to_string()
}
/// Returns the inserted bases as text when `seq` is a literal sequence.
///
/// Every other kind of inserted sequence yields `None`.
fn inserted_sequence_to_string(seq: &InsertedSequence) -> Option<String> {
    if let InsertedSequence::Literal(literal) = seq {
        Some(literal.to_string())
    } else {
        None
    }
}
/// Base coordinate of the interval start, or `None` when the start has no
/// inner position.
fn get_start_pos(interval: &Interval<GenomePos>) -> Option<u64> {
    let start = interval.start.inner()?;
    Some(start.base)
}
/// Base coordinate of the interval end, or `None` when the end has no inner
/// position.
fn get_end_pos(interval: &Interval<GenomePos>) -> Option<u64> {
    let end = interval.end.inner()?;
    Some(end.base)
}
/// Converts an HGVS variant to SPDI without consulting any reference data.
///
/// Genomic, mitochondrial, transcript (`n.`) and RNA (`r.`) variants are
/// delegated to their provider-free converters.
///
/// # Errors
///
/// * [`ConversionError::ProviderRequired`] for CDS (`c.`) variants.
/// * [`ConversionError::UnsupportedVariantType`] for protein variants and any
///   other variant kind.
/// * Whatever the delegated converter reports.
pub fn hgvs_to_spdi_simple(variant: &HgvsVariant) -> Result<SpdiVariant, ConversionError> {
    // Convertible kinds return straight away; the rest fall through to a failure.
    let failure = match variant {
        HgvsVariant::Genome(g) => return genome_to_spdi_simple(g),
        HgvsVariant::Mt(m) => return mt_to_spdi_simple(m),
        HgvsVariant::Tx(n) => return tx_to_spdi_simple(n),
        HgvsVariant::Rna(r) => return rna_to_spdi_simple(r),
        HgvsVariant::Cds(_) => ConversionError::ProviderRequired {
            variant_type: "c".to_string(),
            reason:
                "CDS positions need transcript metadata (CDS start) to resolve to a transcript \
                 position; call hgvs_to_spdi with a ReferenceProvider"
                    .to_string(),
        },
        HgvsVariant::Protein(_) => ConversionError::UnsupportedVariantType {
            description: "protein variants cannot be represented in SPDI; SPDI describes \
                          nucleotide variants on a sequence accession"
                .to_string(),
        },
        _ => ConversionError::UnsupportedVariantType {
            description: format!(
                "variant type {} cannot be converted to SPDI",
                variant.variant_type()
            ),
        },
    };
    Err(failure)
}
/// Converts an HGVS variant to SPDI, using `provider` for reference data.
///
/// Genomic, mitochondrial, transcript (`n.`), RNA (`r.`) and CDS (`c.`)
/// variants are delegated to their provider-backed converters.
///
/// # Errors
///
/// [`ConversionError::UnsupportedVariantType`] for protein variants and any
/// other variant kind; otherwise whatever the delegated converter reports.
pub fn hgvs_to_spdi<P: ReferenceProvider>(
    variant: &HgvsVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    // Convertible kinds return straight away; the rest fall through to a failure.
    let failure = match variant {
        HgvsVariant::Genome(g) => return genome_to_spdi_with_provider(g, provider),
        HgvsVariant::Mt(m) => return mt_to_spdi_with_provider(m, provider),
        HgvsVariant::Tx(n) => return tx_to_spdi_with_provider(n, provider),
        HgvsVariant::Rna(r) => return rna_to_spdi_with_provider(r, provider),
        HgvsVariant::Cds(c) => return cds_to_spdi_with_provider(c, provider),
        HgvsVariant::Protein(_) => ConversionError::UnsupportedVariantType {
            description: "protein variants cannot be represented in SPDI; SPDI describes \
                          nucleotide variants on a sequence accession"
                .to_string(),
        },
        _ => ConversionError::UnsupportedVariantType {
            description: format!(
                "variant type {} cannot be converted to SPDI",
                variant.variant_type()
            ),
        },
    };
    Err(failure)
}
/// Provider-free conversion of a genomic (`g.`) variant.
///
/// Fails when the edit or the start position is unknown; a missing end
/// position defaults to the start.
fn genome_to_spdi_simple(variant: &GenomeVariant) -> Result<SpdiVariant, ConversionError> {
    let loc_edit = &variant.loc_edit;
    let edit = unwrap_edit(&loc_edit.edit)?;
    let Some(first) = get_start_pos(&loc_edit.location) else {
        return Err(ConversionError::InvalidPosition {
            description: "cannot convert variant with unknown start position".to_string(),
        });
    };
    let last = get_end_pos(&loc_edit.location).unwrap_or(first);
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        None::<&dyn ReferenceProvider>,
    )
}
/// Provider-free conversion of a mitochondrial (`m.`) variant.
///
/// Fails when the edit or the start position is unknown; a missing end
/// position defaults to the start.
fn mt_to_spdi_simple(variant: &MtVariant) -> Result<SpdiVariant, ConversionError> {
    let loc_edit = &variant.loc_edit;
    let edit = unwrap_edit(&loc_edit.edit)?;
    let Some(first) = get_start_pos(&loc_edit.location) else {
        return Err(ConversionError::InvalidPosition {
            description: "cannot convert variant with unknown start position".to_string(),
        });
    };
    let last = get_end_pos(&loc_edit.location).unwrap_or(first);
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        None::<&dyn ReferenceProvider>,
    )
}
/// Provider-free conversion of a non-coding transcript (`n.`) variant.
///
/// Both endpoints must be plain positive transcript positions; anything else
/// is rejected by the simple-path position helpers.
fn tx_to_spdi_simple(variant: &TxVariant) -> Result<SpdiVariant, ConversionError> {
    let edit = unwrap_edit(&variant.loc_edit.edit)?;
    let location = &variant.loc_edit.location;
    let first = tx_pos_for_simple_path(location, "n")?;
    let last = tx_end_for_simple_path(location, first, "n")?;
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        None::<&dyn ReferenceProvider>,
    )
}
/// Provider-free conversion of an RNA (`r.`) variant.
///
/// Both endpoints must be plain positive positions. Sequences are emitted in
/// the RNA alphabet mode (upper-cased, `U` rewritten as `T`).
fn rna_to_spdi_simple(variant: &RnaVariant) -> Result<SpdiVariant, ConversionError> {
    let edit = unwrap_edit(&variant.loc_edit.edit)?;
    let location = &variant.loc_edit.location;
    let first = rna_pos_for_simple_path(location, "r")?;
    let last = rna_end_for_simple_path(location, first, "r")?;
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Rna,
        None::<&dyn ReferenceProvider>,
    )
}
/// Provider-backed conversion of a genomic (`g.`) variant.
///
/// Same as the provider-free path, except that `provider` is handed on so
/// implicit sequences can be fetched from the reference.
fn genome_to_spdi_with_provider<P: ReferenceProvider + ?Sized>(
    variant: &GenomeVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    let loc_edit = &variant.loc_edit;
    let edit = unwrap_edit(&loc_edit.edit)?;
    let Some(first) = get_start_pos(&loc_edit.location) else {
        return Err(ConversionError::InvalidPosition {
            description: "cannot convert variant with unknown start position".to_string(),
        });
    };
    let last = get_end_pos(&loc_edit.location).unwrap_or(first);
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        Some(provider),
    )
}
/// Provider-backed conversion of a mitochondrial (`m.`) variant.
///
/// Same as the provider-free path, except that `provider` is handed on so
/// implicit sequences can be fetched from the reference.
fn mt_to_spdi_with_provider<P: ReferenceProvider + ?Sized>(
    variant: &MtVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    let loc_edit = &variant.loc_edit;
    let edit = unwrap_edit(&loc_edit.edit)?;
    let Some(first) = get_start_pos(&loc_edit.location) else {
        return Err(ConversionError::InvalidPosition {
            description: "cannot convert variant with unknown start position".to_string(),
        });
    };
    let last = get_end_pos(&loc_edit.location).unwrap_or(first);
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        Some(provider),
    )
}
/// Provider-backed conversion of a CDS (`c.`) variant.
///
/// The CDS endpoints are first mapped to transcript positions through
/// `resolve_cds_to_tx`; a missing end defaults to the start.
fn cds_to_spdi_with_provider<P: ReferenceProvider>(
    variant: &CdsVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    let edit = unwrap_edit(&variant.loc_edit.edit)?;
    let location = &variant.loc_edit.location;
    let Some(first_cds) = location.start.inner() else {
        return Err(ConversionError::InvalidPosition {
            description: "cannot convert c. variant with unknown start position".to_string(),
        });
    };
    let last_cds = location.end.inner().copied().unwrap_or(*first_cds);
    let (tx_first, tx_last) =
        resolve_cds_to_tx(&variant.accession, first_cds, &last_cds, provider)?;
    emit_spdi_for_edit(
        variant.accession.to_string(),
        tx_first,
        tx_last,
        edit,
        AlphabetMode::Dna,
        Some(provider),
    )
}
/// Provider-backed conversion of a non-coding transcript (`n.`) variant.
///
/// Plain positive positions take the simple path. Intronic, downstream or
/// non-positive endpoints are routed through `resolve_tx_to_provider_tx`.
fn tx_to_spdi_with_provider<P: ReferenceProvider>(
    variant: &TxVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    let edit = unwrap_edit(&variant.loc_edit.edit)?;
    let location = &variant.loc_edit.location;
    let (first, last) = if !tx_needs_provider(location) {
        let first = tx_pos_for_simple_path(location, "n")?;
        (first, tx_end_for_simple_path(location, first, "n")?)
    } else {
        let Some(start) = location.start.inner() else {
            return Err(ConversionError::InvalidPosition {
                description: "cannot convert n. variant with unknown start position".to_string(),
            });
        };
        let end = location.end.inner().copied().unwrap_or(*start);
        resolve_tx_to_provider_tx(&variant.accession, start, &end, provider)?
    };
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Dna,
        Some(provider),
    )
}
/// Provider-backed conversion of an RNA (`r.`) variant.
///
/// Plain positive positions take the simple path. Intronic, 3' UTR or
/// non-positive endpoints are routed through `resolve_rna_to_provider_tx`.
/// Sequences are emitted in the RNA alphabet mode.
fn rna_to_spdi_with_provider<P: ReferenceProvider>(
    variant: &RnaVariant,
    provider: &P,
) -> Result<SpdiVariant, ConversionError> {
    let edit = unwrap_edit(&variant.loc_edit.edit)?;
    let location = &variant.loc_edit.location;
    let (first, last) = if !rna_needs_provider(location) {
        let first = rna_pos_for_simple_path(location, "r")?;
        (first, rna_end_for_simple_path(location, first, "r")?)
    } else {
        let Some(start) = location.start.inner() else {
            return Err(ConversionError::InvalidPosition {
                description: "cannot convert r. variant with unknown start position".to_string(),
            });
        };
        let end = location.end.inner().copied().unwrap_or(*start);
        resolve_rna_to_provider_tx(&variant.accession, start, &end, provider)?
    };
    emit_spdi_for_edit(
        variant.accession.to_string(),
        first,
        last,
        edit,
        AlphabetMode::Rna,
        Some(provider),
    )
}
/// Selects how sequence text is normalised (by `apply_alphabet`) before it is
/// placed into an SPDI.
#[derive(Debug, Clone, Copy)]
enum AlphabetMode {
/// ASCII upper-case the bases; no other change.
Dna,
/// ASCII upper-case the bases and rewrite `U` as `T`.
Rna,
}
/// Borrows the edit inside an uncertainty wrapper.
///
/// Returns [`ConversionError::InvalidPosition`] when the wrapper holds no
/// inner edit.
fn unwrap_edit<E>(edit: &crate::hgvs::uncertainty::Mu<E>) -> Result<&E, ConversionError> {
    match edit.inner() {
        Some(known) => Ok(known),
        None => Err(ConversionError::InvalidPosition {
            description: "cannot convert variant with unknown edit".to_string(),
        }),
    }
}
/// Start of a transcript interval as a plain 1-based position.
///
/// `coord` is the coordinate-system letter used in error messages. Fails when
/// the start is unknown or is not a simple positive position.
fn tx_pos_for_simple_path(interval: &Interval<TxPos>, coord: &str) -> Result<u64, ConversionError> {
    match interval.start.inner() {
        Some(start) => require_simple_tx_pos(start, coord),
        None => Err(ConversionError::InvalidPosition {
            description: format!(
                "cannot convert {}. variant with unknown start position",
                coord
            ),
        }),
    }
}
/// End of a transcript interval as a plain 1-based position, or `fallback`
/// when the interval has no known end.
fn tx_end_for_simple_path(
    interval: &Interval<TxPos>,
    fallback: u64,
    coord: &str,
) -> Result<u64, ConversionError> {
    interval
        .end
        .inner()
        .map_or(Ok(fallback), |end| require_simple_tx_pos(end, coord))
}
/// Accepts only transcript positions usable without reference data.
///
/// Intronic, downstream and non-positive positions (checked in that order)
/// are rejected with [`ConversionError::MissingReferenceData`]; otherwise the
/// base is returned as `u64`.
fn require_simple_tx_pos(pos: &TxPos, coord: &str) -> Result<u64, ConversionError> {
    let problem = if pos.is_intronic() {
        Some(format!(
            "intronic {}. position requires reference provider with exon data",
            coord
        ))
    } else if pos.is_downstream() {
        Some(format!(
            "downstream {}. position (*N) requires reference provider with transcript length",
            coord
        ))
    } else if pos.base < 1 {
        Some(format!(
            "non-positive {}. position {} requires reference provider with transcript length",
            coord, pos.base
        ))
    } else {
        None
    };
    match problem {
        Some(description) => Err(ConversionError::MissingReferenceData { description }),
        None => Ok(pos.base as u64),
    }
}
/// Start of an RNA interval as a plain 1-based position.
///
/// `coord` is the coordinate-system letter used in error messages. Fails when
/// the start is unknown or is not a simple positive position.
fn rna_pos_for_simple_path(
    interval: &Interval<RnaPos>,
    coord: &str,
) -> Result<u64, ConversionError> {
    match interval.start.inner() {
        Some(start) => require_simple_rna_pos(start, coord),
        None => Err(ConversionError::InvalidPosition {
            description: format!(
                "cannot convert {}. variant with unknown start position",
                coord
            ),
        }),
    }
}
/// End of an RNA interval as a plain 1-based position, or `fallback` when the
/// interval has no known end.
fn rna_end_for_simple_path(
    interval: &Interval<RnaPos>,
    fallback: u64,
    coord: &str,
) -> Result<u64, ConversionError> {
    interval
        .end
        .inner()
        .map_or(Ok(fallback), |end| require_simple_rna_pos(end, coord))
}
/// Accepts only RNA positions usable without reference data.
///
/// Intronic, 3' UTR and non-positive positions (checked in that order) are
/// rejected with [`ConversionError::MissingReferenceData`]; otherwise the
/// base is returned as `u64`.
fn require_simple_rna_pos(pos: &RnaPos, coord: &str) -> Result<u64, ConversionError> {
    let problem = if pos.is_intronic() {
        Some(format!(
            "intronic {}. position requires reference provider with exon data",
            coord
        ))
    } else if pos.utr3 {
        Some(format!(
            "3' UTR {}. position (*N) requires reference provider with transcript length",
            coord
        ))
    } else if pos.base < 1 {
        Some(format!(
            "non-positive {}. position {} requires reference provider with transcript length",
            coord, pos.base
        ))
    } else {
        None
    };
    match problem {
        Some(description) => Err(ConversionError::MissingReferenceData { description }),
        None => Ok(pos.base as u64),
    }
}
/// True when either known endpoint of the interval is intronic, downstream,
/// or has a non-positive base. Unknown endpoints are ignored.
fn tx_needs_provider(interval: &Interval<TxPos>) -> bool {
    [interval.start.inner(), interval.end.inner()]
        .into_iter()
        .flatten()
        .any(|p| p.is_intronic() || p.is_downstream() || p.base < 1)
}
/// True when either known endpoint of the interval is intronic, in the 3' UTR,
/// or has a non-positive base. Unknown endpoints are ignored.
fn rna_needs_provider(interval: &Interval<RnaPos>) -> bool {
    [interval.start.inner(), interval.end.inner()]
        .into_iter()
        .flatten()
        .any(|p| p.is_intronic() || p.utr3 || p.base < 1)
}
/// Maps a pair of CDS positions to positive 1-based transcript positions.
///
/// The transcript named by `accession` is loaded from `provider` and both
/// endpoints are projected with a `CoordinateMapper`.
///
/// # Errors
///
/// * [`ConversionError::MissingReferenceData`] when either endpoint is
///   intronic, the transcript cannot be loaded, or a projection fails.
/// * [`ConversionError::InvalidPosition`] when a projected base is below 1.
fn resolve_cds_to_tx<P: ReferenceProvider>(
    accession: &Accession,
    start: &CdsPos,
    end: &CdsPos,
    provider: &P,
) -> Result<(u64, u64), ConversionError> {
    let any_intronic = start.is_intronic() || end.is_intronic();
    if any_intronic {
        return Err(ConversionError::MissingReferenceData {
            description: "intronic c. positions cannot be expressed in SPDI without genomic \
                          projection; SPDI is positional and has no offset notation"
                .to_string(),
        });
    }

    let tx_id = accession.transcript_accession();
    let transcript = match provider.get_transcript(&tx_id) {
        Ok(loaded) => loaded,
        Err(e) => {
            return Err(ConversionError::MissingReferenceData {
                description: format!("could not load transcript {}: {}", tx_id, e),
            });
        }
    };

    // Project both endpoints before validating either, so projection errors
    // take precedence over non-positive-base errors.
    let mapper = CoordinateMapper::new(&transcript);
    let tx_first = match mapper.cds_to_tx(start) {
        Ok(mapped) => mapped,
        Err(e) => {
            return Err(ConversionError::MissingReferenceData {
                description: format!("could not resolve {} to transcript position: {}", start, e),
            });
        }
    };
    let tx_last = match mapper.cds_to_tx(end) {
        Ok(mapped) => mapped,
        Err(e) => {
            return Err(ConversionError::MissingReferenceData {
                description: format!("could not resolve {} to transcript position: {}", end, e),
            });
        }
    };

    let first = ensure_positive_tx(tx_first.base, "c", start)?;
    let last = ensure_positive_tx(tx_last.base, "c", end)?;
    Ok((first, last))
}
/// Resolves a pair of `n.` positions against the transcript loaded from
/// `provider`.
///
/// Fails with [`ConversionError::MissingReferenceData`] when the transcript
/// cannot be loaded, or with whatever `resolve_tx_pos` reports for either
/// endpoint (start first).
fn resolve_tx_to_provider_tx<P: ReferenceProvider>(
    accession: &Accession,
    start: &TxPos,
    end: &TxPos,
    provider: &P,
) -> Result<(u64, u64), ConversionError> {
    let tx_id = accession.transcript_accession();
    let transcript = match provider.get_transcript(&tx_id) {
        Ok(loaded) => loaded,
        Err(e) => {
            return Err(ConversionError::MissingReferenceData {
                description: format!("could not load transcript {}: {}", tx_id, e),
            });
        }
    };
    let first = resolve_tx_pos(start, &transcript)?;
    let last = resolve_tx_pos(end, &transcript)?;
    Ok((first, last))
}
/// Resolves a pair of `r.` positions against the transcript loaded from
/// `provider`.
///
/// Fails with [`ConversionError::MissingReferenceData`] when the transcript
/// cannot be loaded, or with whatever `resolve_rna_pos` reports for either
/// endpoint (start first).
fn resolve_rna_to_provider_tx<P: ReferenceProvider>(
    accession: &Accession,
    start: &RnaPos,
    end: &RnaPos,
    provider: &P,
) -> Result<(u64, u64), ConversionError> {
    let tx_id = accession.transcript_accession();
    let transcript = match provider.get_transcript(&tx_id) {
        Ok(loaded) => loaded,
        Err(e) => {
            return Err(ConversionError::MissingReferenceData {
                description: format!("could not load transcript {}: {}", tx_id, e),
            });
        }
    };
    let first = resolve_rna_pos(start, &transcript)?;
    let last = resolve_rna_pos(end, &transcript)?;
    Ok((first, last))
}
/// Resolves a single `n.` position to a positive 1-based transcript position.
///
/// The transcript argument is currently unused. Intronic positions yield
/// [`ConversionError::MissingReferenceData`]; downstream and non-positive
/// positions yield [`ConversionError::InvalidPosition`].
fn resolve_tx_pos(pos: &TxPos, _transcript: &Transcript) -> Result<u64, ConversionError> {
    if pos.is_intronic() {
        Err(ConversionError::MissingReferenceData {
            description: format!(
                "intronic n.{} cannot be expressed in SPDI without genomic projection; \
                 SPDI is positional and has no offset notation",
                pos
            ),
        })
    } else if pos.is_downstream() {
        Err(ConversionError::InvalidPosition {
            description: format!(
                "downstream n.{} cannot be expressed in SPDI on the transcript accession \
                 without genomic projection",
                pos
            ),
        })
    } else {
        ensure_positive_tx(pos.base, "n", pos)
    }
}
/// Resolves a single `r.` position to a positive 1-based transcript position.
///
/// * Intronic positions yield [`ConversionError::MissingReferenceData`].
/// * 3' UTR positions (`*N`, `N >= 1`) resolve to the transcript's sequence
///   length plus `N` (saturating).
/// * Everything else must have a base of at least 1.
fn resolve_rna_pos(pos: &RnaPos, transcript: &Transcript) -> Result<u64, ConversionError> {
    if pos.is_intronic() {
        return Err(ConversionError::MissingReferenceData {
            description: format!(
                "intronic r.{} cannot be expressed in SPDI without genomic projection; \
                 SPDI is positional and has no offset notation",
                pos
            ),
        });
    }
    let tx_len = transcript.sequence_length();
    if !pos.utr3 {
        return ensure_positive_tx(pos.base, "r", pos);
    }
    if pos.base >= 1 {
        Ok(tx_len.saturating_add(pos.base as u64))
    } else {
        Err(ConversionError::InvalidPosition {
            description: format!("3' UTR position *{} must be >= 1", pos.base),
        })
    }
}
/// Converts a signed transcript base to `u64`, requiring it to be at least 1.
///
/// `coord` and `pos` appear only in the error text of the
/// [`ConversionError::InvalidPosition`] returned for bases below 1.
fn ensure_positive_tx<P: std::fmt::Display>(
    base: i64,
    coord: &str,
    pos: P,
) -> Result<u64, ConversionError> {
    if base >= 1 {
        Ok(base as u64)
    } else {
        Err(ConversionError::InvalidPosition {
            description: format!(
                "transcript position from {}. coordinate {} resolves to a non-positive base ({})",
                coord, pos, base
            ),
        })
    }
}
/// Fetches the reference bases for a 1-based inclusive span, or fails with
/// `MissingReferenceData` (text supplied lazily by `missing`) when no
/// provider is available.
fn reference_bases_or_missing<P>(
    provider: Option<&P>,
    accession: &str,
    start_one_based: u64,
    end_one_based: u64,
    missing: impl FnOnce() -> String,
) -> Result<String, ConversionError>
where
    P: ReferenceProvider + ?Sized,
{
    match provider {
        Some(p) => fetch_reference_bases(p, accession, start_one_based, end_one_based),
        None => Err(ConversionError::MissingReferenceData {
            description: missing(),
        }),
    }
}

/// Builds the SPDI for one nucleotide edit on `sequence`.
///
/// # Parameters
///
/// * `sequence` - accession text placed in the SPDI.
/// * `start_one_based`, `end_one_based` - 1-based inclusive span of the edit.
/// * `edit` - the HGVS edit to translate.
/// * `alphabet` - normalisation applied to every emitted sequence string.
/// * `provider` - optional reference source, used when the edit does not
///   spell out the affected bases itself.
///
/// # Position convention
///
/// The SPDI position is the zero-based form of `start_one_based`, except for
/// duplications, which are emitted as an insertion positioned at the
/// zero-based form of `end_one_based`.
///
/// # Errors
///
/// * [`ConversionError::InvalidPosition`] when the start (or, for
///   duplications, the end) is 0, or when a repeat span does not match its
///   unit.
/// * [`ConversionError::MissingReferenceData`] when bases are implicit and
///   cannot be obtained, or an inserted sequence is not a literal.
/// * [`ConversionError::UnsupportedEditType`] for edits SPDI cannot express
///   (copy number, conversion, genotype/uncertain repeats, oversized repeat
///   expansions, and any other edit kind).
fn emit_spdi_for_edit<P>(
    sequence: String,
    start_one_based: u64,
    end_one_based: u64,
    edit: &NaEdit,
    alphabet: AlphabetMode,
    provider: Option<&P>,
) -> Result<SpdiVariant, ConversionError>
where
    P: ReferenceProvider + ?Sized,
{
    let hgvs_pos_ob =
        OneBasedPos::try_new(start_one_based).ok_or_else(|| ConversionError::InvalidPosition {
            description: "position 0 is not valid in HGVS".to_string(),
        })?;
    let spdi_pos_zb: ZeroBasedPos = hgvs_pos_ob.to_zero_based();
    let spdi_pos = spdi_pos_zb.value();

    match edit {
        NaEdit::Substitution {
            reference,
            alternative,
        } => Ok(SpdiVariant::new(
            sequence,
            spdi_pos,
            apply_alphabet(&reference.to_string(), alphabet),
            apply_alphabet(&alternative.to_string(), alphabet),
        )),
        NaEdit::Insertion { sequence: inserted } => {
            let ins_str = inserted_sequence_to_string(inserted).ok_or_else(|| {
                ConversionError::MissingReferenceData {
                    description: "insertion sequence is not a literal sequence".to_string(),
                }
            })?;
            Ok(SpdiVariant::new(
                sequence,
                spdi_pos,
                "",
                apply_alphabet(&ins_str, alphabet),
            ))
        }
        NaEdit::Duplication {
            sequence: dup_seq, ..
        } => {
            let dup_str = match dup_seq {
                Some(seq) => sequence_to_string(seq),
                None => reference_bases_or_missing(
                    provider,
                    &sequence,
                    start_one_based,
                    end_one_based,
                    || {
                        "duplication sequence not provided; reference data needed to determine duplicated bases"
                            .to_string()
                    },
                )?,
            };
            // Validate the end the same way as the start: a zero end is
            // reported as an error rather than passed to the unchecked
            // constructor.
            let end_pos_ob = OneBasedPos::try_new(end_one_based).ok_or_else(|| {
                ConversionError::InvalidPosition {
                    description: "duplication end position 0 is not valid in HGVS".to_string(),
                }
            })?;
            let spdi_end_zb = end_pos_ob.to_zero_based();
            Ok(SpdiVariant::new(
                sequence,
                spdi_end_zb.value(),
                "",
                apply_alphabet(&dup_str, alphabet),
            ))
        }
        NaEdit::Deletion {
            sequence: del_seq, ..
        } => {
            let del_str = match del_seq {
                Some(seq) => sequence_to_string(seq),
                None => reference_bases_or_missing(
                    provider,
                    &sequence,
                    start_one_based,
                    end_one_based,
                    || {
                        "deleted sequence not provided; reference data needed to determine deleted bases"
                            .to_string()
                    },
                )?,
            };
            Ok(SpdiVariant::new(
                sequence,
                spdi_pos,
                apply_alphabet(&del_str, alphabet),
                "",
            ))
        }
        NaEdit::Delins {
            sequence: ins_seq,
            deleted,
            deleted_length: _,
        } => {
            // The inserted sequence is checked before the deleted one so a
            // non-literal insertion is reported first.
            let ins_str = inserted_sequence_to_string(ins_seq).ok_or_else(|| {
                ConversionError::MissingReferenceData {
                    description: "delins inserted sequence is not a literal sequence".to_string(),
                }
            })?;
            let del_str = match deleted {
                Some(seq) => sequence_to_string(seq),
                None => reference_bases_or_missing(
                    provider,
                    &sequence,
                    start_one_based,
                    end_one_based,
                    || {
                        let del_len = end_one_based
                            .saturating_sub(start_one_based)
                            .saturating_add(1) as usize;
                        format!(
                            "Cannot convert delins to SPDI: deleted sequence of length {} is unknown (no reference data)",
                            del_len
                        )
                    },
                )?,
            };
            Ok(SpdiVariant::new(
                sequence,
                spdi_pos,
                apply_alphabet(&del_str, alphabet),
                apply_alphabet(&ins_str, alphabet),
            ))
        }
        NaEdit::Identity {
            sequence: id_seq, ..
        } => {
            // Without an explicit sequence both SPDI strings are left empty.
            let ref_base = id_seq
                .as_ref()
                .map(|s| apply_alphabet(&sequence_to_string(s), alphabet))
                .unwrap_or_default();
            Ok(SpdiVariant::new(
                sequence,
                spdi_pos,
                ref_base.clone(),
                ref_base,
            ))
        }
        NaEdit::Inversion {
            sequence: inv_seq, ..
        } => {
            let del_raw = match inv_seq {
                Some(seq) => sequence_to_string(seq),
                None => reference_bases_or_missing(
                    provider,
                    &sequence,
                    start_one_based,
                    end_one_based,
                    || {
                        "inversion sequence not provided; reference data needed to determine inverted bases"
                            .to_string()
                    },
                )?,
            };
            // Expressed as a delins: the span is replaced by its reverse complement.
            let del_str = apply_alphabet(&del_raw, alphabet);
            let ins_str = reverse_complement(&del_str);
            Ok(SpdiVariant::new(sequence, spdi_pos, del_str, ins_str))
        }
        NaEdit::Repeat {
            sequence: unit_seq,
            count,
            additional_counts,
            trailing,
        } => {
            if !additional_counts.is_empty() {
                return Err(ConversionError::UnsupportedEditType {
                    description:
                        "genotype-style repeat (multiple counts) cannot be expressed as a single SPDI; emit each allele separately"
                            .to_string(),
                });
            }
            if trailing.is_some() {
                return Err(ConversionError::UnsupportedEditType {
                    description: "repeat with trailing sequence cannot be represented in SPDI"
                        .to_string(),
                });
            }
            let unit = unit_seq
                .as_ref()
                .ok_or_else(|| ConversionError::MissingReferenceData {
                    description:
                        "repeat unit sequence not provided; cannot expand into SPDI delins"
                            .to_string(),
                })?;
            let n_post = match count {
                RepeatCount::Exact(n) => *n as usize,
                _ => {
                    return Err(ConversionError::UnsupportedEditType {
                        description:
                            "uncertain or range repeat counts cannot be represented in SPDI"
                                .to_string(),
                    });
                }
            };
            let unit_str = apply_alphabet(&sequence_to_string(unit), alphabet);
            // Size checks come before the reference fetch so oversized
            // expansions are rejected without touching the provider.
            let expansion_bases = unit_str.len().checked_mul(n_post).ok_or_else(|| {
                ConversionError::UnsupportedEditType {
                    description: format!(
                        "repeat expansion {} x {} overflows usize",
                        unit_str.len(),
                        n_post
                    ),
                }
            })?;
            if expansion_bases > MAX_REPEAT_EXPANSION_BASES {
                return Err(ConversionError::UnsupportedEditType {
                    description: format!(
                        "repeat expansion {} bases exceeds SPDI ins-string cap of {} bases",
                        expansion_bases, MAX_REPEAT_EXPANSION_BASES
                    ),
                });
            }
            let del_raw = reference_bases_or_missing(
                provider,
                &sequence,
                start_one_based,
                end_one_based,
                || {
                    "repeat reference span not provided; reference data needed to determine pre-expansion bases"
                        .to_string()
                },
            )?;
            let del_str = apply_alphabet(&del_raw, alphabet);
            if unit_str.is_empty() || !del_str.len().is_multiple_of(unit_str.len()) {
                return Err(ConversionError::InvalidPosition {
                    description: format!(
                        "repeat span {}:{}-{} length {} is not a multiple of unit length {}",
                        sequence,
                        start_one_based,
                        end_one_based,
                        del_str.len(),
                        unit_str.len()
                    ),
                });
            }
            // The reference span must be an exact tandem run of the unit.
            let pre_count = del_str.len() / unit_str.len();
            if del_str != unit_str.repeat(pre_count) {
                return Err(ConversionError::InvalidPosition {
                    description: format!(
                        "repeat span {}:{}-{} does not match repeat unit {}",
                        sequence, start_one_based, end_one_based, unit_str
                    ),
                });
            }
            let ins_str = unit_str.repeat(n_post);
            Ok(SpdiVariant::new(sequence, spdi_pos, del_str, ins_str))
        }
        NaEdit::CopyNumber { .. } => Err(ConversionError::UnsupportedEditType {
            description: "copy number variants cannot be represented in SPDI format".to_string(),
        }),
        NaEdit::Conversion { .. } => Err(ConversionError::UnsupportedEditType {
            description: "conversion variants cannot be represented in SPDI format".to_string(),
        }),
        _ => Err(ConversionError::UnsupportedEditType {
            description: format!("unsupported edit type: {:?}", edit),
        }),
    }
}
/// Fetches the reference bases covering a 1-based inclusive interval.
///
/// The provider is queried with the zero-based half-open equivalent
/// `[start - 1, end)`. `get_genomic_sequence` is tried first and
/// `get_sequence` is the fallback.
///
/// # Errors
///
/// * [`ConversionError::InvalidPosition`] when `start < 1` or `end < start`.
/// * [`ConversionError::MissingReferenceData`] when both lookups fail or the
///   returned string does not have exactly `end - start + 1` bytes.
fn fetch_reference_bases<P>(
    provider: &P,
    accession: &str,
    start_one_based: u64,
    end_one_based: u64,
) -> Result<String, ConversionError>
where
    P: ReferenceProvider + ?Sized,
{
    let interval_ok = start_one_based >= 1 && end_one_based >= start_one_based;
    if !interval_ok {
        return Err(ConversionError::InvalidPosition {
            description: format!(
                "invalid 1-based interval [{}, {}] for reference fetch",
                start_one_based, end_one_based
            ),
        });
    }

    let (lo, hi) = (start_one_based - 1, end_one_based);
    let expected_len = (hi - lo) as usize;

    // Only the fallback's error is surfaced; the first failure is discarded.
    let bases = provider
        .get_genomic_sequence(accession, lo, hi)
        .or_else(|_| {
            provider
                .get_sequence(accession, lo, hi)
                .map_err(|e| ConversionError::MissingReferenceData {
                    description: format!(
                        "could not fetch reference for {}:{}-{}: {}",
                        accession, start_one_based, end_one_based, e
                    ),
                })
        })?;

    if bases.len() == expected_len {
        return Ok(bases);
    }
    Err(ConversionError::MissingReferenceData {
        description: format!(
            "reference fetch for {}:{}-{} returned {} bases, expected {}",
            accession,
            start_one_based,
            end_one_based,
            bases.len(),
            expected_len
        ),
    })
}
/// Normalises sequence text for SPDI output.
///
/// Both modes ASCII-upper-case the input; RNA mode additionally rewrites
/// every `U` as `T`.
fn apply_alphabet(s: &str, alphabet: AlphabetMode) -> String {
    let upper = s.to_ascii_uppercase();
    match alphabet {
        AlphabetMode::Dna => upper,
        AlphabetMode::Rna => upper.replace('U', "T"),
    }
}
pub fn spdi_to_hgvs(spdi: &SpdiVariant) -> Result<HgvsVariant, ConversionError> {
let accession = parse_accession(&spdi.sequence)
.map(|(_, acc)| acc)
.map_err(|_| ConversionError::InvalidAccession {
description: format!("could not parse accession: {}", spdi.sequence),
})?;
let spdi_pos_zb = ZeroBasedPos::new(spdi.position);
let hgvs_pos_ob = spdi_pos_zb.to_one_based();
let hgvs_pos = hgvs_pos_ob.value();
let (interval, edit) = if spdi.is_identity() {
let seq = if spdi.deletion.is_empty() {
None
} else {
Some(string_to_sequence(&spdi.deletion)?)
};
(
Interval::point(GenomePos::new(hgvs_pos)),
NaEdit::Identity {
sequence: seq,
whole_entity: false,
},
)
} else if spdi.deletion.len() == 1 && spdi.insertion.len() == 1 {
let ref_base = char_to_base(spdi.deletion.chars().next().unwrap())?;
let alt_base = char_to_base(spdi.insertion.chars().next().unwrap())?;
(
Interval::point(GenomePos::new(hgvs_pos)),
NaEdit::Substitution {
reference: ref_base,
alternative: alt_base,
},
)
} else if spdi.is_deletion() {
let del_len = spdi.deletion.len();
let del_seq = string_to_sequence(&spdi.deletion)?;
let interval = if del_len > 1 {
Interval::new(
GenomePos::new(hgvs_pos),
GenomePos::new(hgvs_pos + del_len as u64 - 1),
)
} else {
Interval::point(GenomePos::new(hgvs_pos))
};
(
interval,
NaEdit::Deletion {
sequence: Some(del_seq),
length: None,
},
)
} else if spdi.is_insertion() {
let ins_seq = string_to_sequence(&spdi.insertion)?;
(
Interval::new(GenomePos::new(hgvs_pos), GenomePos::new(hgvs_pos + 1)),
NaEdit::Insertion {
sequence: InsertedSequence::Literal(ins_seq),
},
)
} else {
let del_len = spdi.deletion.len();
let ins_seq = string_to_sequence(&spdi.insertion)?;
let interval = if del_len > 1 {
Interval::new(
GenomePos::new(hgvs_pos),
GenomePos::new(hgvs_pos + del_len as u64 - 1),
)
} else {
Interval::point(GenomePos::new(hgvs_pos))
};
(
interval,
NaEdit::Delins {
sequence: InsertedSequence::Literal(ins_seq),
deleted: None,
deleted_length: None,
},
)
};
Ok(HgvsVariant::Genome(GenomeVariant {
accession,
gene_symbol: None,
loc_edit: LocEdit::new(interval, edit),
}))
}
/// Converts an SPDI to HGVS, upgrading insertions to duplications when the
/// reference shows the inserted bases repeat the preceding flank.
///
/// Non-insertions, and insertions that are not recognised as duplications,
/// produce the same result as [`spdi_to_hgvs`].
pub fn spdi_to_hgvs_with_ref<R>(
    spdi: &SpdiVariant,
    reference: &R,
) -> Result<HgvsVariant, ConversionError>
where
    R: crate::reference::provider::ReferenceProvider + ?Sized,
{
    let plain = spdi_to_hgvs(spdi)?;
    let recovered = if spdi.is_insertion() {
        recover_dup_from_insertion(spdi, reference, &plain)?
    } else {
        None
    };
    Ok(recovered.unwrap_or(plain))
}
/// Tries to re-express an SPDI insertion as an HGVS duplication.
///
/// The reference bases in the zero-based half-open window
/// `[position + 1 - len, position + 1)` are compared (ASCII
/// case-insensitively) with the inserted string. On a match a `dup` over that
/// window is returned, reusing the accession and gene symbol of `base`.
///
/// Returns `Ok(None)` when the insertion is empty, the window would start
/// before the sequence, the flank differs, or `base` is not a genomic variant.
///
/// # Errors
///
/// * [`ConversionError::InvalidPosition`] when `position + 1` overflows or the
///   inserted string is not a valid sequence.
/// * [`ConversionError::MissingReferenceData`] when the flank cannot be
///   fetched.
fn recover_dup_from_insertion<R>(
    spdi: &SpdiVariant,
    reference: &R,
    base: &HgvsVariant,
) -> Result<Option<HgvsVariant>, ConversionError>
where
    R: crate::reference::provider::ReferenceProvider + ?Sized,
{
    debug_assert!(spdi.is_insertion());
    let inserted = &spdi.insertion;
    let inserted_len = inserted.len() as u64;
    if inserted_len == 0 {
        return Ok(None);
    }

    let Some(flank_end) = spdi.position.checked_add(1) else {
        return Err(ConversionError::InvalidPosition {
            description: format!("SPDI position {} overflows on +1", spdi.position),
        });
    };
    // Not enough sequence to the left to hold a copy of the insertion.
    let Some(flank_start) = flank_end.checked_sub(inserted_len) else {
        return Ok(None);
    };

    let flank = reference
        .get_genomic_sequence(&spdi.sequence, flank_start, flank_end)
        .or_else(|_| reference.get_sequence(&spdi.sequence, flank_start, flank_end))
        .map_err(|e| ConversionError::MissingReferenceData {
            description: format!(
                "could not fetch 5' flank for {}:{}-{}: {}",
                spdi.sequence, flank_start, flank_end, e
            ),
        })?;

    let flank_matches =
        flank.len() as u64 == inserted_len && flank.eq_ignore_ascii_case(inserted);
    if !flank_matches {
        return Ok(None);
    }

    let last_one_based = flank_end;
    let first_one_based = last_one_based + 1 - inserted_len;
    let dup_seq = string_to_sequence(inserted)?;
    let interval = match inserted_len {
        1 => Interval::point(GenomePos::new(last_one_based)),
        _ => Interval::new(
            GenomePos::new(first_one_based),
            GenomePos::new(last_one_based),
        ),
    };

    let HgvsVariant::Genome(genome) = base else {
        return Ok(None);
    };
    let edit = NaEdit::Duplication {
        sequence: Some(dup_seq),
        length: None,
        uncertain_extent: None,
    };
    Ok(Some(HgvsVariant::Genome(GenomeVariant {
        accession: genome.accession.clone(),
        gene_symbol: genome.gene_symbol.clone(),
        loc_edit: LocEdit::new(interval, edit),
    })))
}
/// Parses `s` into a [`Sequence`].
///
/// A parse failure is reported as [`ConversionError::InvalidPosition`] with
/// the offending text in the message.
fn string_to_sequence(s: &str) -> Result<Sequence, ConversionError> {
    match s.parse::<Sequence>() {
        Ok(parsed) => Ok(parsed),
        Err(_) => Err(ConversionError::InvalidPosition {
            description: format!("invalid sequence: {}", s),
        }),
    }
}
/// Converts a single character into a nucleotide `Base`.
///
/// Characters that `Base::from_char` does not accept are reported as
/// [`ConversionError::InvalidPosition`].
fn char_to_base(c: char) -> Result<crate::hgvs::edit::Base, ConversionError> {
    match crate::hgvs::edit::Base::from_char(c) {
        Some(base) => Ok(base),
        None => Err(ConversionError::InvalidPosition {
            description: format!("invalid base character: {}", c),
        }),
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hgvs::parser::parse_hgvs;
#[test]
fn test_hgvs_to_spdi_substitution() {
let hgvs = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.sequence, "NC_000001.11");
assert_eq!(spdi.position, 12344);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_insertion() {
let hgvs = parse_hgvs("NC_000001.11:g.100_101insATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 99); assert_eq!(spdi.deletion, "");
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_deletion_with_seq() {
let hgvs = parse_hgvs("NC_000001.11:g.100_102delATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 99);
assert_eq!(spdi.deletion, "ATG");
assert_eq!(spdi.insertion, "");
}
#[test]
fn test_hgvs_to_spdi_deletion_without_seq() {
let hgvs = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn test_hgvs_to_spdi_delins_without_ref() {
let hgvs = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("unknown"));
}
#[test]
fn test_hgvs_to_spdi_delins_with_explicit_deleted_no_ref() {
let hgvs = parse_hgvs("NC_000001.11:g.100_102delATGinsTTCC").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs)
.expect("explicit deleted sequence should not require reference data");
assert_eq!(spdi.position, 99);
assert_eq!(spdi.deletion, "ATG");
assert_eq!(spdi.insertion, "TTCC");
}
#[test]
fn test_hgvs_to_spdi_duplication_with_seq() {
let hgvs = parse_hgvs("NC_000001.11:g.100_102dupATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 101); assert_eq!(spdi.deletion, "");
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_identity() {
let hgvs = parse_hgvs("NC_000001.11:g.100A=").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 99);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "A");
}
#[test]
fn test_hgvs_to_spdi_simple_cds_requires_provider() {
let hgvs = parse_hgvs("NM_000088.3:c.100A>G").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::ProviderRequired { .. })
));
let err = result.unwrap_err();
let msg = err.to_string();
assert!(msg.contains("c."), "message should mention c.: {}", msg);
assert!(
msg.contains("provider"),
"message should mention provider: {}",
msg
);
}
#[test]
fn test_hgvs_to_spdi_simple_short_form_inversion_requires_provider() {
let hgvs = parse_hgvs("NC_000001.11:g.100_200inv").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
/// A single-base SPDI substitution renders as an HGVS `g.` substitution at
/// the SPDI position plus one.
#[test]
fn test_spdi_to_hgvs_substitution() {
    let snv = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
    let rendered = spdi_to_hgvs(&snv).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.12345A>G");
}
/// A three-base SPDI deletion renders as an HGVS range deletion that keeps
/// the deleted bases.
#[test]
fn test_spdi_to_hgvs_deletion() {
    let deletion = SpdiVariant::deletion("NC_000001.11", 99, "ATG");
    let rendered = spdi_to_hgvs(&deletion).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100_102delATG");
}
/// An SPDI insertion at position 100 renders as an HGVS insertion between
/// positions 101 and 102.
#[test]
fn test_spdi_to_hgvs_insertion() {
    let insertion = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
    let rendered = spdi_to_hgvs(&insertion).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.101_102insATG");
}
/// An SPDI with differing deletion and insertion renders as an HGVS
/// `delins` carrying only the inserted bases.
#[test]
fn test_spdi_to_hgvs_delins() {
    let indel = SpdiVariant::delins("NC_000001.11", 99, "ATG", "TTCC");
    let rendered = spdi_to_hgvs(&indel).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100_102delinsTTCC");
}
/// An SPDI whose deletion equals its insertion renders as an HGVS identity
/// (`A=`).
#[test]
fn test_spdi_to_hgvs_identity() {
    let unchanged = SpdiVariant::new("NC_000001.11", 99, "A", "A");
    let rendered = spdi_to_hgvs(&unchanged).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100A=");
}
/// A one-base SPDI deletion renders with a single HGVS position rather
/// than a range.
#[test]
fn test_spdi_to_hgvs_single_del() {
    let deletion = SpdiVariant::deletion("NC_000001.11", 99, "A");
    let rendered = spdi_to_hgvs(&deletion).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100delA");
}
/// HGVS -> SPDI -> HGVS reproduces the original substitution string.
#[test]
fn test_roundtrip_substitution() {
    let original = "NC_000001.11:g.12345A>G";
    let forward = hgvs_to_spdi_simple(&parse_hgvs(original).unwrap()).unwrap();
    let round_tripped = spdi_to_hgvs(&forward).unwrap().to_string();
    assert_eq!(round_tripped, original);
}
/// HGVS -> SPDI -> HGVS reproduces the original insertion string.
#[test]
fn test_roundtrip_insertion() {
    let original = "NC_000001.11:g.100_101insATG";
    let forward = hgvs_to_spdi_simple(&parse_hgvs(original).unwrap()).unwrap();
    let round_tripped = spdi_to_hgvs(&forward).unwrap().to_string();
    assert_eq!(round_tripped, original);
}
/// HGVS -> SPDI -> HGVS reproduces a deletion that carries its explicit
/// deleted sequence.
#[test]
fn test_roundtrip_deletion_with_seq() {
    let original = "NC_000001.11:g.100_102delATG";
    let forward = hgvs_to_spdi_simple(&parse_hgvs(original).unwrap()).unwrap();
    let round_tripped = spdi_to_hgvs(&forward).unwrap().to_string();
    assert_eq!(round_tripped, original);
}
/// The `Display` output of two `ConversionError` variants contains the
/// expected fixed prefix text.
#[test]
fn test_error_display() {
    let unsupported = ConversionError::UnsupportedVariantType {
        description: String::from("test"),
    };
    let unsupported_text = unsupported.to_string();
    assert!(unsupported_text.contains("unsupported variant type"));

    let missing = ConversionError::MissingReferenceData {
        description: String::from("test"),
    };
    let missing_text = missing.to_string();
    assert!(missing_text.contains("missing reference data"));
}
/// Builds a mock provider holding one synthetic `NC_000001.11` contig.
///
/// Layout (1-based): `ATG` at 100-102, `GATTACA` at 200-206 and
/// `AAACCCGGGT` at 1000-1009, padded with `N` elsewhere and followed by
/// 50 trailing `N`s.
fn make_test_genomic_provider() -> crate::reference::mock::MockProvider {
    let contig = [
        "N".repeat(99),
        "ATG".to_string(),
        "N".repeat(97),
        "GATTACA".to_string(),
        "N".repeat(793),
        "AAACCCGGGT".to_string(),
        "N".repeat(50),
    ]
    .concat();
    let mut provider = crate::reference::mock::MockProvider::new();
    provider.add_genomic_sequence("NC_000001.11", &contig);
    provider
}
/// Fetching 100..=102 from the synthetic contig returns the `ATG` stored
/// there.
#[test]
fn fetch_reference_bases_returns_genomic_bases() {
    let genome = make_test_genomic_provider();
    let fetched = fetch_reference_bases(&genome, "NC_000001.11", 100, 102).unwrap();
    assert_eq!(fetched, "ATG");
}
/// With an empty provider the fetch fails with `MissingReferenceData`, and
/// the message names the accession and both coordinates.
#[test]
fn fetch_reference_bases_errors_when_provider_lacks_contig() {
    let empty = crate::reference::mock::MockProvider::new();
    let failure = fetch_reference_bases(&empty, "NC_000099.99", 100, 102).unwrap_err();
    assert!(matches!(failure, ConversionError::MissingReferenceData { .. }));
    let text = failure.to_string();
    for needle in ["NC_000099.99", "100", "102"] {
        assert!(text.contains(needle));
    }
}
/// Requesting 100..=102 from a three-base contig fails with
/// `MissingReferenceData`.
#[test]
fn fetch_reference_bases_errors_on_short_contig() {
    let mut tiny = crate::reference::mock::MockProvider::new();
    tiny.add_genomic_sequence("NC_000001.11", "ATG");
    let failure = fetch_reference_bases(&tiny, "NC_000001.11", 100, 102).unwrap_err();
    assert!(matches!(failure, ConversionError::MissingReferenceData { .. }));
}
/// With a provider, a short-form deletion has its deleted bases filled in
/// from the reference.
#[test]
fn hgvs_to_spdi_deletion_short_form_with_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.sequence, "NC_000001.11");
    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "");
}
/// With a provider, a short-form multi-base duplication becomes an
/// insertion of the reference bases.
#[test]
fn hgvs_to_spdi_duplication_short_form_with_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_102dup").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.position, 101);
    assert_eq!(converted.deletion, "");
    assert_eq!(converted.insertion, "ATG");
}
/// With a provider, a short-form single-base duplication becomes a
/// one-base insertion taken from the reference.
#[test]
fn hgvs_to_spdi_single_base_duplication_short_form_with_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100dup").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "");
    assert_eq!(converted.insertion, "A");
}
/// With a provider, a short-form `delins` takes its deleted bases from the
/// reference and keeps the stated inserted bases.
#[test]
fn hgvs_to_spdi_delins_short_form_with_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "TTCC");
}
/// A ten-base short-form deletion at 1000..=1009 is filled from the
/// synthetic contig.
#[test]
fn hgvs_to_spdi_long_deletion_with_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.1000_1009del").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.deletion, "AAACCCGGGT");
    assert_eq!(converted.deletion.len(), 10);
    assert_eq!(converted.insertion, "");
}
/// A deletion with explicit bases converts even though the provider is an
/// empty mock.
#[test]
fn hgvs_to_spdi_explicit_deletion_does_not_consult_provider() {
    let empty = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_102delATG").unwrap();
    let converted = hgvs_to_spdi(&variant, &empty).unwrap();
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "");
}
/// A duplication with explicit bases converts even though the provider is
/// an empty mock.
#[test]
fn hgvs_to_spdi_explicit_duplication_does_not_consult_provider() {
    let empty = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_102dupATG").unwrap();
    let converted = hgvs_to_spdi(&variant, &empty).unwrap();
    assert_eq!(converted.position, 101);
    assert_eq!(converted.deletion, "");
    assert_eq!(converted.insertion, "ATG");
}
/// A substitution converts to the same SPDI string when a provider is
/// supplied.
#[test]
fn hgvs_to_spdi_substitution_unaffected_by_provider() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
    let rendered = hgvs_to_spdi(&variant, &genome).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:12344:A:G");
}
/// An equal-length `delins` is resolved through the provider and then
/// rendered back to the same HGVS string.
#[test]
fn hgvs_to_spdi_mnv_delins_with_provider_round_trips() {
    let genome = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_102delinsGGG").unwrap();
    let converted = hgvs_to_spdi(&variant, &genome).unwrap();
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "GGG");
    let rendered = spdi_to_hgvs(&converted).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100_102delinsGGG");
}
/// A short-form deletion comes back from the round trip with the deleted
/// bases made explicit.
#[test]
fn hgvs_to_spdi_deletion_round_trip_with_provider() {
    let genome = make_test_genomic_provider();
    let source = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
    let converted = hgvs_to_spdi(&source, &genome).unwrap();
    let rendered = spdi_to_hgvs(&converted).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100_102delATG");
}
/// A short-form `delins` survives the provider-backed round trip
/// unchanged.
#[test]
fn hgvs_to_spdi_delins_round_trip_with_provider() {
    let genome = make_test_genomic_provider();
    let source = parse_hgvs("NC_000001.11:g.100_102delinsTTCC").unwrap();
    let converted = hgvs_to_spdi(&source, &genome).unwrap();
    let rendered = spdi_to_hgvs(&converted).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.100_102delinsTTCC");
}
/// A duplication converted with a provider comes back as an `ins` (not a
/// `dup`) when rendered through the reference-free `spdi_to_hgvs`.
#[test]
fn hgvs_to_spdi_dup_round_trip_emits_ins_form_via_reference_free_path() {
    let genome = make_test_genomic_provider();
    let source = parse_hgvs("NC_000001.11:g.100_102dup").unwrap();
    let converted = hgvs_to_spdi(&source, &genome).unwrap();
    assert_eq!(converted.position, 101);
    assert_eq!(converted.insertion, "ATG");
    let rendered = spdi_to_hgvs(&converted).unwrap().to_string();
    assert_eq!(rendered, "NC_000001.11:g.102_103insATG");
}
/// A short-form `m.` deletion is resolved against the provider's sequence
/// for the mitochondrial accession.
#[test]
fn hgvs_to_spdi_mito_short_form_deletion_with_provider() {
    // `GATC` occupies 1-based positions 16560..=16563.
    let mito_seq = ["N".repeat(16559), "GATC".to_string(), "N".repeat(20)].concat();
    let mut mito = crate::reference::mock::MockProvider::new();
    mito.add_genomic_sequence("NC_012920.1", &mito_seq);
    let variant = parse_hgvs("NC_012920.1:m.16560_16563del").unwrap();
    let converted = hgvs_to_spdi(&variant, &mito).unwrap();
    assert_eq!(converted.deletion, "GATC");
    assert_eq!(converted.insertion, "");
}
/// A short-form deletion against an empty provider fails with
/// `MissingReferenceData`, and the message names the accession.
#[test]
fn hgvs_to_spdi_deletion_with_provider_missing_data() {
    let empty = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
    let outcome = hgvs_to_spdi(&variant, &empty);
    assert!(matches!(
        outcome,
        Err(ConversionError::MissingReferenceData { .. })
    ));
    let text = outcome.unwrap_err().to_string();
    assert!(text.contains("NC_000001.11"));
}
/// Pins the provider-free behaviour: short-form `del`, `dup` and `delins`
/// all fail with `MissingReferenceData`.
#[test]
fn hgvs_to_spdi_simple_pins_existing_short_form_failures() {
    let short_forms = [
        "NC_000001.11:g.100_102del",
        "NC_000001.11:g.100_102dup",
        "NC_000001.11:g.100_102delinsTTCC",
    ];
    for input in short_forms {
        let parsed = parse_hgvs(input).unwrap();
        let outcome = hgvs_to_spdi_simple(&parsed);
        assert!(
            matches!(outcome, Err(ConversionError::MissingReferenceData { .. })),
            "expected MissingReferenceData for {} (got {:?})",
            input,
            outcome
        );
    }
}
#[test]
fn hgvs_to_spdi_short_form_is_idempotent() {
let provider = make_test_genomic_provider();
let hgvs = parse_hgvs("NC_000001.11:g.100_102del").unwrap();
let a = hgvs_to_spdi(&hgvs, &provider).unwrap();
let b = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(a.to_string(), b.to_string());
}
#[test]
fn test_spdi_empty_deletion_insertion() {
let spdi = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
assert!(spdi.is_insertion());
assert!(!spdi.is_deletion());
assert!(!spdi.is_identity());
assert_eq!(spdi.deletion, "");
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_spdi_empty_insertion_deletion() {
let spdi = SpdiVariant::deletion("NC_000001.11", 100, "ATG");
assert!(spdi.is_deletion());
assert!(!spdi.is_insertion());
assert!(!spdi.is_identity());
assert_eq!(spdi.deletion, "ATG");
assert_eq!(spdi.insertion, "");
}
#[test]
fn test_spdi_both_empty_is_identity() {
let spdi = SpdiVariant::new("NC_000001.11", 100, "", "");
assert!(spdi.is_identity());
assert!(!spdi.is_insertion());
assert!(!spdi.is_deletion());
}
#[test]
fn test_spdi_single_base_insertion() {
let spdi = SpdiVariant::insertion("NC_000001.11", 100, "A");
assert!(spdi.is_insertion());
assert_eq!(spdi.insertion.len(), 1);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("ins"));
}
#[test]
fn test_spdi_single_base_deletion() {
let spdi = SpdiVariant::deletion("NC_000001.11", 100, "A");
assert!(spdi.is_deletion());
assert_eq!(spdi.deletion.len(), 1);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("del"));
}
#[test]
fn test_spdi_long_insertion_100bp() {
let long_seq = "A".repeat(100);
let spdi = SpdiVariant::insertion("NC_000001.11", 12345, &long_seq);
assert!(spdi.is_insertion());
assert_eq!(spdi.insertion.len(), 100);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("ins"));
assert!(hgvs.to_string().ends_with(&format!("ins{}", long_seq)));
}
#[test]
fn test_spdi_long_deletion_100bp() {
let long_seq = "ACGT".repeat(25); let spdi = SpdiVariant::deletion("NC_000001.11", 12345, &long_seq);
assert!(spdi.is_deletion());
assert_eq!(spdi.deletion.len(), 100);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("del"));
}
#[test]
fn test_spdi_long_indel_asymmetric() {
let del_seq = "A".repeat(50);
let ins_seq = "G".repeat(150);
let spdi = SpdiVariant::delins("NC_000001.11", 12345, &del_seq, &ins_seq);
assert!(!spdi.is_insertion());
assert!(!spdi.is_deletion());
assert_eq!(spdi.deletion.len(), 50);
assert_eq!(spdi.insertion.len(), 150);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("delins"));
}
#[test]
fn test_spdi_very_long_insertion_1000bp() {
let long_seq = "ACGT".repeat(250); let spdi = SpdiVariant::insertion("NC_000001.11", 50000, &long_seq);
assert!(spdi.is_insertion());
assert_eq!(spdi.insertion.len(), 1000);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("ins"));
}
#[test]
fn test_spdi_position_zero() {
let spdi = SpdiVariant::new("NC_000001.11", 0, "A", "G");
assert_eq!(spdi.position, 0);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("g.1A>G"));
}
#[test]
fn test_spdi_position_max() {
let spdi = SpdiVariant::new("NC_000001.11", 248956421, "A", "G");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("248956422")); }
#[test]
fn test_spdi_lowercase_sequence() {
let spdi = SpdiVariant::new("NC_000001.11", 100, "a", "g");
assert_eq!(spdi.deletion, "a");
assert_eq!(spdi.insertion, "g");
}
#[test]
fn test_spdi_mixed_case_sequence() {
let spdi = SpdiVariant::new("NC_000001.11", 100, "AtGc", "GcTa");
assert_eq!(spdi.deletion, "AtGc");
assert_eq!(spdi.insertion, "GcTa");
}
#[test]
fn test_spdi_n_bases_in_sequence() {
let spdi = SpdiVariant::new("NC_000001.11", 100, "ANG", "TNC");
assert_eq!(spdi.deletion, "ANG");
assert_eq!(spdi.insertion, "TNC");
}
#[test]
fn test_spdi_complex_repeat_sequence() {
let repeat = "CAG".repeat(30); let spdi = SpdiVariant::insertion("NC_000004.12", 3074876, &repeat);
assert!(spdi.is_insertion());
assert_eq!(spdi.insertion.len(), 90);
}
#[test]
fn test_spdi_to_hgvs_delins_single_base_del() {
let spdi = SpdiVariant::delins("NC_000001.11", 100, "A", "TTTT");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("delinsTTTT"));
}
#[test]
fn test_spdi_to_hgvs_delins_single_base_ins() {
let spdi = SpdiVariant::delins("NC_000001.11", 100, "AAAA", "T");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("delinsT"));
}
#[test]
fn test_spdi_different_chromosome_formats() {
let test_cases = vec![
("NC_000001.11", "NC_000001.11"), ("NC_000023.11", "NC_000023.11"), ("NC_000024.10", "NC_000024.10"), ("NC_012920.1", "NC_012920.1"), ];
for (input_acc, expected_acc) in test_cases {
let spdi = SpdiVariant::new(input_acc, 100, "A", "G");
assert_eq!(spdi.sequence, expected_acc);
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().starts_with(expected_acc));
}
}
#[test]
fn test_spdi_roundtrip_preserves_case_normalized() {
let spdi = SpdiVariant::new("NC_000001.11", 100, "A", "G");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
let back = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(back.deletion, "A");
assert_eq!(back.insertion, "G");
let spdi_delins = SpdiVariant::new("NC_000001.11", 100, "ACGT", "TGCA");
let hgvs_delins = spdi_to_hgvs(&spdi_delins).unwrap();
let back_delins = hgvs_to_spdi_simple(&hgvs_delins);
assert!(back_delins.is_err());
}
#[test]
fn test_spdi_empty_seq_insertion_roundtrip() {
let spdi = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert!(hgvs.to_string().contains("ins"));
let back = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(back.deletion, "");
assert_eq!(back.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_various_accession_types() {
let test_variants = vec![
"NC_000001.11:g.12345A>G", "NC_000023.11:g.12345A>G", "NC_012920.1:g.12345A>G", ];
for variant_str in test_variants {
let hgvs = parse_hgvs(variant_str).unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs);
assert!(spdi.is_ok(), "Failed for: {}", variant_str);
}
}
#[test]
fn test_spdi_display_format() {
let spdi = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
assert_eq!(spdi.to_string(), "NC_000001.11:12344:A:G");
let spdi_del = SpdiVariant::deletion("NC_000001.11", 100, "ATG");
assert_eq!(spdi_del.to_string(), "NC_000001.11:100:ATG:");
let spdi_ins = SpdiVariant::insertion("NC_000001.11", 100, "ATG");
assert_eq!(spdi_ins.to_string(), "NC_000001.11:100::ATG");
}
#[test]
fn test_spdi_identity_various_lengths() {
let spdi1 = SpdiVariant::new("NC_000001.11", 100, "A", "A");
assert!(spdi1.is_identity());
let spdi2 = SpdiVariant::new("NC_000001.11", 100, "ATG", "ATG");
assert!(spdi2.is_identity());
let spdi3 = SpdiVariant::new("NC_000001.11", 100, "", "");
assert!(spdi3.is_identity());
}
fn provider_with_genomic(seq: &str) -> crate::reference::mock::MockProvider {
let mut p = crate::reference::mock::MockProvider::new();
p.add_genomic_sequence("NC_000001.11", seq);
p
}
#[test]
fn spdi_to_hgvs_with_ref_recovers_multi_base_dup() {
let mut contig = "N".repeat(99);
contig.push_str("ATG"); contig.push_str(&"N".repeat(50));
let provider = provider_with_genomic(&contig);
let spdi = SpdiVariant::insertion("NC_000001.11", 101, "ATG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.100_102dupATG");
}
#[test]
fn spdi_to_hgvs_with_ref_recovers_single_base_dup() {
let mut contig = "N".repeat(99);
contig.push('A'); contig.push_str(&"N".repeat(20));
let provider = provider_with_genomic(&contig);
let spdi = SpdiVariant::insertion("NC_000001.11", 99, "A");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.100dupA");
}
#[test]
fn spdi_to_hgvs_with_ref_keeps_ins_when_no_match() {
let mut contig = "N".repeat(99);
contig.push_str("CCC"); contig.push_str(&"N".repeat(20));
let provider = provider_with_genomic(&contig);
let spdi = SpdiVariant::insertion("NC_000001.11", 101, "ATG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.102_103insATG");
}
#[test]
fn spdi_to_hgvs_with_ref_keeps_ins_at_contig_start() {
let provider = provider_with_genomic("ATGCATGCATGC");
let spdi = SpdiVariant::insertion("NC_000001.11", 0, "ATG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.1_2insATG");
}
#[test]
fn spdi_to_hgvs_with_ref_substitution_unchanged() {
let provider = provider_with_genomic(&"N".repeat(20000));
let spdi = SpdiVariant::new("NC_000001.11", 12344, "A", "G");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.12345A>G");
}
#[test]
fn spdi_to_hgvs_with_ref_deletion_unchanged() {
let provider = provider_with_genomic(&"N".repeat(2000));
let spdi = SpdiVariant::deletion("NC_000001.11", 99, "ATG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.100_102delATG");
}
#[test]
fn spdi_to_hgvs_with_ref_delins_unchanged() {
let provider = provider_with_genomic(&"N".repeat(2000));
let spdi = SpdiVariant::delins("NC_000001.11", 99, "ATG", "TTCC");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.100_102delinsTTCC");
}
#[test]
fn spdi_to_hgvs_with_ref_identity_unchanged() {
let provider = provider_with_genomic(&"N".repeat(2000));
let spdi = SpdiVariant::new("NC_000001.11", 99, "A", "A");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.100A=");
}
#[test]
fn spdi_to_hgvs_with_ref_propagates_ref_error() {
let provider = crate::reference::mock::MockProvider::new();
let spdi = SpdiVariant::insertion("NC_000999.99", 101, "ATG");
let result = spdi_to_hgvs_with_ref(&spdi, &provider);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
use crate::reference::mock::MockProvider;
use crate::reference::transcript::{Exon, GenomeBuild, ManeStatus, Strand};
fn make_test_provider() -> MockProvider {
let tx = Transcript::new(
"NM_TEST.1".to_string(),
Some("TEST".to_string()),
Strand::Plus,
"AAAAATGCCCAAAGGGTTTAGGCCCAAAGGGTTATAAA".to_string() + "AA",
Some(6),
Some(35),
vec![Exon::new(1, 1, 40)],
None,
None,
None,
GenomeBuild::default(),
ManeStatus::default(),
None,
None,
);
let mut provider = MockProvider::new();
provider.add_transcript(tx);
provider
}
fn make_intronic_provider() -> MockProvider {
let tx = Transcript::new(
"NM_INTRON.1".to_string(),
Some("INTRON".to_string()),
Strand::Plus,
"A".repeat(100),
Some(11),
Some(90),
vec![Exon::new(1, 1, 50), Exon::new(2, 51, 100)],
None,
None,
None,
GenomeBuild::default(),
ManeStatus::default(),
None,
None,
);
let mut provider = MockProvider::new();
provider.add_transcript(tx);
provider
}
#[test]
fn test_hgvs_to_spdi_simple_mt_substitution() {
let hgvs = parse_hgvs("NC_012920.1:m.3243A>G").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.sequence, "NC_012920.1");
assert_eq!(spdi.position, 3242);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
assert_eq!(spdi.to_string(), "NC_012920.1:3242:A:G");
}
#[test]
fn test_hgvs_to_spdi_simple_mt_insertion() {
let hgvs = parse_hgvs("NC_012920.1:m.100_101insATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 99);
assert_eq!(spdi.deletion, "");
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_simple_mt_deletion_with_seq() {
let hgvs = parse_hgvs("NC_012920.1:m.3243_3245delAGG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 3242);
assert_eq!(spdi.deletion, "AGG");
assert_eq!(spdi.insertion, "");
}
#[test]
fn test_hgvs_to_spdi_simple_mt_deletion_without_seq_needs_ref() {
let hgvs = parse_hgvs("NC_012920.1:m.3243_3245del").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn dup_hgvs_to_spdi_to_hgvs_with_ref_roundtrip_multi_base() {
let mut contig = "N".repeat(99);
contig.push_str("ATG");
contig.push_str(&"N".repeat(20));
let provider = provider_with_genomic(&contig);
let original = "NC_000001.11:g.100_102dupATG";
let hgvs = parse_hgvs(original).unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.to_string(), "NC_000001.11:101::ATG");
let recovered = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(recovered.to_string(), original);
}
#[test]
fn dup_hgvs_to_spdi_to_hgvs_with_ref_roundtrip_single_base() {
let mut contig = "N".repeat(99);
contig.push('A');
contig.push_str(&"N".repeat(20));
let provider = provider_with_genomic(&contig);
let original = "NC_000001.11:g.100dupA";
let hgvs = parse_hgvs(original).unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.to_string(), "NC_000001.11:99::A");
let recovered = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(recovered.to_string(), original);
}
#[test]
fn spdi_to_hgvs_with_ref_does_not_false_detect_non_tandem_insertion() {
let contig = "ATCGATCGATCGAGGGTCCC".to_string();
let provider = provider_with_genomic(&contig);
let spdi = SpdiVariant::insertion("NC_000001.11", 13, "ATCGATCGATCG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
let s = hgvs.to_string();
assert!(s.contains("ins"), "expected ins-form, got {}", s);
assert!(!s.contains("dup"), "expected not dup, got {}", s);
}
#[test]
fn audit_pin_no_ref_spdi_to_hgvs_renders_dup_shape_as_ins() {
let spdi = SpdiVariant::insertion("NC_000001.11", 101, "ATG");
let hgvs = spdi_to_hgvs(&spdi).unwrap();
assert_eq!(hgvs.to_string(), "NC_000001.11:g.102_103insATG");
}
#[test]
fn dup_recovery_is_idempotent_through_two_roundtrips() {
let mut contig = "N".repeat(99);
contig.push_str("ATG");
contig.push_str(&"N".repeat(20));
let provider = provider_with_genomic(&contig);
let original = "NC_000001.11:g.100_102dupATG";
let hgvs1 = parse_hgvs(original).unwrap();
let spdi1 = hgvs_to_spdi_simple(&hgvs1).unwrap();
let hgvs2 = spdi_to_hgvs_with_ref(&spdi1, &provider).unwrap();
let spdi2 = hgvs_to_spdi_simple(&hgvs2).unwrap();
let hgvs3 = spdi_to_hgvs_with_ref(&spdi2, &provider).unwrap();
assert_eq!(spdi1, spdi2);
assert_eq!(hgvs2.to_string(), hgvs3.to_string());
assert_eq!(hgvs3.to_string(), original);
}
#[test]
fn spdi_to_hgvs_with_ref_recovers_dup_for_mito_accession() {
let mut contig = "N".repeat(99);
contig.push_str("ATG");
contig.push_str(&"N".repeat(20));
let mut provider = crate::reference::mock::MockProvider::new();
provider.add_genomic_sequence("NC_012920.1", &contig);
let spdi = SpdiVariant::insertion("NC_012920.1", 101, "ATG");
let hgvs = spdi_to_hgvs_with_ref(&spdi, &provider).unwrap();
assert_eq!(hgvs.to_string(), "NC_012920.1:g.100_102dupATG");
}
#[test]
fn test_hgvs_to_spdi_simple_mt_dup_with_seq() {
let hgvs = parse_hgvs("NC_012920.1:m.100_102dupATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 101);
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_simple_mt_identity() {
let hgvs = parse_hgvs("NC_012920.1:m.3243A=").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 3242);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "A");
}
#[test]
fn test_hgvs_to_spdi_simple_tx_substitution() {
let hgvs = parse_hgvs("NR_046018.2:n.5C>G").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.sequence, "NR_046018.2");
assert_eq!(spdi.position, 4); assert_eq!(spdi.deletion, "C");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_simple_tx_insertion() {
let hgvs = parse_hgvs("NR_046018.2:n.10_11insATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.sequence, "NR_046018.2");
assert_eq!(spdi.position, 9);
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_simple_tx_deletion_with_seq() {
let hgvs = parse_hgvs("NR_046018.2:n.10_12delATG").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 9);
assert_eq!(spdi.deletion, "ATG");
assert_eq!(spdi.insertion, "");
}
#[test]
fn test_hgvs_to_spdi_simple_tx_identity() {
let hgvs = parse_hgvs("NR_046018.2:n.10A=").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 9);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "A");
}
#[test]
fn test_hgvs_to_spdi_simple_tx_intronic_needs_provider() {
let hgvs = parse_hgvs("NR_046018.2:n.100+5A>G").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
let msg = result.unwrap_err().to_string();
assert!(msg.contains("intronic"), "msg: {}", msg);
}
#[test]
fn test_hgvs_to_spdi_simple_tx_downstream_needs_provider() {
let hgvs = parse_hgvs("NR_046018.2:n.*5A>G").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn test_hgvs_to_spdi_simple_tx_negative_base_needs_provider() {
let hgvs = parse_hgvs("NR_046018.2:n.-3A>G").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn test_hgvs_to_spdi_simple_dna_lowercase_uppercased() {
let hgvs = parse_hgvs("NC_000001.11:g.100a>g").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_simple_rna_substitution_lowercase() {
let hgvs = parse_hgvs("NR_046018.2:r.5c>g").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.sequence, "NR_046018.2");
assert_eq!(spdi.position, 4);
assert_eq!(spdi.deletion, "C");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_simple_rna_substitution_u_to_t() {
let hgvs = parse_hgvs("NR_046018.2:r.5u>g").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.deletion, "T");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_simple_rna_insertion_u_to_t() {
let hgvs = parse_hgvs("NR_046018.2:r.10_11insauug").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 9);
assert_eq!(spdi.deletion, "");
assert_eq!(spdi.insertion, "ATTG");
}
#[test]
fn test_hgvs_to_spdi_simple_rna_deletion_with_seq() {
let hgvs = parse_hgvs("NR_046018.2:r.10_12delauu").unwrap();
let spdi = hgvs_to_spdi_simple(&hgvs).unwrap();
assert_eq!(spdi.position, 9);
assert_eq!(spdi.deletion, "ATT"); }
#[test]
fn test_hgvs_to_spdi_simple_rna_intronic_needs_provider() {
let hgvs = parse_hgvs("NR_046018.2:r.10+5a>g").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn test_hgvs_to_spdi_simple_protein_rejected() {
let hgvs = parse_hgvs("NP_000079.2:p.Arg600Gln").unwrap();
let result = hgvs_to_spdi_simple(&hgvs);
assert!(matches!(
result,
Err(ConversionError::UnsupportedVariantType { .. })
));
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("protein") && msg.contains("SPDI"),
"expected helpful protein-rejection message; got: {}",
msg
);
}
#[test]
fn test_hgvs_to_spdi_with_provider_protein_rejected() {
let provider = MockProvider::new();
let hgvs = parse_hgvs("NP_000079.2:p.Arg600Gln").unwrap();
let result = hgvs_to_spdi(&hgvs, &provider);
assert!(matches!(
result,
Err(ConversionError::UnsupportedVariantType { .. })
));
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_substitution() {
let provider = make_test_provider();
let hgvs = parse_hgvs("NM_TEST.1:c.1A>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.sequence, "NM_TEST.1");
assert_eq!(spdi.position, 5);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_insertion() {
let provider = make_test_provider();
let hgvs = parse_hgvs("NM_TEST.1:c.1_2insATG").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.position, 5);
assert_eq!(spdi.insertion, "ATG");
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_deletion_with_seq() {
let provider = make_test_provider();
let hgvs = parse_hgvs("NM_TEST.1:c.1_3delATG").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.position, 5);
assert_eq!(spdi.deletion, "ATG");
assert_eq!(spdi.insertion, "");
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_5utr() {
let provider = make_test_provider();
let hgvs = parse_hgvs("NM_TEST.1:c.-3A>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.position, 2);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_3utr() {
let provider = make_test_provider();
let hgvs = parse_hgvs("NM_TEST.1:c.*2A>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.position, 36);
assert_eq!(spdi.deletion, "A");
assert_eq!(spdi.insertion, "G");
}
#[test]
fn test_hgvs_to_spdi_with_provider_cds_intronic_rejected() {
let provider = make_intronic_provider();
let hgvs = parse_hgvs("NM_INTRON.1:c.10+5A>G").unwrap();
let result = hgvs_to_spdi(&hgvs, &provider);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
}
#[test]
fn test_hgvs_to_spdi_with_provider_unknown_transcript() {
let provider = MockProvider::new();
let hgvs = parse_hgvs("NM_TEST.1:c.1A>G").unwrap();
let result = hgvs_to_spdi(&hgvs, &provider);
assert!(matches!(
result,
Err(ConversionError::MissingReferenceData { .. })
));
let msg = result.unwrap_err().to_string();
assert!(msg.contains("transcript") || msg.contains("NM_TEST"));
}
#[test]
fn test_hgvs_to_spdi_with_provider_falls_through_to_simple_for_genome() {
let provider = MockProvider::new();
let hgvs = parse_hgvs("NC_000001.11:g.12345A>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.to_string(), "NC_000001.11:12344:A:G");
}
#[test]
fn test_hgvs_to_spdi_with_provider_falls_through_to_simple_for_mt() {
let provider = MockProvider::new();
let hgvs = parse_hgvs("NC_012920.1:m.3243A>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.to_string(), "NC_012920.1:3242:A:G");
}
#[test]
fn test_hgvs_to_spdi_with_provider_falls_through_to_simple_for_exonic_n() {
let provider = MockProvider::new();
let hgvs = parse_hgvs("NR_046018.2:n.5C>G").unwrap();
let spdi = hgvs_to_spdi(&hgvs, &provider).unwrap();
assert_eq!(spdi.to_string(), "NR_046018.2:4:C:G");
}
/// A downstream `n.*5` position on a registered 40-base, single-exon
/// transcript must be rejected with `InvalidPosition`, and the message must
/// mention both "downstream n." and "genomic projection".
#[test]
fn test_hgvs_to_spdi_with_provider_n_downstream_rejected() {
    // 40 bases of "A" covered by a single exon built as `Exon::new(1, 1, 40)`.
    let bases = "A".repeat(40);
    let exons = vec![Exon::new(1, 1, 40)];
    let noncoding = Transcript::new(
        "NR_NONCODING.1".to_string(),
        Some("NONCODING".to_string()),
        Strand::Plus,
        bases,
        None,
        None,
        exons,
        None,
        None,
        None,
        GenomeBuild::default(),
        ManeStatus::default(),
        None,
        None,
    );

    let mut reference = MockProvider::new();
    reference.add_transcript(noncoding);

    let variant = parse_hgvs("NR_NONCODING.1:n.*5A>G").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(failure, ConversionError::InvalidPosition { .. }));

    let rendered = failure.to_string();
    let names_downstream = rendered.contains("downstream n.");
    let names_projection = rendered.contains("genomic projection");
    assert!(
        names_downstream && names_projection,
        "expected downstream-n rejection, got: {}",
        rendered
    );
}
/// The provider-less entry point cannot convert an `n.` inversion written
/// without an explicit sequence; it must report `MissingReferenceData`.
#[test]
fn test_hgvs_to_spdi_simple_n_short_form_inversion_requires_provider() {
    let variant = parse_hgvs("NR_046018.2:n.10_20inv").unwrap();
    let outcome = hgvs_to_spdi_simple(&variant);
    let needs_reference = matches!(outcome, Err(ConversionError::MissingReferenceData { .. }));
    assert!(needs_reference);
}
/// The provider-less entry point cannot convert an `m.` inversion written
/// without an explicit sequence; it must report `MissingReferenceData`.
#[test]
fn test_hgvs_to_spdi_simple_m_short_form_inversion_requires_provider() {
    let variant = parse_hgvs("NC_012920.1:m.100_200inv").unwrap();
    let outcome = hgvs_to_spdi_simple(&variant);
    let needs_reference = matches!(outcome, Err(ConversionError::MissingReferenceData { .. }));
    assert!(needs_reference);
}
/// A short-form genomic inversion over g.100_102 is resolved through the
/// provider: the deleted bases are expected to be "ATG" and the inserted
/// bases "CAT".
#[test]
fn test_hgvs_to_spdi_inversion_short_form_with_provider() {
    // NOTE(review): presumably the shared fixture holds "ATG" at 100..=102 of
    // NC_000001.11 — confirm against `make_test_genomic_provider`.
    let reference = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_102inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NC_000001.11");
    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
}
/// When the inverted bases are spelled out (`invATG`), the provider-less entry
/// point converts the variant on its own.
#[test]
fn test_hgvs_to_spdi_inversion_explicit_sequence_no_provider() {
    let variant = parse_hgvs("NC_000001.11:g.100_102invATG").unwrap();
    let converted = hgvs_to_spdi_simple(&variant).unwrap();

    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
}
/// An explicit-sequence inversion converts through the provider-taking entry
/// point even though the provider is freshly constructed and no sequence was
/// added to it.
#[test]
fn test_hgvs_to_spdi_inversion_explicit_sequence_does_not_consult_provider() {
    let bare_provider = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_102invATG").unwrap();
    let converted = hgvs_to_spdi(&variant, &bare_provider).unwrap();

    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
}
/// A one-base inversion (g.100_100inv) is expected to delete "A" and insert
/// its complement "T" at SPDI position 99.
#[test]
fn test_hgvs_to_spdi_inversion_single_base() {
    let reference = make_test_genomic_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_100inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.position, 99);
    assert_eq!(converted.deletion, "A");
    assert_eq!(converted.insertion, "T");
}
/// Inverting "ATAT" — which equals its own reverse complement — must produce
/// identical deletion and insertion strings.
#[test]
fn test_hgvs_to_spdi_inversion_palindrome_round_trip() {
    // Contig layout: 99 N, then "ATAT" (addressed below as g.100_103), then 50 N.
    let leading = "N".repeat(99);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATAT", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NC_000001.11", &contig);

    let variant = parse_hgvs("NC_000001.11:g.100_103inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.deletion, "ATAT");
    assert_eq!(converted.insertion, "ATAT");
}
/// A short-form inversion against a freshly constructed provider (no sequence
/// added) must fail with `MissingReferenceData`, and the message must name the
/// accession and both interval endpoints.
#[test]
fn test_hgvs_to_spdi_inversion_short_form_missing_provider_data() {
    let bare_provider = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_102inv").unwrap();

    let failure = hgvs_to_spdi(&variant, &bare_provider).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::MissingReferenceData { .. }
    ));

    let rendered = failure.to_string();
    assert!(rendered.contains("NC_000001.11"));
    assert!(rendered.contains("100"));
    assert!(rendered.contains("102"));
}
/// A short-form mitochondrial inversion (m.100_102inv) is resolved from the
/// sequence registered on the provider.
#[test]
fn test_hgvs_to_spdi_inversion_m_short_form_with_provider() {
    // Contig layout: 99 N, then "ATG" (addressed below as m.100_102), then 50 N.
    let leading = "N".repeat(99);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATG", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NC_012920.1", &contig);

    let variant = parse_hgvs("NC_012920.1:m.100_102inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NC_012920.1");
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
}
/// A short-form non-coding inversion (n.10_12inv) is resolved from the
/// sequence registered on the provider under the transcript accession.
#[test]
fn test_hgvs_to_spdi_inversion_n_short_form_with_provider() {
    // Contig layout: 9 N, then "ATGCATGC" starting at base 10, then 50 N.
    let leading = "N".repeat(9);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATGCATGC", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NR_046018.2", &contig);

    let variant = parse_hgvs("NR_046018.2:n.10_12inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NR_046018.2");
    assert_eq!(converted.position, 9);
    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
}
/// A short-form RNA (`r.`) inversion must come back in the DNA alphabet:
/// neither the deletion nor the insertion string may contain 'U'.
#[test]
fn test_hgvs_to_spdi_inversion_r_short_form_dna_alphabet() {
    // Contig layout: 9 N, then "ATGCATGC" starting at base 10, then 50 N.
    let leading = "N".repeat(9);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATGCATGC", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NR_046018.2", &contig);

    let variant = parse_hgvs("NR_046018.2:r.10_12inv").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.deletion, "ATG");
    assert_eq!(converted.insertion, "CAT");
    assert!(!converted.deletion.contains('U'));
    assert!(!converted.insertion.contains('U'));
}
/// Builds a mock provider whose `NC_000001.11` contig carries two AT repeat
/// tracts used by the repeat tests.
///
/// Contig layout, counting bases from 1:
/// - 1..=99: `N` padding
/// - 100..=105: `ATATAT` (three AT units)
/// - 106..=199: `N` padding
/// - 200..=209: `ATATATATAT` (five AT units)
/// - 210..=259: `N` padding
fn make_repeat_provider() -> crate::reference::mock::MockProvider {
    let contig = [
        "N".repeat(99),
        "ATATAT".to_string(),
        "N".repeat(94),
        "ATATATATAT".to_string(),
        "N".repeat(50),
    ]
    .concat();

    let mut provider = crate::reference::mock::MockProvider::new();
    provider.add_genomic_sequence("NC_000001.11", &contig);
    provider
}
/// Expanding the three-unit AT tract at g.100_105 to five units: the deletion
/// is the reference tract and the insertion is the unit repeated five times.
#[test]
fn test_hgvs_to_spdi_repeat_expansion_with_provider() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[5]").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NC_000001.11");
    assert_eq!(converted.position, 99);
    // Three reference units out, five units in.
    assert_eq!(converted.deletion, "ATATAT");
    assert_eq!(converted.insertion, "ATATATATAT");
}
/// Contracting the five-unit AT tract at g.200_209 to three units: the
/// deletion is the reference tract and the insertion is three units.
#[test]
fn test_hgvs_to_spdi_repeat_contraction_with_provider() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.200_209AT[3]").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.position, 199);
    // Five reference units out, three units in.
    assert_eq!(converted.deletion, "ATATATATAT");
    assert_eq!(converted.insertion, "ATATAT");
}
/// A repeat count equal to the reference count (three AT units at g.100_105)
/// still converts; deletion and insertion are the same string.
#[test]
fn test_hgvs_to_spdi_repeat_no_change_with_provider() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[3]").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.deletion, "ATATAT");
    assert_eq!(converted.insertion, "ATATAT");
}
/// The provider-less entry point must refuse a repeat edit with
/// `MissingReferenceData`.
#[test]
fn test_hgvs_to_spdi_simple_repeat_requires_provider() {
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[5]").unwrap();
    let failure = hgvs_to_spdi_simple(&variant).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::MissingReferenceData { .. }
    ));
}
/// A repeat edit against a freshly constructed provider (no sequence added)
/// must fail with `MissingReferenceData`, and the message must name the
/// accession and both interval endpoints.
#[test]
fn test_hgvs_to_spdi_repeat_missing_provider_data() {
    let bare_provider = crate::reference::mock::MockProvider::new();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[5]").unwrap();

    let failure = hgvs_to_spdi(&variant, &bare_provider).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::MissingReferenceData { .. }
    ));

    let rendered = failure.to_string();
    assert!(rendered.contains("NC_000001.11"));
    assert!(rendered.contains("100"));
    assert!(rendered.contains("105"));
}
/// A ranged repeat count (`[3_5]`) must be rejected with
/// `UnsupportedEditType`.
#[test]
fn test_hgvs_to_spdi_repeat_uncertain_count_unsupported() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[3_5]").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::UnsupportedEditType { .. }
    ));
}
/// An unknown repeat count (`[?]`) must be rejected with
/// `UnsupportedEditType`.
#[test]
fn test_hgvs_to_spdi_repeat_unknown_count_unsupported() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[?]").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::UnsupportedEditType { .. }
    ));
}
/// A repeat written with two bracketed counts (`[3][5]`) must be rejected
/// with `UnsupportedEditType`.
#[test]
fn test_hgvs_to_spdi_repeat_genotype_unsupported() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105AT[3][5]").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::UnsupportedEditType { .. }
    ));
}
/// A parenthesised count with no repeat unit (`g.100_105(5)`) must fail
/// conversion with `MissingReferenceData`.
///
/// NOTE(review): the test name says "unsupported" but the asserted variant is
/// `MissingReferenceData`, not `UnsupportedEditType` — confirm this is the
/// intended classification.
#[test]
fn test_hgvs_to_spdi_repeat_no_unit_unsupported() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_105(5)").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::MissingReferenceData { .. }
    ));
}
/// A five-base interval (g.100_104) cannot hold a whole number of two-base
/// "AT" units; conversion must fail with `InvalidPosition` and a message
/// containing "not a multiple".
#[test]
fn test_hgvs_to_spdi_repeat_span_not_multiple_of_unit() {
    let reference = make_repeat_provider();
    let variant = parse_hgvs("NC_000001.11:g.100_104AT[5]").unwrap();

    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(failure, ConversionError::InvalidPosition { .. }));

    let rendered = failure.to_string();
    assert!(rendered.contains("not a multiple"));
}
/// When the reference bases in the interval are not a tiling of the stated
/// unit ("ATGCAT" vs. unit "AT"), conversion must fail with `InvalidPosition`
/// and a message containing "does not match repeat unit".
#[test]
fn test_hgvs_to_spdi_repeat_span_does_not_match_unit() {
    // Contig layout: 99 N, then "ATGCAT" (addressed below as g.100_105), then 50 N.
    let leading = "N".repeat(99);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATGCAT", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NC_000001.11", &contig);

    let variant = parse_hgvs("NC_000001.11:g.100_105AT[5]").unwrap();
    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(failure, ConversionError::InvalidPosition { .. }));

    let rendered = failure.to_string();
    let reports_mismatch = rendered.contains("does not match repeat unit");
    assert!(
        reports_mismatch,
        "expected mismatch message, got: {}",
        rendered
    );
}
/// A repeat expansion on a non-coding (`n.`) accession is resolved from the
/// sequence registered on the provider under that accession.
#[test]
fn test_hgvs_to_spdi_repeat_n_with_provider() {
    // Contig layout: 9 N, then "ATATAT" (addressed below as n.10_15), then 50 N.
    let leading = "N".repeat(9);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATATAT", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NR_046018.2", &contig);

    let variant = parse_hgvs("NR_046018.2:n.10_15AT[5]").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NR_046018.2");
    assert_eq!(converted.position, 9);
    assert_eq!(converted.deletion, "ATATAT");
    assert_eq!(converted.insertion, "ATATATATAT");
}
/// A repeat count large enough to push the expanded insertion past
/// `MAX_REPEAT_EXPANSION_BASES` must be rejected with `UnsupportedEditType`
/// and a message containing "exceeds SPDI ins-string cap".
#[test]
fn test_hgvs_to_spdi_repeat_expansion_too_large() {
    let reference = make_repeat_provider();

    // With a two-base unit ("AT"), this many copies expands to two bases more
    // than MAX_REPEAT_EXPANSION_BASES.
    let copies = MAX_REPEAT_EXPANSION_BASES / 2 + 1;
    let expression = format!("NC_000001.11:g.100_105AT[{}]", copies);
    let variant = parse_hgvs(&expression).unwrap();

    let failure = hgvs_to_spdi(&variant, &reference).unwrap_err();
    assert!(matches!(
        failure,
        ConversionError::UnsupportedEditType { .. }
    ));

    let rendered = failure.to_string();
    let reports_cap = rendered.contains("exceeds SPDI ins-string cap");
    assert!(reports_cap, "expected size-cap message, got: {}", rendered);
}
/// A repeat expansion on a mitochondrial (`m.`) accession is resolved from the
/// sequence registered on the provider under that accession.
#[test]
fn test_hgvs_to_spdi_repeat_m_with_provider() {
    // Contig layout: 99 N, then "ATATAT" (addressed below as m.100_105), then 50 N.
    let leading = "N".repeat(99);
    let trailing = "N".repeat(50);
    let contig = [leading.as_str(), "ATATAT", trailing.as_str()].concat();

    let mut reference = crate::reference::mock::MockProvider::new();
    reference.add_genomic_sequence("NC_012920.1", &contig);

    let variant = parse_hgvs("NC_012920.1:m.100_105AT[5]").unwrap();
    let converted = hgvs_to_spdi(&variant, &reference).unwrap();

    assert_eq!(converted.sequence, "NC_012920.1");
    assert_eq!(converted.deletion, "ATATAT");
    assert_eq!(converted.insertion, "ATATATATAT");
}
}