use crate::hgvs::edit::{Base, InsertedSequence, NaEdit, Sequence};
use crate::hgvs::interval::{Interval, UncertainBoundary};
use crate::hgvs::location::{CdsPos, GenomePos, RnaPos, TxPos};
use crate::hgvs::uncertainty::Mu;
use crate::hgvs::variant::{
AllelePhase, CdsVariant, GenomeVariant, HgvsVariant, LocEdit, MtVariant, RnaVariant, TxVariant,
};
use crate::reference::ReferenceProvider;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Region {
Genome,
Cds,
FivePrimeUtr,
ThreePrimeUtr,
Tx,
TxUpstream,
TxDownstream,
}
#[derive(Debug, Clone)]
struct Anchor {
region: Region,
start: i64,
end: i64,
alt: Vec<Base>,
}
pub(crate) fn merge_consecutive_edits<P: ReferenceProvider>(
variants: Vec<HgvsVariant>,
phase: AllelePhase,
provider: &P,
) -> Vec<HgvsVariant> {
if phase != AllelePhase::Cis || variants.len() < 2 {
return variants;
}
let mut output: Vec<HgvsVariant> = Vec::with_capacity(variants.len());
let mut head_anchor: Option<Anchor> = None;
let mut head_merged = false;
for next in variants {
let merged = 'try_merge: {
let Some(prev_a) = head_anchor.as_mut() else {
break 'try_merge false;
};
let Some((next_region, next_start, _)) = simple_range_for_variant(&next) else {
break 'try_merge false;
};
if prev_a.region != next_region {
break 'try_merge false;
}
let strict_adjacent = prev_a.end.checked_add(1) == Some(next_start);
let codon_frame_eligible = !strict_adjacent
&& prev_a.region == Region::Cds
&& prev_a.end.checked_add(2) == Some(next_start)
&& prev_a.alt.len() == 1
&& prev_a.start == prev_a.end
&& same_codon(prev_a.end, next_start);
if !strict_adjacent && !codon_frame_eligible {
break 'try_merge false;
}
if !same_accession_and_kind(output.last().unwrap(), &next) {
break 'try_merge false;
}
let Some(next_anchor) = anchor_for_variant(&next) else {
break 'try_merge false;
};
if codon_frame_eligible {
if next_anchor.alt.len() != 1 || next_anchor.start != next_anchor.end {
break 'try_merge false;
}
let Some(middle_ref) =
lookup_cds_middle_ref(provider, output.last().unwrap(), prev_a.end + 1)
else {
break 'try_merge false;
};
prev_a.alt.push(middle_ref);
}
prev_a.alt.extend(next_anchor.alt);
prev_a.start = prev_a.start.min(next_anchor.start);
prev_a.end = prev_a.end.max(next_anchor.end);
true
};
if merged {
head_merged = true;
continue;
}
if head_merged {
reconcile_head(&mut output, head_anchor.take().unwrap());
head_merged = false;
}
head_anchor = anchor_for_variant(&next);
output.push(next);
}
if head_merged {
reconcile_head(&mut output, head_anchor.take().unwrap());
}
output
}
fn reconcile_head(output: &mut [HgvsVariant], anchor: Anchor) {
let last = output.last_mut().expect("head must exist when head_merged");
*last = build_merged(last, anchor);
}
fn same_accession_and_kind(a: &HgvsVariant, b: &HgvsVariant) -> bool {
match (a, b) {
(HgvsVariant::Genome(av), HgvsVariant::Genome(bv)) => av.accession == bv.accession,
(HgvsVariant::Cds(av), HgvsVariant::Cds(bv)) => av.accession == bv.accession,
(HgvsVariant::Tx(av), HgvsVariant::Tx(bv)) => av.accession == bv.accession,
(HgvsVariant::Rna(av), HgvsVariant::Rna(bv)) => av.accession == bv.accession,
(HgvsVariant::Mt(av), HgvsVariant::Mt(bv)) => av.accession == bv.accession,
_ => false,
}
}
fn simple_range_for_variant(v: &HgvsVariant) -> Option<(Region, i64, i64)> {
match v {
HgvsVariant::Genome(g) => simple_range_for_loc_edit(&g.loc_edit, simple_genome_range),
HgvsVariant::Cds(c) => simple_range_for_loc_edit(&c.loc_edit, simple_cds_range),
HgvsVariant::Tx(t) => simple_range_for_loc_edit(&t.loc_edit, simple_tx_range),
HgvsVariant::Rna(r) => simple_range_for_loc_edit(&r.loc_edit, simple_rna_range),
HgvsVariant::Mt(m) => simple_range_for_loc_edit(&m.loc_edit, simple_genome_range),
_ => None,
}
}
fn anchor_for_variant(v: &HgvsVariant) -> Option<Anchor> {
match v {
HgvsVariant::Genome(g) => anchor_from_loc_edit(&g.loc_edit, simple_genome_range),
HgvsVariant::Cds(c) => anchor_from_loc_edit(&c.loc_edit, simple_cds_range),
HgvsVariant::Tx(t) => anchor_from_loc_edit(&t.loc_edit, simple_tx_range),
HgvsVariant::Rna(r) => anchor_from_loc_edit(&r.loc_edit, simple_rna_range),
HgvsVariant::Mt(m) => anchor_from_loc_edit(&m.loc_edit, simple_genome_range),
_ => None,
}
}
fn simple_range_for_loc_edit<L>(
loc_edit: &LocEdit<Interval<L>, NaEdit>,
range_fn: impl Fn(&Interval<L>) -> Option<(Region, i64, i64)>,
) -> Option<(Region, i64, i64)> {
if !loc_edit.edit.is_certain() {
return None;
}
let edit = loc_edit.edit.inner()?;
let (region, start, end) = range_fn(&loc_edit.location)?;
match edit {
NaEdit::Substitution { .. }
| NaEdit::SubstitutionNoRef { .. }
| NaEdit::Deletion { .. } => Some((region, start, end)),
NaEdit::Delins { sequence } => {
sequence.bases()?;
Some((region, start, end))
}
NaEdit::Insertion { sequence } => {
sequence.bases()?;
(end == start.checked_add(1)?).then_some((region, end, start))
}
_ => None,
}
}
fn build_merged(head: &HgvsVariant, merged: Anchor) -> HgvsVariant {
match head {
HgvsVariant::Genome(g) => HgvsVariant::Genome(build_genome_merged(g, merged)),
HgvsVariant::Cds(c) => HgvsVariant::Cds(build_cds_merged(c, merged)),
HgvsVariant::Tx(t) => HgvsVariant::Tx(build_tx_merged(t, merged)),
HgvsVariant::Rna(r) => HgvsVariant::Rna(build_rna_merged(r, merged)),
HgvsVariant::Mt(m) => HgvsVariant::Mt(build_mt_merged(m, merged)),
_ => unreachable!("build_merged called with non-NaEdit variant kind"),
}
}
fn anchor_from_loc_edit<L>(
loc_edit: &LocEdit<Interval<L>, NaEdit>,
range_fn: impl Fn(&Interval<L>) -> Option<(Region, i64, i64)>,
) -> Option<Anchor> {
if !loc_edit.edit.is_certain() {
return None;
}
let edit = loc_edit.edit.inner()?;
let (region, start, end) = range_fn(&loc_edit.location)?;
match edit {
NaEdit::Substitution { alternative, .. } | NaEdit::SubstitutionNoRef { alternative } => {
Some(Anchor {
region,
start,
end,
alt: vec![*alternative],
})
}
NaEdit::Deletion { .. } => Some(Anchor {
region,
start,
end,
alt: Vec::new(),
}),
NaEdit::Delins { sequence } => {
let bases = sequence.bases()?.to_vec();
Some(Anchor {
region,
start,
end,
alt: bases,
})
}
NaEdit::Insertion { sequence } => {
if end != start.checked_add(1)? {
return None;
}
let bases = sequence.bases()?.to_vec();
Some(Anchor {
region,
start: end,
end: start,
alt: bases,
})
}
_ => None,
}
}
fn join_pos(
start: Option<(Region, i64)>,
end: Option<(Region, i64)>,
) -> Option<(Region, i64, i64)> {
let (rs, s) = start?;
let (re, e) = end?;
if rs != re {
return None;
}
Some((rs, s, e))
}
fn simple_genome_range(interval: &Interval<GenomePos>) -> Option<(Region, i64, i64)> {
join_pos(
simple_genome_pos(&interval.start),
simple_genome_pos(&interval.end),
)
}
fn simple_genome_pos(boundary: &UncertainBoundary<GenomePos>) -> Option<(Region, i64)> {
let pos = boundary.as_single().and_then(|mu| match mu {
Mu::Certain(p) => Some(p),
_ => None,
})?;
if pos.is_special() || pos.offset.is_some() {
return None;
}
let v = i64::try_from(pos.base).ok()?;
Some((Region::Genome, v))
}
fn simple_cds_range(interval: &Interval<CdsPos>) -> Option<(Region, i64, i64)> {
join_pos(
simple_cds_pos(&interval.start),
simple_cds_pos(&interval.end),
)
}
fn simple_cds_pos(boundary: &UncertainBoundary<CdsPos>) -> Option<(Region, i64)> {
let pos = boundary.as_single().and_then(|mu| match mu {
Mu::Certain(p) => Some(p),
_ => None,
})?;
if pos.is_unknown() || pos.is_intronic() {
return None;
}
if pos.is_3utr() {
if pos.base < 1 {
return None;
}
return Some((Region::ThreePrimeUtr, pos.base));
}
if pos.base < 0 {
return Some((Region::FivePrimeUtr, pos.base));
}
if pos.base > 0 {
return Some((Region::Cds, pos.base));
}
None
}
fn simple_tx_range(interval: &Interval<TxPos>) -> Option<(Region, i64, i64)> {
join_pos(simple_tx_pos(&interval.start), simple_tx_pos(&interval.end))
}
fn simple_tx_pos(boundary: &UncertainBoundary<TxPos>) -> Option<(Region, i64)> {
let pos = boundary.as_single().and_then(|mu| match mu {
Mu::Certain(p) => Some(p),
_ => None,
})?;
if pos.is_intronic() {
return None;
}
if pos.is_downstream() {
if pos.base < 1 {
return None;
}
return Some((Region::TxDownstream, pos.base));
}
if pos.base < 0 {
return Some((Region::TxUpstream, pos.base));
}
if pos.base > 0 {
return Some((Region::Tx, pos.base));
}
None
}
fn simple_rna_range(interval: &Interval<RnaPos>) -> Option<(Region, i64, i64)> {
join_pos(
simple_rna_pos(&interval.start),
simple_rna_pos(&interval.end),
)
}
fn simple_rna_pos(boundary: &UncertainBoundary<RnaPos>) -> Option<(Region, i64)> {
let pos = boundary.as_single().and_then(|mu| match mu {
Mu::Certain(p) => Some(p),
_ => None,
})?;
if pos.is_intronic() {
return None;
}
if pos.is_3utr() {
if pos.base < 1 {
return None;
}
return Some((Region::ThreePrimeUtr, pos.base));
}
if pos.base < 0 {
return Some((Region::FivePrimeUtr, pos.base));
}
if pos.base > 0 {
return Some((Region::Cds, pos.base));
}
None
}
fn build_genome_merged(template: &GenomeVariant, merged: Anchor) -> GenomeVariant {
debug_assert_eq!(merged.region, Region::Genome);
let (location, edit) = build_naedit(merged, |_, b| {
GenomePos::new(u64::try_from(b).expect("genome anchor base must be non-negative"))
});
GenomeVariant {
accession: template.accession.clone(),
gene_symbol: template.gene_symbol.clone(),
loc_edit: LocEdit::new(location, edit),
}
}
fn build_cds_merged(template: &CdsVariant, merged: Anchor) -> CdsVariant {
let (location, edit) = build_naedit(merged, |region, b| match region {
Region::Cds | Region::FivePrimeUtr => CdsPos::new(b),
Region::ThreePrimeUtr => CdsPos {
base: b,
offset: None,
utr3: true,
},
_ => unreachable!("non-c. region {:?} on CdsVariant", region),
});
CdsVariant {
accession: template.accession.clone(),
gene_symbol: template.gene_symbol.clone(),
loc_edit: LocEdit::new(location, edit),
}
}
fn build_tx_merged(template: &TxVariant, merged: Anchor) -> TxVariant {
let (location, edit) = build_naedit(merged, |region, b| match region {
Region::Tx | Region::TxUpstream => TxPos::new(b),
Region::TxDownstream => TxPos::downstream(b),
_ => unreachable!("non-n. region {:?} on TxVariant", region),
});
TxVariant {
accession: template.accession.clone(),
gene_symbol: template.gene_symbol.clone(),
loc_edit: LocEdit::new(location, edit),
}
}
fn build_rna_merged(template: &RnaVariant, merged: Anchor) -> RnaVariant {
let (location, edit) = build_naedit(merged, |region, b| match region {
Region::Cds | Region::FivePrimeUtr => RnaPos::new(b),
Region::ThreePrimeUtr => RnaPos::utr3(b),
_ => unreachable!("non-r. region {:?} on RnaVariant", region),
});
RnaVariant {
accession: template.accession.clone(),
gene_symbol: template.gene_symbol.clone(),
loc_edit: LocEdit::new(location, edit),
}
}
fn build_mt_merged(template: &MtVariant, merged: Anchor) -> MtVariant {
debug_assert_eq!(merged.region, Region::Genome);
let (location, edit) = build_naedit(merged, |_, b| {
GenomePos::new(u64::try_from(b).expect("mitochondrial anchor base must be non-negative"))
});
MtVariant {
accession: template.accession.clone(),
gene_symbol: template.gene_symbol.clone(),
loc_edit: LocEdit::new(location, edit),
}
}
fn same_codon(a: i64, b: i64) -> bool {
if a < 1 || b < 1 {
return false;
}
(a - 1) / 3 == (b - 1) / 3
}
fn lookup_cds_middle_ref<P: ReferenceProvider>(
provider: &P,
head: &HgvsVariant,
cds_axis: i64,
) -> Option<Base> {
let cds_var = match head {
HgvsVariant::Cds(v) => v,
_ => return None,
};
if cds_axis < 1 {
return None;
}
let tx = provider
.get_transcript(&cds_var.accession.transcript_accession())
.ok()?;
let cds_start = tx.cds_start?;
let tx_idx_1based = cds_start.checked_add(cds_axis as u64)?.checked_sub(1)?;
let byte = *tx
.sequence
.as_bytes()
.get(tx_idx_1based.checked_sub(1)? as usize)?;
Base::from_char(byte as char)
}
fn build_naedit<P>(
merged: Anchor,
mut to_pos: impl FnMut(Region, i64) -> P,
) -> (Interval<P>, NaEdit) {
let edit = if merged.start > merged.end {
debug_assert_eq!(
merged.start,
merged.end + 1,
"invariant: insertion anchor span = 1 nt"
);
NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::new(merged.alt)),
}
} else if merged.alt.is_empty() {
NaEdit::Deletion {
sequence: None,
length: None,
}
} else {
NaEdit::Delins {
sequence: InsertedSequence::Literal(Sequence::new(merged.alt)),
}
};
let (lo, hi) = if merged.start > merged.end {
(merged.end, merged.start)
} else {
(merged.start, merged.end)
};
(
Interval::new(to_pos(merged.region, lo), to_pos(merged.region, hi)),
edit,
)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hgvs::variant::Accession;
use crate::reference::MockProvider;
#[test]
fn empty_input_returns_empty() {
assert!(merge_consecutive_edits(vec![], AllelePhase::Cis, &MockProvider::new()).is_empty());
}
#[test]
fn single_input_passes_through() {
let acc = Accession::new("NC", "000001", Some(11));
let v = HgvsVariant::Genome(GenomeVariant {
accession: acc,
gene_symbol: None,
loc_edit: LocEdit::new(
Interval::new(GenomePos::new(100), GenomePos::new(100)),
NaEdit::Substitution {
reference: Base::G,
alternative: Base::A,
},
),
});
let out = merge_consecutive_edits(vec![v], AllelePhase::Cis, &MockProvider::new());
assert_eq!(out.len(), 1);
}
#[test]
fn same_codon_classifies_correctly() {
assert!(same_codon(1, 3));
assert!(same_codon(4, 6));
assert!(same_codon(145, 147));
assert!(!same_codon(3, 5)); assert!(!same_codon(0, 2)); assert!(!same_codon(-3, -1));
}
}