use crate::coords::index_to_hgvs_pos;
use crate::hgvs::edit::NaEdit;
/// Returns `true` when `edit` is a kind of edit this module treats as a
/// normalization candidate.
///
/// Deletions, insertions, duplications, delins, inversions and repeats always
/// qualify. A substitution qualifies only when its reference and alternative
/// bases are equal (a no-op substitution). Every other edit yields `false`.
pub fn needs_normalization(edit: &NaEdit) -> bool {
    match edit {
        NaEdit::Deletion { .. }
        | NaEdit::Insertion { .. }
        | NaEdit::Duplication { .. }
        | NaEdit::Delins { .. }
        | NaEdit::Inversion { .. }
        | NaEdit::Repeat { .. } => true,
        // Only a substitution that changes nothing needs rewriting.
        NaEdit::Substitution {
            reference,
            alternative,
        } => reference == alternative,
        _ => false,
    }
}
/// Reports whether inserting `inserted_seq` at `pos` merely repeats the
/// reference bases that directly flank the insertion point.
///
/// `pos` is used as an index into `ref_seq`: the bases compared are
/// `ref_seq[pos - len..pos]` (immediately upstream) and
/// `ref_seq[pos..pos + len]` (immediately downstream), where `len` is the
/// length of `inserted_seq`. A match on either side returns `true`.
///
/// Returns `false` when the reference or the inserted sequence is empty, or
/// when `pos` lies beyond the end of the reference.
pub fn insertion_is_duplication(ref_seq: &[u8], pos: u64, inserted_seq: &[u8]) -> bool {
    // An empty insertion copies nothing. Without this guard the empty-slice
    // comparisons below would be vacuously true.
    if ref_seq.is_empty() || inserted_seq.is_empty() {
        return false;
    }
    // Compare in u64 space so an oversized position is rejected before the
    // narrowing cast instead of being silently truncated by it.
    if pos > ref_seq.len() as u64 {
        return false;
    }
    let pos_idx = pos as usize; // lossless: pos <= ref_seq.len()
    let ins_len = inserted_seq.len();

    // Upstream copy: only possible when enough bases precede the insertion point.
    let repeats_upstream = pos_idx
        .checked_sub(ins_len)
        .is_some_and(|from| ref_seq[from..pos_idx] == inserted_seq[..]);

    // Downstream copy: `get` yields `None` when the window runs off the end.
    repeats_upstream || ref_seq.get(pos_idx..pos_idx + ins_len) == Some(inserted_seq)
}
/// Coarse edit category reported by `get_canonical_form`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CanonicalForm {
/// Reported for deletions, and also for every edit kind that
/// `get_canonical_form` has no dedicated arm for.
Deletion,
/// Reported for duplications and for literal insertions that
/// `insertion_is_duplication` accepts.
Duplication,
/// Reported for delins edits.
Delins,
/// Reported for insertions that are not recognised as duplications.
Insertion,
/// Repeat form.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// variant; the field meanings are presumed from their names — confirm
/// against the code that builds it.
Repeat {
// presumably the repeated base
base: u8,
// presumably the number of copies
count: u64,
// presumably the first position of the repeat tract
start: u64,
// presumably the last position of the repeat tract
end: u64,
},
}
/// Description of a single-base run in the reference, as produced by
/// `find_homopolymer_at`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepeatAnalysis {
/// `find_homopolymer_at` always sets this to `true`.
pub is_homopolymer: bool,
/// The repeated base; `find_homopolymer_at` always fills it in.
pub base: Option<u8>,
/// Index of the first base of the run (inclusive).
pub ref_start: usize,
/// Index one past the last base of the run (exclusive).
pub ref_end: usize,
/// Number of bases in the run, i.e. `ref_end - ref_start`.
pub ref_count: u64,
}
/// Locates the run of identical bases that contains index `pos`.
///
/// Returns `None` when `pos` is outside `ref_seq` or when the base at `pos`
/// has no identical neighbour (a run of fewer than two bases). Otherwise the
/// result describes the run as a half-open index range
/// `ref_start..ref_end` together with its length.
pub fn find_homopolymer_at(ref_seq: &[u8], pos: usize) -> Option<RepeatAnalysis> {
    let base = *ref_seq.get(pos)?;

    // Count identical bases on each side of `pos`, stopping at the first mismatch.
    let run_before = ref_seq[..pos]
        .iter()
        .rev()
        .take_while(|&&b| b == base)
        .count();
    let run_after = ref_seq[pos + 1..]
        .iter()
        .take_while(|&&b| b == base)
        .count();

    let run_start = pos - run_before;
    let run_end = pos + 1 + run_after;
    let run_len = (run_end - run_start) as u64;

    // A lone base is not a homopolymer.
    (run_len >= 2).then(|| RepeatAnalysis {
        is_homopolymer: true,
        base: Some(base),
        ref_start: run_start,
        ref_end: run_end,
        ref_count: run_len,
    })
}
/// Tries to express an insertion of two or more copies of a unit as an
/// expansion of a tandem repeat already present in the reference.
///
/// The inserted sequence is reduced to its smallest repeating unit. Every
/// rotation of that unit is looked up with `find_tandem_extent`, and the
/// rotation whose reference tract has the most copies wins (the earliest
/// rotation wins ties).
///
/// Returns `None` when the insertion is empty, contains fewer than two copies
/// of its unit, is in a coding context (`is_coding`) with a unit length that
/// is not a multiple of three, or when no rotation matches a reference tract.
///
/// On success the tuple holds, in order: the first byte of the winning unit,
/// the total copy count (reference copies plus inserted copies), the positions
/// of the first and last base of the reference tract as converted by
/// `index_to_hgvs_pos`, and the winning unit itself.
pub fn insertion_to_repeat(
    ref_seq: &[u8],
    pos: u64,
    inserted_seq: &[u8],
    is_coding: bool,
) -> Option<(u8, u64, u64, u64, Vec<u8>)> {
    if inserted_seq.is_empty() {
        return None;
    }
    let motif = smallest_repeat_unit(inserted_seq);
    let motif_len = motif.len();
    let added_copies = (inserted_seq.len() / motif_len) as u64;
    if added_copies < 2 {
        return None;
    }
    if is_coding && !motif_len.is_multiple_of(3) {
        return None;
    }

    // (rotated unit, tract start index, copies in the reference)
    let mut winner: Option<(Vec<u8>, usize, u64)> = None;
    for shift in 0..motif_len {
        let mut candidate = motif.to_vec();
        candidate.rotate_left(shift);
        let Some((tract_start, tract_copies)) =
            find_tandem_extent(ref_seq, pos as usize, &candidate)
        else {
            continue;
        };
        // Strictly-greater comparison keeps the earliest rotation on ties.
        let beats_current = match &winner {
            Some((_, _, best_copies)) => tract_copies > *best_copies,
            None => true,
        };
        if tract_copies > 0 && beats_current {
            winner = Some((candidate, tract_start, tract_copies));
        }
    }

    let (unit, tract_start, tract_copies) = winner?;
    let last_idx = tract_start + tract_copies as usize * unit.len() - 1;
    let leading_base = unit[0];
    Some((
        leading_base,
        tract_copies + added_copies,
        index_to_hgvs_pos(tract_start),
        index_to_hgvs_pos(last_idx),
        unit,
    ))
}
/// Outcome of `insertion_to_duplication`: the reference copy that the
/// insertion duplicates.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct InsToDupResult {
/// The rotation of the inserted unit that matched the reference tract.
pub unit: Vec<u8>,
/// Position of the first base of the last unit copy in the tract, as
/// converted by `index_to_hgvs_pos`.
pub start: u64,
/// Position of the last base of the tract, as converted by
/// `index_to_hgvs_pos`.
pub end: u64,
}
/// Tries to express an insertion of exactly one copy of a unit as a
/// duplication of the last copy of a matching tandem tract in the reference.
///
/// Every rotation of the inserted unit is looked up with
/// `find_tandem_extent`; the rotation whose reference tract has the most
/// copies wins (the earliest rotation wins ties).
///
/// Returns `None` when the insertion is empty, when it consists of more than
/// one copy of its smallest repeating unit, or when no rotation matches a
/// reference tract.
pub(crate) fn insertion_to_duplication(
    ref_seq: &[u8],
    pos: u64,
    inserted_seq: &[u8],
) -> Option<InsToDupResult> {
    if inserted_seq.is_empty() {
        return None;
    }
    let motif = smallest_repeat_unit(inserted_seq);
    // Multi-copy insertions are not single duplications.
    if inserted_seq.len() / motif.len() != 1 {
        return None;
    }

    // (rotated unit, tract start index, copies in the reference)
    let mut winner: Option<(Vec<u8>, usize, u64)> = None;
    for shift in 0..motif.len() {
        let mut candidate = motif.to_vec();
        candidate.rotate_left(shift);
        let Some((tract_start, tract_copies)) =
            find_tandem_extent(ref_seq, pos as usize, &candidate)
        else {
            continue;
        };
        // Strictly-greater comparison keeps the earliest rotation on ties.
        let beats_current = match &winner {
            Some((_, _, best_copies)) => tract_copies > *best_copies,
            None => true,
        };
        if tract_copies > 0 && beats_current {
            winner = Some((candidate, tract_start, tract_copies));
        }
    }

    let (unit, tract_start, tract_copies) = winner?;
    // The duplicated copy is the last unit of the tract.
    let tract_end = tract_start + (tract_copies as usize) * unit.len();
    let start = index_to_hgvs_pos(tract_end - unit.len());
    let end = index_to_hgvs_pos(tract_end - 1);
    Some(InsToDupResult { unit, start, end })
}
/// Returns the shortest prefix of `seq` that reproduces `seq` exactly when
/// repeated a whole number of times.
///
/// When no shorter prefix tiles the sequence, and also for an empty `seq`,
/// the whole input is returned.
pub(crate) fn smallest_repeat_unit(seq: &[u8]) -> &[u8] {
    let total = seq.len();
    (1..=total)
        // Only periods that divide the length can tile it.
        .filter(|&period| total.is_multiple_of(period))
        .map(|period| &seq[..period])
        .find(|candidate| {
            seq.chunks_exact(candidate.len())
                .all(|chunk| chunk == *candidate)
        })
        .unwrap_or(seq)
}
/// Finds the tandem tract of `unit` in `ref_seq` that touches the insertion
/// point `pos + 1`.
///
/// Candidate anchors are every offset from `pos + 1 - unit.len()` (clamped at
/// zero) up to `min(pos + 1, ref_seq.len())`. Each anchor where the reference
/// spells `unit` is grown with `extend_tandem_tract`, and the tract is kept
/// only when the insertion point lies within `start..=end`. Among the kept
/// tracts the one with the most copies is returned (the first found wins
/// ties).
///
/// Returns `(tract start index, copies in the reference)`, or `None` when
/// `unit` is empty, `pos + 1` is not representable, or no tract qualifies.
fn find_tandem_extent(ref_seq: &[u8], pos: usize, unit: &[u8]) -> Option<(usize, u64)> {
    let u = unit.len();
    if u == 0 {
        return None;
    }
    // `pos + 1` overflows for `pos == usize::MAX`: a panic in debug builds and,
    // in release builds, a wrap to 0 that could report a bogus tract at the
    // very start of the reference. Such a position cannot touch any tract.
    let ins_point = pos.checked_add(1)?;
    let lo = ins_point.saturating_sub(u);
    let hi = ins_point.min(ref_seq.len());

    let mut best: Option<(usize, u64)> = None;
    for anchor in lo..=hi {
        // `get` returns `None` when the window would run off the end.
        if ref_seq.get(anchor..anchor + u) != Some(unit) {
            continue;
        }
        let Some(TandemTract {
            start,
            end,
            ref_count: count,
        }) = extend_tandem_tract(ref_seq, anchor..anchor + u, unit)
        else {
            continue;
        };
        // The insertion must fall inside the tract or on one of its edges.
        if !(start..=end).contains(&ins_point) {
            continue;
        }
        // Strictly-greater comparison keeps the first tract on ties.
        if best.is_none_or(|(_, best_count)| count > best_count) {
            best = Some((start, count));
        }
    }
    best
}
/// A run of back-to-back copies of a unit in the reference, as produced by
/// `extend_tandem_tract`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TandemTract {
/// Index of the first base of the tract (inclusive).
pub start: usize,
/// Index one past the last base of the tract (exclusive).
pub end: usize,
/// Number of unit copies in the tract: `(end - start) / unit.len()`.
pub ref_count: u64,
}
/// Grows the tandem tract around `anchor` in both directions, one whole copy
/// of `unit` at a time.
///
/// Returns `None` when `unit` is empty, when `anchor` is inverted or reaches
/// past the end of `ref_seq`, when the anchor length is not a multiple of the
/// unit length, or when the anchored bases are not whole copies of `unit`.
/// A zero-width anchor passes these checks and is simply extended from its
/// position.
pub(crate) fn extend_tandem_tract(
    ref_seq: &[u8],
    anchor: std::ops::Range<usize>,
    unit: &[u8],
) -> Option<TandemTract> {
    let period = unit.len();
    let anchor_usable = period != 0 && anchor.start <= anchor.end && anchor.end <= ref_seq.len();
    if !anchor_usable {
        return None;
    }

    // The anchored region itself must consist of whole copies of the unit.
    let seed = &ref_seq[anchor.start..anchor.end];
    if !seed.len().is_multiple_of(period) {
        return None;
    }
    if seed.chunks_exact(period).any(|chunk| chunk != unit) {
        return None;
    }

    // Whole copies immediately upstream, walking away from the anchor.
    let copies_before = ref_seq[..anchor.start]
        .rchunks_exact(period)
        .take_while(|chunk| *chunk == unit)
        .count();
    // Whole copies immediately downstream.
    let copies_after = ref_seq[anchor.end..]
        .chunks_exact(period)
        .take_while(|chunk| *chunk == unit)
        .count();

    let start = anchor.start - copies_before * period;
    let end = anchor.end + copies_after * period;
    Some(TandemTract {
        start,
        end,
        ref_count: ((end - start) / period) as u64,
    })
}
/// Outcome of `deletion_to_repeat`: a deletion rewritten as a contracted
/// tandem repeat.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct DelToRepeatResult {
/// Smallest repeating unit of the deleted bases.
pub unit: Vec<u8>,
/// Copies of `unit` left in the tract after the deletion.
pub count: u64,
/// Position of the first base of the reference tract, as converted by
/// `index_to_hgvs_pos`.
pub start: u64,
/// Position of the last base of the reference tract, as converted by
/// `index_to_hgvs_pos`.
pub end: u64,
}
/// Tries to express the deletion of `ref_seq[del_start..del_end]` as a
/// contraction of a tandem repeat.
///
/// The deleted bases are reduced to their smallest repeating unit and the
/// surrounding tract is measured with `extend_tandem_tract`. The rewrite
/// applies only when at least two unit copies are removed and at least one
/// copy remains.
///
/// Returns `None` for an empty or out-of-range span, in a coding context
/// (`is_coding`) when the unit length is not a multiple of three, when the
/// tract cannot be extended, when fewer than two copies are removed, or when
/// the deletion removes the whole tract.
pub(crate) fn deletion_to_repeat(
    ref_seq: &[u8],
    del_start: usize,
    del_end: usize,
    is_coding: bool,
) -> Option<DelToRepeatResult> {
    if del_end > ref_seq.len() || del_start >= del_end {
        return None;
    }
    let removed = &ref_seq[del_start..del_end];
    let motif = smallest_repeat_unit(removed);
    let period = motif.len();
    if period == 0 || !removed.len().is_multiple_of(period) {
        return None;
    }
    // Repeat notation is withheld in coding context unless the unit length is
    // a multiple of three.
    if is_coding && !period.is_multiple_of(3) {
        return None;
    }

    let tract = extend_tandem_tract(ref_seq, del_start..del_end, motif)?;
    let removed_copies = (removed.len() / period) as u64;
    if tract.ref_count < 2 || removed_copies < 2 {
        return None;
    }

    match tract.ref_count - removed_copies {
        // Removing the whole tract leaves nothing to describe as a repeat.
        0 => None,
        remaining => Some(DelToRepeatResult {
            unit: motif.to_vec(),
            count: remaining,
            start: index_to_hgvs_pos(tract.start),
            end: index_to_hgvs_pos(tract.end - 1),
        }),
    }
}
/// Returns the complementary nucleotide for `base`.
///
/// `A`, `C`, `G`, `T` and `U` are recognised in either case and always map to
/// an upper-case result (`U` maps to `A`). Any other byte is returned
/// unchanged, in its original case.
fn complement(base: u8) -> u8 {
    match base.to_ascii_uppercase() {
        b'A' => b'T',
        b'T' | b'U' => b'A',
        b'G' => b'C',
        b'C' => b'G',
        // Unrecognised bytes pass through untouched (not upper-cased).
        _ => base,
    }
}
/// Trims an inversion over `ref_seq[start..end]` by dropping outer base pairs
/// that are complementary to each other, since inverting such a pair leaves
/// those bases unchanged.
///
/// Returns the trimmed half-open range, or `None` when the input span is empty
/// or out of bounds, or when fewer than two bases remain after trimming.
pub fn shorten_inversion(ref_seq: &[u8], start: usize, end: usize) -> Option<(usize, usize)> {
    if start >= end || end > ref_seq.len() {
        return None;
    }
    let (mut lo, mut hi) = (start, end);
    // Peel one base from each end for as long as the outermost pair is complementary.
    while lo < hi && complement(ref_seq[lo]) == ref_seq[hi - 1] {
        lo += 1;
        hi -= 1;
    }
    // An inversion needs at least two bases to be meaningful.
    (hi > lo + 1).then_some((lo, hi))
}
/// Simplest edit form that `canonicalize_delins` found for a delins.
///
/// All positions are indices into the reference slice passed to
/// `canonicalize_delins`; `start`/`end` pairs are half-open ranges.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DelinsCanonical {
/// The inserted sequence equals the deleted bases, so nothing changes.
Identity,
/// Exactly one base differs once shared flanking bases are trimmed.
Substitution {
// index of the changed base
position: usize,
// base found in the reference
reference: crate::hgvs::edit::Base,
// base that replaces it
alternative: crate::hgvs::edit::Base,
},
/// After trimming, nothing is left to insert: a plain deletion of `start..end`.
Deletion { start: usize, end: usize },
/// After trimming, nothing is left to delete: a plain insertion.
Insertion {
// index at which the trimmed deletion span collapsed to zero width
after_index: usize,
// bases left to insert after trimming
sequence: Vec<u8>,
},
/// The trimmed insert is the reverse complement of the trimmed deletion;
/// `start..end` is the range returned by `shorten_inversion`.
Inversion { start: usize, end: usize },
/// The inserted sequence is the deleted bases written twice; `start..end`
/// is the original, untrimmed span.
Duplication { start: usize, end: usize },
/// No simpler form applies. Also returned, with the inputs untouched, for an
/// empty or out-of-range span or an empty inserted sequence.
KeepAsDelins {
start: usize,
end: usize,
// bases to insert (trimmed of shared flanks where trimming ran)
sequence: Vec<u8>,
},
}
/// One piece of a delins decomposition produced by `decompose_delins_inv`.
///
/// Positions are indices into the reference slice passed to
/// `decompose_delins_inv`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DelinsSubedit {
/// A single base that differs between the deleted and inserted sequences.
Substitution {
// index of the changed base
position: usize,
// base found in the reference
reference: crate::hgvs::edit::Base,
// base that replaces it
alternative: crate::hgvs::edit::Base,
},
/// A stretch whose replacement is its reverse complement; `start..end` is
/// the half-open range returned by `shorten_inversion`.
Inversion { start: usize, end: usize },
/// A base that is the same in the deleted and inserted sequences.
IdentityAt { position: usize },
}
/// Splits an equal-length delins over `ref_seq[start..end]` into per-position
/// sub-edits when at least one stretch of it is an inversion.
///
/// Scanning left to right, each position takes the widest window (two bases or
/// more) whose replacement is its reverse complement and which
/// `shorten_inversion` accepts; the window is emitted as an inversion and the
/// scan resumes after it. Positions outside any such window become a
/// substitution or an identity marker.
///
/// Returns `None` when the span is empty or out of range, when the inserted
/// sequence differs in length from the span or is shorter than two bases, when
/// a substituted byte is rejected by `Base::from_char`, when no inversion was
/// found, or when the decomposition has fewer than two pieces.
pub fn decompose_delins_inv(
    ref_seq: &[u8],
    start: usize,
    end: usize,
    inserted_seq: &[u8],
) -> Option<Vec<DelinsSubedit>> {
    use crate::hgvs::edit::Base;

    if start >= end || end > ref_seq.len() {
        return None;
    }
    let n = end - start;
    if inserted_seq.len() != n || n < 2 {
        return None;
    }
    let deleted = &ref_seq[start..end];

    let mut pieces: Vec<DelinsSubedit> = Vec::new();
    let mut saw_inversion = false;
    let mut cursor = 0;
    while cursor < n {
        // Try the widest window first; the first hit is the longest inversion
        // starting at `cursor`.
        let widest = (cursor + 2..=n).rev().find_map(|stop| {
            if !is_revcomp(&deleted[cursor..stop], &inserted_seq[cursor..stop]) {
                return None;
            }
            shorten_inversion(ref_seq, start + cursor, start + stop).map(|(s, e)| (stop, s, e))
        });

        match widest {
            Some((stop, s, e)) => {
                pieces.push(DelinsSubedit::Inversion { start: s, end: e });
                saw_inversion = true;
                cursor = stop;
            }
            None => {
                let (was, now) = (deleted[cursor], inserted_seq[cursor]);
                let piece = if was != now {
                    DelinsSubedit::Substitution {
                        position: start + cursor,
                        reference: Base::from_char(was as char)?,
                        alternative: Base::from_char(now as char)?,
                    }
                } else {
                    DelinsSubedit::IdentityAt {
                        position: start + cursor,
                    }
                };
                pieces.push(piece);
                cursor += 1;
            }
        }
    }

    // Only worthwhile when an inversion is combined with at least one other piece.
    (saw_inversion && pieces.len() >= 2).then_some(pieces)
}
/// Reduces a delins that replaces `ref_seq[start..end]` with `inserted_seq` to
/// the simplest equivalent edit form.
///
/// Checks run in this order:
/// 1. An empty or out-of-range span, or an empty insert, is returned untouched
///    as `KeepAsDelins`.
/// 2. An insert equal to the deleted bases is `Identity`.
/// 3. An insert that is the deleted bases written twice is a `Duplication`
///    of the original span.
/// 4. Shared leading and trailing bases are trimmed; the remainder is
///    classified as `Insertion`, `Deletion`, `Substitution`, `Inversion`, or
///    kept as a trimmed `KeepAsDelins`.
pub fn canonicalize_delins(
    ref_seq: &[u8],
    start: usize,
    end: usize,
    inserted_seq: &[u8],
) -> DelinsCanonical {
    use crate::hgvs::edit::Base;

    let span_in_bounds = start < end && end <= ref_seq.len();
    if !span_in_bounds || inserted_seq.is_empty() {
        return DelinsCanonical::KeepAsDelins {
            start,
            end,
            sequence: inserted_seq.to_vec(),
        };
    }

    let deleted = &ref_seq[start..end];
    if deleted == inserted_seq {
        return DelinsCanonical::Identity;
    }

    // Deleted bases written twice: a duplication of the whole span.
    let is_exact_doubling = inserted_seq.len() == 2 * deleted.len()
        && inserted_seq
            .chunks_exact(deleted.len())
            .all(|half| half == deleted);
    if is_exact_doubling {
        return DelinsCanonical::Duplication { start, end };
    }

    // Strip the bases both sequences share at either end.
    let (shared_head, shared_tail) = shared_affix_lengths(deleted, inserted_seq);
    let core_start = start + shared_head;
    let core_end = end - shared_tail;
    let core_insert = &inserted_seq[shared_head..inserted_seq.len() - shared_tail];
    let core_deleted = &ref_seq[core_start..core_end];

    if core_deleted.is_empty() {
        debug_assert!(!core_insert.is_empty(), "Identity case caught above");
        return DelinsCanonical::Insertion {
            after_index: core_start,
            sequence: core_insert.to_vec(),
        };
    }
    if core_insert.is_empty() {
        return DelinsCanonical::Deletion {
            start: core_start,
            end: core_end,
        };
    }

    if let (&[old], &[new]) = (core_deleted, core_insert) {
        // Bytes that `Base::from_char` rejects fall through to `KeepAsDelins`.
        let bases = Base::from_char(old as char).zip(Base::from_char(new as char));
        if let Some((reference, alternative)) = bases {
            return DelinsCanonical::Substitution {
                position: core_start,
                reference,
                alternative,
            };
        }
    }

    let is_inversion = core_deleted.len() >= 2
        && core_insert.len() == core_deleted.len()
        && is_revcomp(core_deleted, core_insert);
    if is_inversion {
        let (s, e) = shorten_inversion(ref_seq, core_start, core_end).expect(
            "revcomp delins cannot collapse to identity under shortening; \
             that case is handled by the Identity / Insertion branches above",
        );
        debug_assert!(e > s + 1, "Inversion interval must contain >=2 bases");
        return DelinsCanonical::Inversion { start: s, end: e };
    }

    DelinsCanonical::KeepAsDelins {
        start: core_start,
        end: core_end,
        sequence: core_insert.to_vec(),
    }
}
/// Measures how many leading and how many trailing bytes `deleted` and
/// `inserted` have in common.
///
/// The prefix is measured first. The suffix may then only use what is left of
/// the shorter sequence, so the two never overlap: their sum is at most
/// `min(deleted.len(), inserted.len())`.
fn shared_affix_lengths(deleted: &[u8], inserted: &[u8]) -> (usize, usize) {
    let budget = deleted.len().min(inserted.len());

    let prefix = deleted
        .iter()
        .zip(inserted)
        .take_while(|(d, i)| d == i)
        .count();

    let suffix = deleted
        .iter()
        .rev()
        .zip(inserted.iter().rev())
        // Bytes already claimed by the prefix cannot be reused.
        .take(budget - prefix)
        .take_while(|(d, i)| d == i)
        .count();

    (prefix, suffix)
}
/// Returns `true` when `inserted` is the reverse complement of `deleted`:
/// same length, and reading `deleted` backwards through `complement` yields
/// `inserted` byte for byte.
fn is_revcomp(deleted: &[u8], inserted: &[u8]) -> bool {
    if deleted.len() != inserted.len() {
        return false;
    }
    let reverse_complement = deleted.iter().rev().map(|&b| complement(b));
    inserted.iter().copied().eq(reverse_complement)
}
/// Outcome of `duplication_to_repeat`.
///
/// All `start`/`end` values are produced by `index_to_hgvs_pos`.
#[derive(Debug, Clone)]
pub enum DupToRepeatResult {
/// The duplicated bases are a single-base run inside a homopolymer tract.
Homopolymer {
// the repeated base
base: u8,
// tract length in the reference plus the number of duplicated bases
count: u64,
// positions of the first and last base of the reference tract
start: u64, end: u64, },
/// The duplicated bases are two or more copies of a multi-base unit.
TandemRepeat {
// the repeating unit
unit: Vec<u8>,
// unit copies counted in the reference plus the copies in the duplication
count: u64,
// positions of the first and last base of the reference tract
start: u64, end: u64, },
/// Produced in coding context in place of repeat notation: always for a
/// single-base run, and for a tandem unit whose length is not a multiple of
/// three. The duplicated bases are reported as an insertion at the end of
/// the tract.
GatedInsertion {
// position of the last base of the reference tract
start: u64,
// `start + 1`
end: u64,
// the duplicated bases
sequence: Vec<u8>,
},
}
/// Tries to express a duplication of `ref_seq[start..end]` as an expansion of
/// a repeat tract.
///
/// `start` and `end` are used as a half-open index range into `ref_seq`.
/// A duplication of two or more identical bases is matched against the
/// surrounding homopolymer. Otherwise the duplicated bases must consist of two
/// or more copies of a shorter unit, and the smallest such unit is matched
/// against the reference with `count_tandem_repeats`.
///
/// In coding context (`is_coding`) a `GatedInsertion` is produced instead of
/// repeat notation for every homopolymer and for tandem units whose length is
/// not a multiple of three.
///
/// Returns `None` for an empty or out-of-range span, and when the duplicated
/// bases are not an internal repeat (a single-copy duplication).
pub fn duplication_to_repeat(
    ref_seq: &[u8],
    start: u64,
    end: u64,
    is_coding: bool,
) -> Option<DupToRepeatResult> {
    let start_idx = start as usize;
    let end_idx = end as usize;
    if start_idx >= ref_seq.len() || end_idx > ref_seq.len() || start_idx >= end_idx {
        return None;
    }
    // Non-empty: the guard above ensures start_idx < end_idx.
    let dup_seq = &ref_seq[start_idx..end_idx];
    let dup_len = dup_seq.len();
    let first = dup_seq[0];

    let is_single_base_run = dup_len >= 2 && dup_seq.iter().all(|&b| b == first);
    if is_single_base_run {
        let tract =
            find_homopolymer_at(ref_seq, start_idx).filter(|found| found.base == Some(first));
        if let Some(tract) = tract {
            let last_tract_idx = tract.ref_start + tract.ref_count as usize - 1;
            let result = if is_coding {
                let flank = index_to_hgvs_pos(last_tract_idx);
                DupToRepeatResult::GatedInsertion {
                    start: flank,
                    end: flank + 1,
                    sequence: dup_seq.to_vec(),
                }
            } else {
                DupToRepeatResult::Homopolymer {
                    base: first,
                    count: tract.ref_count + dup_len as u64,
                    start: index_to_hgvs_pos(tract.ref_start),
                    end: index_to_hgvs_pos(last_tract_idx),
                }
            };
            return Some(result);
        }
    }

    // Candidate unit lengths in ascending order. The upper bound of
    // `dup_len / 2` guarantees at least two copies per candidate.
    let candidate_lengths = (1..=dup_len / 2).filter(|len| dup_len.is_multiple_of(*len));
    for unit_len in candidate_lengths {
        let unit = &dup_seq[..unit_len];
        if dup_seq.chunks_exact(unit_len).any(|chunk| chunk != unit) {
            continue;
        }
        let Some((ref_count, rep_start, rep_end)) = count_tandem_repeats(ref_seq, start_idx, unit)
        else {
            continue;
        };

        if is_coding && !unit_len.is_multiple_of(3) {
            let flank = index_to_hgvs_pos(rep_end - 1);
            return Some(DupToRepeatResult::GatedInsertion {
                start: flank,
                end: flank + 1,
                sequence: dup_seq.to_vec(),
            });
        }
        let copies_in_dup = (dup_len / unit_len) as u64;
        return Some(DupToRepeatResult::TandemRepeat {
            unit: unit.to_vec(),
            count: ref_count + copies_in_dup,
            start: index_to_hgvs_pos(rep_start),
            end: index_to_hgvs_pos(rep_end - 1),
        });
    }
    None
}
/// Counts the back-to-back copies of `repeat_unit` in the tract reached from
/// `pos`.
///
/// The tract start is found by stepping left from `pos` one whole unit at a
/// time while the preceding bases spell the unit; copies are then counted
/// rightwards from that start.
///
/// Returns `(copies, tract_start, tract_end)` with `tract_end` exclusive, or
/// `None` when the unit is empty, `pos` is out of range, a unit starting at
/// `pos` would not fit in `ref_seq`, or no copy is found.
pub fn count_tandem_repeats(
    ref_seq: &[u8],
    pos: usize,
    repeat_unit: &[u8],
) -> Option<(u64, usize, usize)> {
    let period = repeat_unit.len();
    if period == 0 || pos >= ref_seq.len() || pos + period > ref_seq.len() {
        return None;
    }

    // Whole copies immediately upstream of `pos`, walking leftwards.
    let copies_upstream = ref_seq[..pos]
        .rchunks_exact(period)
        .take_while(|chunk| *chunk == repeat_unit)
        .count();
    let tract_start = pos - copies_upstream * period;

    // Whole copies from the tract start, walking rightwards.
    let copies = ref_seq[tract_start..]
        .chunks_exact(period)
        .take_while(|chunk| *chunk == repeat_unit)
        .count();

    (copies > 0).then(|| (copies as u64, tract_start, tract_start + copies * period))
}
/// Outcome of `normalize_repeat`.
///
/// All `start`/`end` values are produced by `index_to_hgvs_pos`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RepeatNormResult {
/// Fewer copies than the reference, in a case where repeat notation is not
/// used: the surplus copies at the end of the tract are deleted.
Deletion {
// positions of the first and last deleted base
start: u64, end: u64, },
/// Exactly one copy more than the reference: the last copy is duplicated.
Duplication {
// positions of the first and last base of the last copy; `sequence` is the canonical unit
start: u64, end: u64, sequence: Vec<u8>,
},
/// Two or more extra copies in coding context with a unit length that is
/// not a multiple of three: reported as an insertion after the tract.
Insertion {
// position of the last base of the reference tract
start: u64,
// `start + 1`
end: u64,
// the extra copies, concatenated
sequence: Vec<u8>,
},
/// Repeat notation over the whole reference tract.
Repeat {
// positions of the first and last base of the tract; `sequence` is the canonical unit
start: u64, end: u64, sequence: Vec<u8>,
// requested number of copies, expressed in canonical units
count: u64,
},
/// The unit is empty, no tract was found, or the requested count equals the
/// reference count.
Unchanged,
}
/// Rewrites a repeat edit (`repeat_unit[specified_count]` at `pos`) into the
/// form that matches how it differs from the reference tract.
///
/// The unit is first reduced to its smallest repeating unit and
/// `specified_count` is rescaled to that unit. The reference tract is then
/// measured with `count_tandem_repeats`, and the result depends on the
/// requested count relative to the reference count:
///
/// * equal — `Unchanged`;
/// * fewer — `Repeat` when at least two copies are removed, at least one
///   remains and the codon gate is open, otherwise a `Deletion` of the
///   surplus copies at the end of the tract;
/// * exactly one more — `Duplication` of the last copy;
/// * two or more extra — `Insertion` after the tract when the codon gate is
///   closed, otherwise `Repeat`.
///
/// The codon gate is closed when `is_coding` is set and the canonical unit
/// length is not a multiple of three.
///
/// `Unchanged` is also returned when the unit is empty, when no tract is
/// found, or when the rescaled count does not fit in a `u64`.
pub fn normalize_repeat(
    ref_seq: &[u8],
    pos: usize,
    repeat_unit: &[u8],
    specified_count: u64,
    is_coding: bool,
) -> RepeatNormResult {
    if repeat_unit.is_empty() {
        return RepeatNormResult::Unchanged;
    }
    let canonical_unit = smallest_repeat_unit(repeat_unit);
    let copies_per_input_unit = (repeat_unit.len() / canonical_unit.len()) as u64;
    // A plain multiplication wraps in release builds. A huge requested count
    // could wrap to a small one (even 0) and be rewritten as a deletion, so an
    // unrepresentable count leaves the edit untouched instead.
    let Some(specified_count) = specified_count.checked_mul(copies_per_input_unit) else {
        return RepeatNormResult::Unchanged;
    };
    let Some((ref_count, ref_start, ref_end)) = count_tandem_repeats(ref_seq, pos, canonical_unit)
    else {
        return RepeatNormResult::Unchanged;
    };
    let unit_len = canonical_unit.len();
    let codon_blocks_repeat = is_coding && !unit_len.is_multiple_of(3);

    if specified_count < ref_count {
        let k = ref_count - specified_count;
        if k >= 2 && specified_count >= 1 && !codon_blocks_repeat {
            RepeatNormResult::Repeat {
                start: index_to_hgvs_pos(ref_start),
                end: index_to_hgvs_pos(ref_end - 1),
                sequence: canonical_unit.to_vec(),
                count: specified_count,
            }
        } else {
            // Remove the surplus copies from the end of the tract.
            let del_len = (k as usize) * unit_len;
            RepeatNormResult::Deletion {
                start: index_to_hgvs_pos(ref_end - del_len),
                end: index_to_hgvs_pos(ref_end - 1),
            }
        }
    } else if specified_count == ref_count + 1 {
        // One extra copy is a duplication of the last copy in the tract.
        RepeatNormResult::Duplication {
            start: index_to_hgvs_pos(ref_end - unit_len),
            end: index_to_hgvs_pos(ref_end - 1),
            sequence: canonical_unit.to_vec(),
        }
    } else if specified_count == ref_count {
        RepeatNormResult::Unchanged
    } else if codon_blocks_repeat {
        let added_copies = specified_count - ref_count;
        let flank_left = index_to_hgvs_pos(ref_end - 1);
        RepeatNormResult::Insertion {
            start: flank_left,
            end: flank_left + 1,
            sequence: canonical_unit.repeat(added_copies as usize),
        }
    } else {
        RepeatNormResult::Repeat {
            start: index_to_hgvs_pos(ref_start),
            end: index_to_hgvs_pos(ref_end - 1),
            sequence: canonical_unit.to_vec(),
            count: specified_count,
        }
    }
}
/// Classifies `edit` into a coarse `CanonicalForm`.
///
/// A literal insertion that `insertion_is_duplication` accepts at `start` is
/// reported as a duplication; any other insertion stays an insertion.
/// Deletions, delins and duplications map to their own form. `_end` is not
/// used.
///
/// NOTE(review): every other edit kind is reported as
/// `CanonicalForm::Deletion` — confirm that this fallback is intended.
pub fn get_canonical_form(edit: &NaEdit, ref_seq: &[u8], start: u64, _end: u64) -> CanonicalForm {
    use crate::hgvs::edit::InsertedSequence;

    match edit {
        NaEdit::Insertion { sequence } => {
            // Only literal sequences can be compared with the reference.
            let repeats_flank = match sequence {
                InsertedSequence::Literal(seq) => {
                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
                    insertion_is_duplication(ref_seq, start, &seq_bytes)
                }
                _ => false,
            };
            if repeats_flank {
                CanonicalForm::Duplication
            } else {
                CanonicalForm::Insertion
            }
        }
        NaEdit::Duplication { .. } => CanonicalForm::Duplication,
        NaEdit::Delins { .. } => CanonicalForm::Delins,
        NaEdit::Deletion { .. } => CanonicalForm::Deletion,
        _ => CanonicalForm::Deletion,
    }
}
/// Returns a copy of `edit` with redundant detail removed.
///
/// * Deletions and duplications lose their explicit sequence and length
///   (a duplication keeps its `uncertain_extent`).
/// * Delins edits lose their explicit deleted sequence and deleted length but
///   keep the inserted sequence.
/// * A substitution whose reference equals its alternative becomes
///   `NaEdit::position_identity()`.
/// * Every other edit is cloned unchanged.
pub fn canonicalize_edit(edit: &NaEdit) -> NaEdit {
    if let NaEdit::Substitution {
        reference,
        alternative,
    } = edit
    {
        if reference == alternative {
            return NaEdit::position_identity();
        }
    }

    // Start from a full copy and blank out the redundant fields in place.
    let mut canonical = edit.clone();
    match &mut canonical {
        NaEdit::Deletion { sequence, length } => {
            *sequence = None;
            *length = None;
        }
        NaEdit::Duplication {
            sequence, length, ..
        } => {
            *sequence = None;
            *length = None;
        }
        NaEdit::Delins {
            deleted,
            deleted_length,
            ..
        } => {
            *deleted = None;
            *deleted_length = None;
        }
        _ => {}
    }
    canonical
}
/// Returns `true` when `edit` carries redundant detail of the kinds checked
/// here.
///
/// That is: a deletion or duplication with an explicit sequence or length, or
/// a substitution whose reference equals its alternative.
///
/// NOTE(review): delins edits always return `false` here, although
/// `canonicalize_edit` does strip their `deleted` / `deleted_length` fields —
/// confirm that this asymmetry is intended.
pub fn should_canonicalize(edit: &NaEdit) -> bool {
    match edit {
        // Nothing explicit is attached, so the edit is already in its bare form.
        NaEdit::Deletion {
            sequence: None,
            length: None,
        } => false,
        NaEdit::Deletion { .. } => true,
        NaEdit::Duplication {
            sequence: None,
            length: None,
            ..
        } => false,
        NaEdit::Duplication { .. } => true,
        NaEdit::Substitution {
            reference,
            alternative,
        } => reference == alternative,
        _ => false,
    }
}
/// Rewrites a conversion edit as a delins with the same source.
///
/// A source of the form `<digits>_<digits>` (split at the first `_`) becomes
/// an `InsertedSequence::PositionRange`, provided both numbers parse as `u64`,
/// both are at least 1, and the first does not exceed the second. Any other
/// source is carried over as an `InsertedSequence::Reference`.
///
/// Returns `None` when `edit` is not a conversion.
pub fn canonicalize_conversion_to_delins(edit: &NaEdit) -> Option<NaEdit> {
    use crate::hgvs::edit::InsertedSequence;

    let NaEdit::Conversion { source } = edit else {
        return None;
    };

    let all_digits = |text: &str| !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit());
    let numeric_range = source
        .split_once('_')
        .filter(|&(lo, hi)| all_digits(lo) && all_digits(hi))
        // Digit strings too long for u64 fail to parse and fall back to `Reference`.
        .and_then(|(lo, hi)| Some((lo.parse::<u64>().ok()?, hi.parse::<u64>().ok()?)))
        .filter(|&(start, end)| start >= 1 && end >= 1 && start <= end);

    let sequence = match numeric_range {
        Some((start, end)) => InsertedSequence::PositionRange { start, end },
        None => InsertedSequence::Reference(source.clone()),
    };
    Some(NaEdit::Delins {
        sequence,
        deleted: None,
        deleted_length: None,
    })
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_needs_normalization() {
use crate::hgvs::edit::Base;
assert!(needs_normalization(&NaEdit::Deletion {
sequence: None,
length: None,
}));
assert!(needs_normalization(&NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
}));
assert!(needs_normalization(&NaEdit::Inversion {
sequence: None,
length: None,
}));
assert!(!needs_normalization(&NaEdit::Substitution {
reference: Base::A,
alternative: Base::G,
}));
assert!(needs_normalization(&NaEdit::Substitution {
reference: Base::A,
alternative: Base::A,
}));
}
#[test]
fn test_insertion_is_duplication() {
let ref_seq = b"ATGATGATG";
assert!(insertion_is_duplication(ref_seq, 3, b"ATG"));
assert!(!insertion_is_duplication(ref_seq, 3, b"TGA"));
assert!(insertion_is_duplication(ref_seq, 6, b"ATG"));
}
#[test]
fn test_insertion_is_duplication_pos_beyond_ref() {
let ref_seq = b"ATGATGATG";
assert!(!insertion_is_duplication(ref_seq, 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 0, b"A"));
}
#[test]
fn test_deletion_stays_deletion() {
let ref_seq = b"ATGATGATG";
let del_edit = NaEdit::Deletion {
sequence: None,
length: None,
};
assert_eq!(
get_canonical_form(&del_edit, ref_seq, 3, 6),
CanonicalForm::Deletion
);
}
#[test]
fn test_canonicalize_delins() {
use crate::hgvs::edit::Base;
let ref_seq = b"ACGTACGT";
assert!(matches!(
canonicalize_delins(ref_seq, 3, 4, b"T"),
DelinsCanonical::Identity
));
assert!(matches!(
canonicalize_delins(ref_seq, 1, 4, b"CGT"),
DelinsCanonical::Identity
));
assert!(matches!(
canonicalize_delins(ref_seq, 0, 8, b"ACGTACGT"),
DelinsCanonical::Identity
));
assert!(matches!(
canonicalize_delins(ref_seq, 3, 4, b"A"),
DelinsCanonical::Substitution {
position: 3,
reference: Base::T,
alternative: Base::A,
}
));
assert!(matches!(
canonicalize_delins(b"A", 0, 1, b"T"),
DelinsCanonical::Substitution {
position: 0,
reference: Base::A,
alternative: Base::T,
}
));
assert!(matches!(
canonicalize_delins(b"CTAG", 0, 4, b"TTAG"),
DelinsCanonical::Substitution {
position: 0,
reference: Base::C,
alternative: Base::T,
}
));
assert!(matches!(
canonicalize_delins(b"CTAG", 0, 4, b"CTAA"),
DelinsCanonical::Substitution {
position: 3,
reference: Base::G,
alternative: Base::A,
}
));
assert!(matches!(
canonicalize_delins(b"NACGN", 1, 4, b"ATG"),
DelinsCanonical::Substitution {
position: 2,
reference: Base::C,
alternative: Base::T,
}
));
assert!(matches!(
canonicalize_delins(b"NGTAN", 1, 4, b"GTT"),
DelinsCanonical::Substitution {
position: 3,
reference: Base::A,
alternative: Base::T,
}
));
assert!(matches!(
canonicalize_delins(b"AAAA", 0, 4, b"AAAT"),
DelinsCanonical::Substitution {
position: 3,
reference: Base::A,
alternative: Base::T,
}
));
assert!(matches!(
canonicalize_delins(b"ACGT", 0, 4, b"AT"),
DelinsCanonical::Deletion { start: 1, end: 3 }
));
assert!(matches!(
canonicalize_delins(b"ACT", 0, 3, b"ACGT"),
DelinsCanonical::Insertion {
after_index: 2,
ref sequence,
} if sequence == b"G"
));
assert!(matches!(
canonicalize_delins(b"CTA", 0, 3, b"TAG"),
DelinsCanonical::Inversion { start: 0, end: 3 }
));
assert!(matches!(
canonicalize_delins(b"CTATG", 0, 5, b"CATAG"),
DelinsCanonical::Inversion { start: 1, end: 4 }
));
assert!(matches!(
canonicalize_delins(b"ACGAGT", 0, 6, b"ACTCGT"),
DelinsCanonical::Inversion { start, end } if start < end
));
assert!(matches!(
canonicalize_delins(b"ATAT", 0, 4, b"ATAT"),
DelinsCanonical::Identity
));
assert!(matches!(
canonicalize_delins(b"GATG", 1, 2, b"AA"),
DelinsCanonical::Duplication { start: 1, end: 2 }
));
assert!(matches!(
canonicalize_delins(b"NATGCN", 1, 4, b"ATGATG"),
DelinsCanonical::Duplication { start: 1, end: 4 }
));
assert!(matches!(
canonicalize_delins(b"AAGC", 0, 4, b"TTCG"),
DelinsCanonical::KeepAsDelins {
start: 0,
end: 4,
ref sequence,
} if sequence == b"TTCG"
));
assert!(matches!(
canonicalize_delins(b"AAGC", 0, 4, b"CGAA"),
DelinsCanonical::KeepAsDelins {
start: 0,
end: 4,
ref sequence,
} if sequence == b"CGAA"
));
assert!(matches!(
canonicalize_delins(b"AAGC", 0, 4, b"GGG"),
DelinsCanonical::KeepAsDelins {
start: 0,
end: 4,
ref sequence,
} if sequence == b"GGG"
));
assert!(matches!(
canonicalize_delins(b"AGGCT", 0, 5, b"AAACT"),
DelinsCanonical::KeepAsDelins {
start: 1,
end: 3,
ref sequence,
} if sequence == b"AA"
));
assert!(matches!(
canonicalize_delins(b"AAGC", 0, 1, b""),
DelinsCanonical::KeepAsDelins {
start: 0,
end: 1,
ref sequence,
} if sequence.is_empty()
));
assert!(matches!(
canonicalize_delins(b"AAGC", 2, 2, b"X"),
DelinsCanonical::KeepAsDelins {
start: 2,
end: 2,
ref sequence,
} if sequence == b"X"
));
assert!(matches!(
canonicalize_delins(b"AAGC", 3, 5, b"X"),
DelinsCanonical::KeepAsDelins {
start: 3,
end: 5,
ref sequence,
} if sequence == b"X"
));
}
#[test]
fn test_insertion_becomes_dup() {
use crate::hgvs::edit::{InsertedSequence, Sequence};
use std::str::FromStr;
let ref_seq = b"ATGATGATG";
let ins_edit = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("ATG").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit, ref_seq, 3, 3),
CanonicalForm::Duplication
);
let ins_edit2 = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("TGA").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit2, ref_seq, 3, 3),
CanonicalForm::Insertion
);
}
#[test]
fn test_find_homopolymer_at() {
let ref_seq = b"GGGAAAAAGGG";
let result = find_homopolymer_at(ref_seq, 4);
assert!(result.is_some());
let analysis = result.unwrap();
assert!(analysis.is_homopolymer);
assert_eq!(analysis.base, Some(b'A'));
assert_eq!(analysis.ref_start, 3);
assert_eq!(analysis.ref_end, 8); assert_eq!(analysis.ref_count, 5);
let result = find_homopolymer_at(ref_seq, 1);
assert!(result.is_some());
let analysis = result.unwrap();
assert_eq!(analysis.base, Some(b'G'));
assert_eq!(analysis.ref_count, 3);
let single_seq = b"ATGC";
assert!(find_homopolymer_at(single_seq, 0).is_none());
}
#[test]
fn test_insertion_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = insertion_to_repeat(ref_seq, 7, b"AA", false);
assert!(result.is_some());
let (base, count, start, end, _unit) = result.unwrap();
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); assert_eq!(end, 8);
let result = insertion_to_repeat(ref_seq, 7, b"A", false);
assert!(result.is_none());
let result = insertion_to_repeat(ref_seq, 7, b"T", false);
assert!(result.is_none());
let result = insertion_to_repeat(ref_seq, 7, b"AT", false);
assert!(result.is_none());
let ref_ac = b"ACACGGG";
let result = insertion_to_repeat(ref_ac, 3, b"ACAC", false);
assert!(result.is_some());
let (base, count, start, end, unit) = result.unwrap();
assert_eq!(base, b'A');
assert_eq!(count, 4);
assert_eq!(start, 1); assert_eq!(end, 4); assert_eq!(unit, b"AC");
}
#[test]
fn test_duplication_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = duplication_to_repeat(ref_seq, 3, 5, false);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::Homopolymer {
base, count, start, ..
} => {
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); }
_ => panic!("Expected Homopolymer result"),
}
let non_repeat_seq = b"ATGCXYZ";
let result = duplication_to_repeat(non_repeat_seq, 0, 3, false);
assert!(result.is_none());
}
#[test]
fn test_duplication_to_tandem_repeat() {
let ref_seq = b"AAAAAGCAGCAGCAGCAGCAGCAGCAGCAAAAA";
let result = duplication_to_repeat(ref_seq, 5, 8, false);
assert!(result.is_none(), "Single-copy dup should not become repeat");
let result = duplication_to_repeat(ref_seq, 5, 11, false);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::TandemRepeat {
unit,
count,
start,
end,
} => {
assert_eq!(unit, b"GCA");
assert_eq!(count, 10); assert_eq!(start, 6); assert_eq!(end, 29); }
_ => panic!("Expected TandemRepeat result"),
}
}
#[test]
fn test_count_tandem_repeats_basic() {
let ref_seq = b"GGGCATCATCATGGG";
let result = count_tandem_repeats(ref_seq, 3, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
let result = count_tandem_repeats(ref_seq, 6, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
}
#[test]
fn test_count_tandem_repeats_single_base() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"A");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 6);
assert_eq!(start, 3);
assert_eq!(end, 9);
}
#[test]
fn test_count_tandem_repeats_no_match() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"XYZ");
assert!(result.is_none());
}
#[test]
fn test_normalize_repeat_to_deletion() {
let ref_seq = b"GGGCATCATCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 1, false);
match result {
RepeatNormResult::Repeat {
sequence, count, ..
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(count, 1, "Should reflect specified count of 1");
}
_ => panic!("Expected Repeat (B2), got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_to_duplication() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 3, false);
match result {
RepeatNormResult::Duplication {
start,
end,
sequence,
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(end - start + 1, 3, "Should duplicate 3 bases (1 CAT)");
}
_ => panic!("Expected Duplication, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_stays_repeat() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 5, false);
match result {
RepeatNormResult::Repeat {
count, sequence, ..
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(count, 5);
}
_ => panic!("Expected Repeat, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_unchanged() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 2, false);
assert!(matches!(result, RepeatNormResult::Unchanged));
}
#[test]
fn test_normalize_repeat_empty_unit_is_unchanged() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"", 1, false);
assert!(matches!(result, RepeatNormResult::Unchanged));
}
#[test]
fn test_normalize_repeat_canonicalizes_non_minimal_unit() {
let ref_seq = b"GGGATATATATGGG";
let result = normalize_repeat(ref_seq, 3, b"ATAT", 1, false);
match result {
RepeatNormResult::Repeat {
start,
end,
sequence,
count,
} => {
assert_eq!(sequence, b"AT", "Should emit canonical (smallest) unit");
assert_eq!(count, 2, "Specified ATAT[1] = 2 canonical AT copies");
assert_eq!(start, 4, "Canonical tract starts at HGVS pos 4");
assert_eq!(end, 11, "Canonical tract ends at HGVS pos 11");
}
_ => panic!("Expected canonical AT[2] Repeat, got {:?}", result),
}
}
#[test]
fn test_complement() {
assert_eq!(complement(b'A'), b'T');
assert_eq!(complement(b'T'), b'A');
assert_eq!(complement(b'G'), b'C');
assert_eq!(complement(b'C'), b'G');
assert_eq!(complement(b'N'), b'N'); }
#[test]
fn test_shorten_inversion_basic() {
let seq = b"ATGCAT";
let result = shorten_inversion(seq, 0, 6);
assert!(
result.is_none(),
"Fully complementary inversion should become identity"
);
}
#[test]
fn test_shorten_inversion_partial() {
let seq = b"ATGGAT";
let result = shorten_inversion(seq, 0, 6);
assert!(result.is_some());
let (s, e) = result.unwrap();
assert_eq!(s, 2);
assert_eq!(e, 4);
}
#[test]
fn test_shorten_inversion_no_change() {
let seq = b"GGCC";
let result = shorten_inversion(seq, 0, 4);
assert!(result.is_none());
let seq2 = b"GATT";
let result2 = shorten_inversion(seq2, 0, 4);
assert!(result2.is_some());
let (s, e) = result2.unwrap();
assert_eq!(s, 0);
assert_eq!(e, 4);
}
#[test]
fn test_extend_tandem_tract_homopolymer() {
let ref_seq = b"TTAAAATT";
let tract = extend_tandem_tract(ref_seq, 2..4, b"A").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 6);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_multi_base_unit() {
let ref_seq = b"TTGCAGCAGCATT";
let tract = extend_tandem_tract(ref_seq, 5..8, b"GCA").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 11);
assert_eq!(tract.ref_count, 3);
}
#[test]
fn test_extend_tandem_tract_anchor_at_5prime_edge() {
let ref_seq = b"AAAATT";
let tract = extend_tandem_tract(ref_seq, 0..2, b"A").unwrap();
assert_eq!(tract.start, 0);
assert_eq!(tract.end, 4);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_anchor_at_3prime_edge() {
let ref_seq = b"TTAAAA";
let tract = extend_tandem_tract(ref_seq, 4..6, b"A").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 6);
assert_eq!(tract.ref_count, 4);
}
/// An empty anchor (4..4) with unit `A` inside `TTAAAATT` must still
/// produce the tract 2..6 with a reference count of 4.
#[test]
fn test_extend_tandem_tract_zero_width_anchor() {
    let extended = extend_tandem_tract(b"TTAAAATT", 4..4, b"A").unwrap();
    assert_eq!(extended.start, 2);
    assert_eq!(extended.end, 6);
    assert_eq!(extended.ref_count, 4);
}
/// Anchor 2..4 of `TTAGAATT` covers `AG`, which is not a run of unit `A`,
/// so `extend_tandem_tract` must return `None`.
#[test]
fn test_extend_tandem_tract_anchor_not_unit_periodic() {
    let outcome = extend_tandem_tract(b"TTAGAATT", 2..4, b"A");
    assert!(outcome.is_none());
}
/// A 3-base anchor (0..3 of `AAA`) combined with the 2-base unit `AA` must
/// be rejected with `None`.
#[test]
fn test_extend_tandem_tract_anchor_length_not_multiple_of_unit() {
    let outcome = extend_tandem_tract(b"AAA", 0..3, b"AA");
    assert!(outcome.is_none());
}
/// `deletion_to_repeat(b"TTAAAAATT", 5, 7, false)` must produce unit `A`,
/// count 3, start 3 and end 7.
#[test]
fn test_deletion_to_repeat_homopolymer_two_removed() {
    let rewrite = deletion_to_repeat(b"TTAAAAATT", 5, 7, false).expect("should fire");
    assert_eq!(rewrite.unit, b"A");
    assert_eq!(rewrite.count, 3);
    assert_eq!(rewrite.start, 3);
    assert_eq!(rewrite.end, 7);
}
/// `deletion_to_repeat(b"TTGCAGCAGCATT", 5, 11, false)` must produce unit
/// `GCA`, count 1, start 3 and end 11.
#[test]
fn test_deletion_to_repeat_multi_base_tandem_two_removed() {
    let rewrite = deletion_to_repeat(b"TTGCAGCAGCATT", 5, 11, false).expect("should fire");
    assert_eq!(rewrite.unit, b"GCA");
    assert_eq!(rewrite.count, 1);
    assert_eq!(rewrite.start, 3);
    assert_eq!(rewrite.end, 11);
}
/// `deletion_to_repeat(b"TTAAAAATT", 6, 7, false)` must return `None`.
#[test]
fn test_deletion_to_repeat_one_unit_removed_returns_none() {
    let outcome = deletion_to_repeat(b"TTAAAAATT", 6, 7, false);
    assert!(outcome.is_none());
}
/// `deletion_to_repeat(b"TTAATT", 2, 4, false)` must return `None`.
#[test]
fn test_deletion_to_repeat_full_tract_removal_returns_none() {
    let outcome = deletion_to_repeat(b"TTAATT", 2, 4, false);
    assert!(outcome.is_none());
}
/// `deletion_to_repeat(b"TTGCATT", 2, 5, false)` must return `None`.
#[test]
fn test_deletion_to_repeat_non_tandem_returns_none() {
    let outcome = deletion_to_repeat(b"TTGCATT", 2, 5, false);
    assert!(outcome.is_none());
}
/// `deletion_to_repeat(b"TTATATATATATT", 8, 12, false)` must report the
/// 2-base unit `AT` with count 3, start 3 and end 12.
#[test]
fn test_deletion_to_repeat_finer_periodicity() {
    let rewrite = deletion_to_repeat(b"TTATATATATATT", 8, 12, false).expect("should fire");
    assert_eq!(rewrite.unit, b"AT");
    assert_eq!(rewrite.count, 3);
    assert_eq!(rewrite.start, 3);
    assert_eq!(rewrite.end, 12);
}
/// A deletion carrying either an explicit length or an explicit sequence
/// must canonicalize to a deletion with both fields set to `None`.
#[test]
fn test_canonicalize_deletion_with_length() {
    use crate::hgvs::edit::Sequence;
    use std::str::FromStr;

    // True when the canonical form of `edit` is a bare deletion.
    let canonical_is_bare = |edit: &NaEdit| {
        matches!(
            canonicalize_edit(edit),
            NaEdit::Deletion {
                sequence: None,
                length: None
            }
        )
    };

    let with_length = NaEdit::Deletion {
        sequence: None,
        length: Some(12),
    };
    assert!(canonical_is_bare(&with_length));

    let with_sequence = NaEdit::Deletion {
        sequence: Some(Sequence::from_str("ATG").unwrap()),
        length: None,
    };
    assert!(canonical_is_bare(&with_sequence));
}
/// A duplication carrying either an explicit length or an explicit
/// sequence must canonicalize to a duplication whose `sequence` and
/// `length` are both `None`.
#[test]
fn test_canonicalize_duplication_with_length() {
    use crate::hgvs::edit::Sequence;
    use std::str::FromStr;

    // True when the canonical form of `edit` is a duplication without
    // sequence or length (other fields are not inspected).
    let canonical_is_bare = |edit: &NaEdit| {
        matches!(
            canonicalize_edit(edit),
            NaEdit::Duplication {
                sequence: None,
                length: None,
                ..
            }
        )
    };

    let with_length = NaEdit::Duplication {
        sequence: None,
        length: Some(12),
        uncertain_extent: None,
    };
    assert!(canonical_is_bare(&with_length));

    let with_sequence = NaEdit::Duplication {
        sequence: Some(Sequence::from_str("ATG").unwrap()),
        length: None,
        uncertain_extent: None,
    };
    assert!(canonical_is_bare(&with_sequence));
}
/// `should_canonicalize` must be true for a deletion with a length and for
/// an A>A substitution, and false for a bare deletion and an A>G
/// substitution.
#[test]
fn test_should_canonicalize() {
    use crate::hgvs::edit::Base;

    let sized_deletion = NaEdit::Deletion {
        sequence: None,
        length: Some(12),
    };
    assert!(should_canonicalize(&sized_deletion));

    let bare_deletion = NaEdit::Deletion {
        sequence: None,
        length: None,
    };
    assert!(!should_canonicalize(&bare_deletion));

    let real_substitution = NaEdit::Substitution {
        reference: Base::A,
        alternative: Base::G,
    };
    assert!(!should_canonicalize(&real_substitution));

    let degenerate_substitution = NaEdit::Substitution {
        reference: Base::A,
        alternative: Base::A,
    };
    assert!(should_canonicalize(&degenerate_substitution));
}
/// An A>A substitution must canonicalize to `NaEdit::position_identity()`,
/// while a genuine A>G substitution must come back unchanged.
#[test]
fn test_canonicalize_edit_degenerate_substitution_to_identity() {
    use crate::hgvs::edit::Base;

    let degenerate = NaEdit::Substitution {
        reference: Base::A,
        alternative: Base::A,
    };
    // The argument is a plain borrow of `degenerate`; the source previously
    // held a mangled `°enerate` token here (an HTML-entity decoding of
    // `&deg`), which is not valid Rust.
    assert_eq!(canonicalize_edit(&degenerate), NaEdit::position_identity());

    let real_sub = NaEdit::Substitution {
        reference: Base::A,
        alternative: Base::G,
    };
    assert_eq!(canonicalize_edit(&real_sub), real_sub);
}
/// With `is_coding = true`, inserting `AA` at 5 in `CAAAAAC` must return
/// `None`.
#[test]
fn test_insertion_to_repeat_codon_frame_gate_blocks_a_in_coding() {
    let outcome = insertion_to_repeat(b"CAAAAAC", 5, b"AA", true);
    assert!(
        outcome.is_none(),
        "is_coding=true + unit_len=1 must return None"
    );
}
/// With `is_coding = true`, inserting `ATAT` at 6 in `CATATATC` must
/// return `None`.
#[test]
fn test_insertion_to_repeat_codon_frame_gate_blocks_at_in_coding() {
    let outcome = insertion_to_repeat(b"CATATATC", 6, b"ATAT", true);
    assert!(
        outcome.is_none(),
        "is_coding=true + unit_len=2 must return None"
    );
}
/// With `is_coding = true`, inserting `CAGCAG` at 9 in `CCAGCAGCAGT` must
/// succeed with a count of 5 and the unit `CAG`.
#[test]
fn test_insertion_to_repeat_codon_frame_gate_passes_cag_in_coding() {
    let outcome = insertion_to_repeat(b"CCAGCAGCAGT", 9, b"CAGCAG", true);
    assert!(
        outcome.is_some(),
        "is_coding=true + unit_len=3 must allow rewrite"
    );
    // Tuple layout: element 1 is the count, element 4 is the repeat unit.
    let rewrite = outcome.unwrap();
    assert_eq!(rewrite.1, 5, "expected CAG[5]");
    assert_eq!(rewrite.4, b"CAG");
}
/// With `is_coding = false`, inserting `AA` at 5 in `CAAAAAC` must return
/// `Some`.
#[test]
fn test_insertion_to_repeat_gate_no_op_in_genomic() {
    let outcome = insertion_to_repeat(b"CAAAAAC", 5, b"AA", false);
    assert!(outcome.is_some(), "is_coding=false must not gate");
}
/// With `is_coding = true`, `deletion_to_repeat(b"CAAAAAC", 2, 4, ..)`
/// must return `None`.
#[test]
fn test_deletion_to_repeat_codon_frame_gate_blocks_a_in_coding() {
    let outcome = deletion_to_repeat(b"CAAAAAC", 2, 4, true);
    assert!(
        outcome.is_none(),
        "is_coding=true + unit_len=1 must return None"
    );
}
/// With `is_coding = true`, `deletion_to_repeat(b"CCAGCAGCAGT", 1, 7, ..)`
/// must return `Some`.
#[test]
fn test_deletion_to_repeat_codon_frame_gate_passes_cag_in_coding() {
    let outcome = deletion_to_repeat(b"CCAGCAGCAGT", 1, 7, true);
    assert!(
        outcome.is_some(),
        "is_coding=true + unit_len=3 must allow rewrite"
    );
}
/// With `is_coding = true`, `duplication_to_repeat(b"CAAAAC", 1, 3, ..)`
/// must yield `DupToRepeatResult::GatedInsertion` carrying `AA`.
#[test]
fn test_duplication_to_repeat_codon_frame_gate_routes_a_to_gated_insertion() {
    let outcome = duplication_to_repeat(b"CAAAAC", 1, 3, true);
    match outcome {
        Some(DupToRepeatResult::GatedInsertion {
            sequence: literal, ..
        }) => assert_eq!(literal, b"AA", "sequence is the duplicated literal"),
        unexpected => panic!("expected GatedInsertion, got {:?}", unexpected),
    }
}
/// With `is_coding = true`, `duplication_to_repeat(b"CCAGCAGCAGT", 1, 7, ..)`
/// must return `Some`.
#[test]
fn test_duplication_to_repeat_codon_frame_gate_passes_cag_in_coding() {
    let outcome = duplication_to_repeat(b"CCAGCAGCAGT", 1, 7, true);
    assert!(
        outcome.is_some(),
        "is_coding=true + unit_len=3 must allow rewrite"
    );
}
/// With `is_coding = true`, `normalize_repeat(b"CAAAAAC", 1, b"A", 3, ..)`
/// must return the `RepeatNormResult::Deletion` variant.
#[test]
fn test_normalize_repeat_codon_frame_gate_routes_contraction_to_deletion() {
    let outcome = normalize_repeat(b"CAAAAAC", 1, b"A", 3, true);
    if !matches!(outcome, RepeatNormResult::Deletion { .. }) {
        panic!("expected Deletion under gate, got {:?}", outcome);
    }
}
/// With `is_coding = true`, `normalize_repeat(b"CAAAAAC", 1, b"A", 8, ..)`
/// must return `RepeatNormResult::Insertion` carrying `AAA`.
#[test]
fn test_normalize_repeat_codon_frame_gate_routes_expansion_to_insertion() {
    let outcome = normalize_repeat(b"CAAAAAC", 1, b"A", 8, true);
    match outcome {
        RepeatNormResult::Insertion {
            sequence: inserted, ..
        } => assert_eq!(inserted, b"AAA", "3 extra A's"),
        unexpected => panic!("expected Insertion under gate, got {:?}", unexpected),
    }
}
/// With `is_coding = true`, `normalize_repeat(b"CAAAAAC", 1, b"A", 6, ..)`
/// must return the `RepeatNormResult::Duplication` variant.
#[test]
fn test_normalize_repeat_codon_frame_gate_passes_through_dup_branch() {
    let outcome = normalize_repeat(b"CAAAAAC", 1, b"A", 6, true);
    if !matches!(outcome, RepeatNormResult::Duplication { .. }) {
        panic!("expected Duplication, got {:?}", outcome);
    }
}
/// Test helper: builds a `DelinsSubedit::Substitution` at `position` from
/// the base characters `r` (reference) and `a` (alternative).
///
/// Panics if either character is rejected by `Base::from_char`.
fn sub_at(position: usize, r: char, a: char) -> DelinsSubedit {
    use crate::hgvs::edit::Base;

    let reference = Base::from_char(r).unwrap();
    let alternative = Base::from_char(a).unwrap();
    DelinsSubedit::Substitution {
        position,
        reference,
        alternative,
    }
}
/// Test helper: builds a `DelinsSubedit::Inversion` with the given `start`
/// and `end`.
fn inv_at(start: usize, end: usize) -> DelinsSubedit {
DelinsSubedit::Inversion { start, end }
}
/// Test helper: builds a `DelinsSubedit::IdentityAt` for the given
/// `position`.
fn ident_at(position: usize) -> DelinsSubedit {
DelinsSubedit::IdentityAt { position }
}
/// `TCC` -> `GAG` over 0..3 must decompose into an inversion of 0..2
/// followed by a C>G substitution at 2.
#[test]
fn decompose_inv_subspan_at_start() {
    let actual = decompose_delins_inv(b"TCC", 0, 3, b"GAG");
    let expected = Some(vec![inv_at(0, 2), sub_at(2, 'C', 'G')]);
    assert_eq!(actual, expected);
}
/// `AAG` -> `GCT` over 0..3 must decompose into an A>G substitution at 0
/// followed by an inversion of 1..3.
#[test]
fn decompose_inv_subspan_at_end() {
    let actual = decompose_delins_inv(b"AAG", 0, 3, b"GCT");
    let expected = Some(vec![sub_at(0, 'A', 'G'), inv_at(1, 3)]);
    assert_eq!(actual, expected);
}
/// `GCT` -> `AGC` over 0..3 must not be decomposed (`None`).
#[test]
fn decompose_full_span_inv_returns_none() {
    assert_eq!(decompose_delins_inv(b"GCT", 0, 3, b"AGC"), None);
}
/// `AT` -> `GC` over 0..2 must not be decomposed (`None`).
#[test]
fn decompose_no_inv_returns_none() {
    assert_eq!(decompose_delins_inv(b"AT", 0, 2, b"GC"), None);
}
/// `AGACC` -> `CTTGG` over 0..5 must decompose into two inversions (0..2
/// and 3..5) separated by an A>T substitution at 2.
#[test]
fn decompose_disjoint_inv_runs() {
    let actual = decompose_delins_inv(b"AGACC", 0, 5, b"CTTGG");
    let expected = Some(vec![inv_at(0, 2), sub_at(2, 'A', 'T'), inv_at(3, 5)]);
    assert_eq!(actual, expected);
}
/// `TAG` -> `AAC` over 0..3 must not be decomposed (`None`).
#[test]
fn decompose_codon_frame_merge_returns_none() {
    assert_eq!(decompose_delins_inv(b"TAG", 0, 3, b"AAC"), None);
}
/// `AC` -> `TG` (complemented but not reversed) must yield `None`.
#[test]
fn decompose_complement_only_returns_none() {
    assert_eq!(decompose_delins_inv(b"AC", 0, 2, b"TG"), None);
}
/// `AC` -> `CA` (reversed but not complemented) must yield `None`.
#[test]
fn decompose_reverse_only_returns_none() {
    assert_eq!(decompose_delins_inv(b"AC", 0, 2, b"CA"), None);
}
/// A 2-base span (`AC`) replaced by 3 bases (`GTT`) must yield `None`.
#[test]
fn decompose_unequal_length_returns_none() {
    assert_eq!(decompose_delins_inv(b"AC", 0, 2, b"GTT"), None);
}
/// With `TCC` placed at 100..103 of a 200-base all-`A` reference, the
/// sub-edits for `GAG` must be reported at 100..102 and 102, i.e. in the
/// coordinates of the full reference.
#[test]
fn decompose_offset_start_propagates_position() {
    let mut reference = vec![b'A'; 200];
    reference[100..103].copy_from_slice(b"TCC");

    let actual = decompose_delins_inv(&reference, 100, 103, b"GAG");
    let expected = Some(vec![inv_at(100, 102), sub_at(102, 'C', 'G')]);
    assert_eq!(actual, expected);
}
/// `GCTA` -> `TAGC` over 0..4 must not be decomposed (`None`).
#[test]
fn decompose_palindromic_full_span_returns_none() {
    assert_eq!(decompose_delins_inv(b"GCTA", 0, 4, b"TAGC"), None);
}
/// `ATATC` -> `ATATG` over 0..5 must not be decomposed (`None`).
#[test]
fn decompose_palindromic_inv_subspan_skipped() {
    assert_eq!(decompose_delins_inv(b"ATATC", 0, 5, b"ATATG"), None);
}
/// `CTATGC` -> `CATAGG` over 0..6 must decompose into an inversion of 1..4
/// plus a C>G substitution at 5.
#[test]
fn decompose_inv_subspan_shortened_outer_pair() {
    let actual = decompose_delins_inv(b"CTATGC", 0, 6, b"CATAGG");
    let expected = Some(vec![inv_at(1, 4), sub_at(5, 'C', 'G')]);
    assert_eq!(actual, expected);
}
/// `AGACC` -> `CTAGT` over 0..5 must decompose into an inversion of 0..2,
/// an identity at 2, then substitutions C>G at 3 and C>T at 4.
#[test]
fn decompose_inv_run_with_identity_in_middle() {
    let actual = decompose_delins_inv(b"AGACC", 0, 5, b"CTAGT");
    let expected = Some(vec![
        inv_at(0, 2),
        ident_at(2),
        sub_at(3, 'C', 'G'),
        sub_at(4, 'C', 'T'),
    ]);
    assert_eq!(actual, expected);
}
/// A conversion whose source is `42536337_42536382` must become a `Delins`
/// whose inserted sequence is a `PositionRange` with those two bounds.
#[test]
fn canonicalize_conversion_same_reference_emits_position_range() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "42536337_42536382".to_string(),
    };
    match canonicalize_conversion_to_delins(&conversion).expect("expected Some") {
        NaEdit::Delins {
            sequence:
                InsertedSequence::PositionRange {
                    start: range_start,
                    end: range_end,
                },
            ..
        } => {
            assert_eq!(range_start, 42536337);
            assert_eq!(range_end, 42536382);
        }
        unexpected => panic!("expected Delins{{PositionRange}}, got {:?}", unexpected),
    }
}
/// A conversion whose source names another reference
/// (`NM_000089.1:c.789_1011`) must become a `Delins` holding that text as
/// `InsertedSequence::Reference`, and must display in bracketed form.
#[test]
fn canonicalize_conversion_cross_reference_emits_bracketed_reference() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "NM_000089.1:c.789_1011".to_string(),
    };
    let converted = canonicalize_conversion_to_delins(&conversion).expect("expected Some");
    match &converted {
        NaEdit::Delins {
            sequence: InsertedSequence::Reference(target),
            ..
        } => {
            assert_eq!(target, "NM_000089.1:c.789_1011");
        }
        unexpected => panic!("expected Delins{{Reference}}, got {:?}", unexpected),
    }

    let rendered = format!("{}", converted);
    assert_eq!(rendered, "delins[NM_000089.1:c.789_1011]");
}
/// Passing a non-conversion edit (a bare deletion) must yield `None`.
#[test]
fn canonicalize_conversion_returns_none_for_non_conversion() {
    let not_a_conversion = NaEdit::Deletion {
        sequence: None,
        length: None,
    };
    let outcome = canonicalize_conversion_to_delins(&not_a_conversion);
    assert!(outcome.is_none());
}
/// A source whose first number has 21 digits (`123456789012345678901_2`)
/// must produce a `Delins` with an `InsertedSequence::Reference` payload.
#[test]
fn canonicalize_conversion_overflow_falls_back_to_reference() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "123456789012345678901_2".to_string(),
    };
    let converted = canonicalize_conversion_to_delins(&conversion).expect("expected Some");
    let fell_back = matches!(
        converted,
        NaEdit::Delins {
            sequence: InsertedSequence::Reference(_),
            ..
        }
    );
    assert!(fell_back);
}
/// The source `0_0` must produce a `Delins` whose inserted sequence is
/// `InsertedSequence::Reference("0_0")`.
#[test]
fn canonicalize_conversion_zero_position_falls_back_to_reference() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "0_0".to_string(),
    };
    match canonicalize_conversion_to_delins(&conversion).expect("expected Some") {
        NaEdit::Delins {
            sequence: InsertedSequence::Reference(target),
            ..
        } => assert_eq!(target, "0_0"),
        unexpected => panic!("expected Delins{{Reference}} for 0_0, got {:?}", unexpected),
    }
}
/// The source `10_2` (start greater than end) must produce a `Delins`
/// whose inserted sequence is `InsertedSequence::Reference("10_2")`.
#[test]
fn canonicalize_conversion_reversed_range_falls_back_to_reference() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "10_2".to_string(),
    };
    match canonicalize_conversion_to_delins(&conversion).expect("expected Some") {
        NaEdit::Delins {
            sequence: InsertedSequence::Reference(target),
            ..
        } => assert_eq!(target, "10_2"),
        unexpected => panic!("expected Delins{{Reference}} for 10_2, got {:?}", unexpected),
    }
}
/// The source `0_5` must produce a `Delins` with an
/// `InsertedSequence::Reference` payload.
#[test]
fn canonicalize_conversion_zero_start_falls_back_to_reference() {
    use crate::hgvs::edit::InsertedSequence;

    let conversion = NaEdit::Conversion {
        source: "0_5".to_string(),
    };
    let converted = canonicalize_conversion_to_delins(&conversion).expect("expected Some");
    let fell_back = matches!(
        converted,
        NaEdit::Delins {
            sequence: InsertedSequence::Reference(_),
            ..
        }
    );
    assert!(
        fell_back,
        "expected Delins{{Reference}} fallback for 0_5"
    );
}
/// Inserting `A` at 3 in `TTAAATT` must become a duplication with unit
/// `A` whose start and end are both 5.
#[test]
fn test_insertion_to_duplication_homopolymer_matched() {
    let dup = insertion_to_duplication(b"TTAAATT", 3, b"A").expect("should fire");
    assert_eq!(dup.unit, b"A");
    assert_eq!(dup.start, 5);
    assert_eq!(dup.end, 5);
}
/// Inserting `TG` at 2 in `ACGTGTGTAC` must become a duplication reported
/// with the rotated unit `GT`, start 7 and end 8.
#[test]
fn test_insertion_to_duplication_cyclic_rotation_two_base() {
    let dup = insertion_to_duplication(b"ACGTGTGTAC", 2, b"TG").expect("should fire");
    assert_eq!(dup.unit, b"GT");
    assert_eq!(dup.start, 7);
    assert_eq!(dup.end, 8);
}
/// Inserting `X` at 3 in `ACGTACGT` must return `None`.
#[test]
fn test_insertion_to_duplication_no_adjacent_tract() {
    let outcome = insertion_to_duplication(b"ACGTACGT", 3, b"X");
    assert!(outcome.is_none());
}
/// An empty inserted sequence, and an empty reference, must each yield
/// `None`.
#[test]
fn test_insertion_to_duplication_empty_or_oob() {
    let empty_insert = insertion_to_duplication(b"TTAAATT", 3, b"");
    assert!(empty_insert.is_none());

    let empty_reference = insertion_to_duplication(b"", 0, b"A");
    assert!(empty_reference.is_none());
}
/// Inserting `AA` at 3 in `TTAAATT` must return `None`.
#[test]
fn test_insertion_to_duplication_rejects_multi_copy() {
    let outcome = insertion_to_duplication(b"TTAAATT", 3, b"AA");
    assert!(outcome.is_none());
}
/// Inserting `GT` at 2 in `ACGTGTGTAC` must become a duplication with unit
/// `GT`, start 7 and end 8.
#[test]
fn test_insertion_to_duplication_phase_matched_first_base() {
    let dup = insertion_to_duplication(b"ACGTGTGTAC", 2, b"GT").expect("should fire");
    assert_eq!(dup.unit, b"GT");
    assert_eq!(dup.start, 7);
    assert_eq!(dup.end, 8);
}
/// A `Delins` that spells out the deleted sequence (`ATG`) must
/// canonicalize to a `Delins` whose `deleted` and `deleted_length` are
/// both `None`.
#[test]
fn test_canonicalize_edit_delins_strips_explicit_deleted_seq() {
    use crate::hgvs::edit::{InsertedSequence, Sequence};
    use std::str::FromStr;

    let inserted = InsertedSequence::Literal(Sequence::from_str("TTCC").unwrap());
    let removed = Some(Sequence::from_str("ATG").unwrap());
    let input = NaEdit::Delins {
        sequence: inserted,
        deleted: removed,
        deleted_length: None,
    };

    match canonicalize_edit(&input) {
        NaEdit::Delins {
            deleted,
            deleted_length,
            ..
        } => {
            assert_eq!(deleted, None);
            assert_eq!(deleted_length, None);
        }
        other => panic!("expected NaEdit::Delins, got {other:?}"),
    }
}
/// A `Delins` that carries an explicit deleted length (3) must
/// canonicalize to a `Delins` whose `deleted` and `deleted_length` are
/// both `None`.
#[test]
fn test_canonicalize_edit_delins_strips_explicit_deleted_count() {
    use crate::hgvs::edit::{InsertedSequence, Sequence};
    use std::str::FromStr;

    let inserted = InsertedSequence::Literal(Sequence::from_str("TA").unwrap());
    let input = NaEdit::Delins {
        sequence: inserted,
        deleted: None,
        deleted_length: Some(3),
    };

    match canonicalize_edit(&input) {
        NaEdit::Delins {
            deleted,
            deleted_length,
            ..
        } => {
            assert_eq!(deleted, None);
            assert_eq!(deleted_length, None);
        }
        other => panic!("expected NaEdit::Delins, got {other:?}"),
    }
}
}