use crate::coords::index_to_hgvs_pos;
use crate::hgvs::edit::NaEdit;
pub fn needs_normalization(edit: &NaEdit) -> bool {
matches!(
edit,
NaEdit::Deletion { .. }
| NaEdit::Insertion { .. }
| NaEdit::Duplication { .. }
| NaEdit::Delins { .. }
| NaEdit::Repeat { .. }
)
}
pub fn insertion_is_duplication(ref_seq: &[u8], pos: u64, inserted_seq: &[u8]) -> bool {
let ins_len = inserted_seq.len();
let pos_idx = pos as usize;
if ref_seq.is_empty() || pos_idx > ref_seq.len() {
return false;
}
if pos_idx >= ins_len && pos_idx <= ref_seq.len() {
let before_start = pos_idx - ins_len;
if ref_seq[before_start..pos_idx] == inserted_seq[..] {
return true;
}
}
if pos_idx + ins_len <= ref_seq.len() && ref_seq[pos_idx..pos_idx + ins_len] == inserted_seq[..]
{
return true;
}
false
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CanonicalForm {
Deletion,
Duplication,
Delins,
Insertion,
Repeat {
base: u8,
count: u64,
start: u64,
end: u64,
},
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepeatAnalysis {
pub is_homopolymer: bool,
pub base: Option<u8>,
pub ref_start: usize,
pub ref_end: usize,
pub ref_count: u64,
}
pub fn find_homopolymer_at(ref_seq: &[u8], pos: usize) -> Option<RepeatAnalysis> {
if pos >= ref_seq.len() {
return None;
}
let base = ref_seq[pos];
let mut start = pos;
while start > 0 && ref_seq[start - 1] == base {
start -= 1;
}
let mut end = pos + 1;
while end < ref_seq.len() && ref_seq[end] == base {
end += 1;
}
let count = (end - start) as u64;
if count < 2 {
return None;
}
Some(RepeatAnalysis {
is_homopolymer: true,
base: Some(base),
ref_start: start,
ref_end: end,
ref_count: count,
})
}
pub fn insertion_to_repeat(
ref_seq: &[u8],
pos: u64,
inserted_seq: &[u8],
) -> Option<(u8, u64, u64, u64, Vec<u8>)> {
if inserted_seq.is_empty() {
return None;
}
let base_unit = smallest_repeat_unit(inserted_seq);
let added_copies = (inserted_seq.len() / base_unit.len()) as u64;
if added_copies < 2 {
return None;
}
let u_len = base_unit.len();
let mut best: Option<(Vec<u8>, usize, u64)> = None;
for r in 0..u_len {
let mut rotated = Vec::with_capacity(u_len);
rotated.extend_from_slice(&base_unit[r..]);
rotated.extend_from_slice(&base_unit[..r]);
if let Some((ref_start, ref_count)) = find_tandem_extent(ref_seq, pos as usize, &rotated) {
if ref_count > 0 && best.as_ref().is_none_or(|(_, _, bc)| ref_count > *bc) {
best = Some((rotated, ref_start, ref_count));
}
}
}
let (unit, ref_start, ref_count) = best?;
let total_count = ref_count + added_copies;
let ref_end = ref_start + ref_count as usize * unit.len() - 1;
Some((
unit[0],
total_count,
index_to_hgvs_pos(ref_start),
index_to_hgvs_pos(ref_end),
unit,
))
}
pub(crate) fn smallest_repeat_unit(seq: &[u8]) -> &[u8] {
let n = seq.len();
for u in 1..=n {
if !n.is_multiple_of(u) {
continue;
}
let unit = &seq[..u];
if seq.chunks_exact(u).all(|c| c == unit) {
return unit;
}
}
seq }
fn find_tandem_extent(ref_seq: &[u8], pos: usize, unit: &[u8]) -> Option<(usize, u64)> {
let u = unit.len();
if u == 0 {
return None;
}
let ins_point = pos + 1;
let lo = ins_point.saturating_sub(u);
let hi = ins_point.min(ref_seq.len());
let mut best: Option<(usize, u64)> = None;
for anchor in lo..=hi {
if anchor + u > ref_seq.len() {
continue;
}
if &ref_seq[anchor..anchor + u] != unit {
continue;
}
let Some(TandemTract {
start,
end,
ref_count: count,
}) = extend_tandem_tract(ref_seq, anchor..anchor + u, unit)
else {
continue;
};
if ins_point < start || ins_point > end {
continue;
}
match best {
None => best = Some((start, count)),
Some((_, bc)) if count > bc => best = Some((start, count)),
_ => {}
}
}
best
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct TandemTract {
pub start: usize,
pub end: usize,
pub ref_count: u64,
}
pub(crate) fn extend_tandem_tract(
ref_seq: &[u8],
anchor: std::ops::Range<usize>,
unit: &[u8],
) -> Option<TandemTract> {
let u = unit.len();
if u == 0 || anchor.start > anchor.end || anchor.end > ref_seq.len() {
return None;
}
if !(anchor.end - anchor.start).is_multiple_of(u) {
return None;
}
if !ref_seq[anchor.start..anchor.end]
.chunks_exact(u)
.all(|chunk| chunk == unit)
{
return None;
}
let mut start = anchor.start;
while start >= u && &ref_seq[start - u..start] == unit {
start -= u;
}
let mut end = anchor.end;
while end + u <= ref_seq.len() && &ref_seq[end..end + u] == unit {
end += u;
}
let ref_count = ((end - start) / u) as u64;
Some(TandemTract {
start,
end,
ref_count,
})
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct DelToRepeatResult {
pub unit: Vec<u8>,
pub count: u64,
pub start: u64,
pub end: u64,
}
pub(crate) fn deletion_to_repeat(
ref_seq: &[u8],
del_start: usize,
del_end: usize,
) -> Option<DelToRepeatResult> {
if del_start >= del_end || del_end > ref_seq.len() {
return None;
}
let del_slice = &ref_seq[del_start..del_end];
let unit_slice = smallest_repeat_unit(del_slice);
let p = unit_slice.len();
if p == 0 || !(del_end - del_start).is_multiple_of(p) {
return None;
}
let tract = extend_tandem_tract(ref_seq, del_start..del_end, unit_slice)?;
if tract.ref_count < 2 {
return None;
}
let k = ((del_end - del_start) / p) as u64;
if k < 2 {
return None;
}
let post_count = tract.ref_count - k;
if post_count == 0 {
return None;
}
Some(DelToRepeatResult {
unit: unit_slice.to_vec(),
count: post_count,
start: index_to_hgvs_pos(tract.start),
end: index_to_hgvs_pos(tract.end - 1),
})
}
fn complement(base: u8) -> u8 {
match base {
b'A' | b'a' => b'T',
b'T' | b't' => b'A',
b'G' | b'g' => b'C',
b'C' | b'c' => b'G',
b'U' | b'u' => b'A', _ => base, }
}
pub fn shorten_inversion(ref_seq: &[u8], start: usize, end: usize) -> Option<(usize, usize)> {
if start >= end || end > ref_seq.len() {
return None;
}
let mut s = start;
let mut e = end;
while s < e {
let first = ref_seq[s];
let last = ref_seq[e - 1];
if complement(first) == last {
s += 1;
e -= 1;
} else {
break;
}
}
if e <= s + 1 {
return None; }
Some((s, e))
}
pub fn delins_is_identity(ref_seq: &[u8], start: usize, end: usize, inserted_seq: &[u8]) -> bool {
if start >= end || end > ref_seq.len() {
return false;
}
let deleted_len = end - start;
if inserted_seq.len() != deleted_len {
return false;
}
&ref_seq[start..end] == inserted_seq
}
pub fn delins_is_substitution(
ref_seq: &[u8],
start: usize,
end: usize,
inserted_seq: &[u8],
) -> Option<(crate::hgvs::edit::Base, crate::hgvs::edit::Base)> {
use crate::hgvs::edit::Base;
if end != start + 1 || end > ref_seq.len() || inserted_seq.len() != 1 {
return None;
}
let ref_byte = ref_seq[start];
let alt_byte = inserted_seq[0];
if ref_byte == alt_byte {
return None;
}
Some((
Base::from_char(ref_byte as char)?,
Base::from_char(alt_byte as char)?,
))
}
pub fn delins_is_duplication(
ref_seq: &[u8],
start: usize,
end: usize,
inserted_seq: &[u8],
) -> bool {
if start >= end || end > ref_seq.len() {
return false;
}
let deleted_len = end - start;
let deleted_seq = &ref_seq[start..end];
if inserted_seq.len() != 2 * deleted_len {
return false;
}
let (first_half, second_half) = inserted_seq.split_at(deleted_len);
first_half == deleted_seq && second_half == deleted_seq
}
#[derive(Debug, Clone)]
pub enum DupToRepeatResult {
Homopolymer {
base: u8,
count: u64,
start: u64, end: u64, },
TandemRepeat {
unit: Vec<u8>,
count: u64,
start: u64, end: u64, },
}
pub fn duplication_to_repeat(ref_seq: &[u8], start: u64, end: u64) -> Option<DupToRepeatResult> {
let start_idx = start as usize;
let end_idx = end as usize;
if start_idx >= ref_seq.len() || end_idx > ref_seq.len() || start_idx >= end_idx {
return None;
}
let dup_seq = &ref_seq[start_idx..end_idx];
if dup_seq.is_empty() {
return None;
}
let dup_len = dup_seq.len();
let first = dup_seq[0];
if dup_len >= 2 && dup_seq.iter().all(|&b| b == first) {
if let Some(analysis) = find_homopolymer_at(ref_seq, start_idx) {
if analysis.base == Some(first) {
let total_count = analysis.ref_count + dup_len as u64;
return Some(DupToRepeatResult::Homopolymer {
base: first,
count: total_count,
start: index_to_hgvs_pos(analysis.ref_start),
end: index_to_hgvs_pos(analysis.ref_start + analysis.ref_count as usize - 1),
});
}
}
}
for unit_len in 1..=dup_len / 2 {
if !dup_len.is_multiple_of(unit_len) {
continue;
}
let unit = &dup_seq[0..unit_len];
let copies_in_dup = dup_len / unit_len;
if copies_in_dup < 2 {
continue;
}
let is_repeat = (0..copies_in_dup).all(|i| {
let chunk = &dup_seq[i * unit_len..(i + 1) * unit_len];
chunk == unit
});
if !is_repeat {
continue;
}
if let Some((ref_count, rep_start, rep_end)) =
count_tandem_repeats(ref_seq, start_idx, unit)
{
let total_count = ref_count + copies_in_dup as u64;
return Some(DupToRepeatResult::TandemRepeat {
unit: unit.to_vec(),
count: total_count,
start: index_to_hgvs_pos(rep_start),
end: index_to_hgvs_pos(rep_end - 1),
});
}
}
None
}
pub fn count_tandem_repeats(
ref_seq: &[u8],
pos: usize,
repeat_unit: &[u8],
) -> Option<(u64, usize, usize)> {
if repeat_unit.is_empty() || pos >= ref_seq.len() {
return None;
}
let unit_len = repeat_unit.len();
if pos + unit_len > ref_seq.len() {
return None;
}
let mut start = pos;
while start >= unit_len {
let candidate = &ref_seq[start - unit_len..start];
if candidate == repeat_unit {
start -= unit_len;
} else {
break;
}
}
let mut end = start;
let mut count = 0u64;
while end + unit_len <= ref_seq.len() {
if &ref_seq[end..end + unit_len] == repeat_unit {
count += 1;
end += unit_len;
} else {
break;
}
}
if count >= 1 {
Some((count, start, end))
} else {
None
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RepeatNormResult {
Deletion {
start: u64, end: u64, },
Duplication {
start: u64, end: u64, sequence: Vec<u8>,
},
Repeat {
start: u64, end: u64, sequence: Vec<u8>,
count: u64,
},
Unchanged,
}
pub fn normalize_repeat(
ref_seq: &[u8],
pos: usize,
repeat_unit: &[u8],
specified_count: u64,
) -> RepeatNormResult {
if repeat_unit.is_empty() {
return RepeatNormResult::Unchanged;
}
let canonical_unit = smallest_repeat_unit(repeat_unit);
let copies_per_input_unit = (repeat_unit.len() / canonical_unit.len()) as u64;
let specified_count = specified_count * copies_per_input_unit;
let Some((ref_count, ref_start, ref_end)) = count_tandem_repeats(ref_seq, pos, canonical_unit)
else {
return RepeatNormResult::Unchanged;
};
let unit_len = canonical_unit.len() as u64;
if specified_count < ref_count {
let k = ref_count - specified_count;
if k >= 2 && specified_count >= 1 {
RepeatNormResult::Repeat {
start: index_to_hgvs_pos(ref_start),
end: index_to_hgvs_pos(ref_end - 1),
sequence: canonical_unit.to_vec(),
count: specified_count,
}
} else {
let del_len = (k as usize) * unit_len as usize;
let del_end_idx = ref_end - 1;
let del_start_idx = ref_end - del_len;
RepeatNormResult::Deletion {
start: index_to_hgvs_pos(del_start_idx),
end: index_to_hgvs_pos(del_end_idx),
}
}
} else if specified_count == ref_count + 1 {
let dup_end_idx = ref_end - 1;
let dup_start_idx = ref_end - canonical_unit.len();
RepeatNormResult::Duplication {
start: index_to_hgvs_pos(dup_start_idx),
end: index_to_hgvs_pos(dup_end_idx),
sequence: canonical_unit.to_vec(),
}
} else if specified_count == ref_count {
RepeatNormResult::Unchanged
} else {
RepeatNormResult::Repeat {
start: index_to_hgvs_pos(ref_start),
end: index_to_hgvs_pos(ref_end - 1),
sequence: canonical_unit.to_vec(),
count: specified_count,
}
}
}
pub fn get_canonical_form(edit: &NaEdit, ref_seq: &[u8], start: u64, _end: u64) -> CanonicalForm {
use crate::hgvs::edit::InsertedSequence;
match edit {
NaEdit::Deletion { .. } => {
CanonicalForm::Deletion
}
NaEdit::Insertion { sequence } => {
if let InsertedSequence::Literal(seq) = sequence {
let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
if insertion_is_duplication(ref_seq, start, &seq_bytes) {
return CanonicalForm::Duplication;
}
}
CanonicalForm::Insertion
}
NaEdit::Delins { .. } => CanonicalForm::Delins,
NaEdit::Duplication { .. } => CanonicalForm::Duplication,
_ => CanonicalForm::Deletion, }
}
pub fn canonicalize_edit(edit: &NaEdit) -> NaEdit {
match edit {
NaEdit::Deletion { .. } => NaEdit::Deletion {
sequence: None,
length: None,
},
NaEdit::Duplication {
uncertain_extent, ..
} => NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: uncertain_extent.clone(),
},
NaEdit::Delins { sequence } => {
NaEdit::Delins {
sequence: sequence.clone(),
}
}
_ => edit.clone(),
}
}
pub fn should_canonicalize(edit: &NaEdit) -> bool {
match edit {
NaEdit::Deletion { sequence, length } => sequence.is_some() || length.is_some(),
NaEdit::Duplication {
sequence, length, ..
} => sequence.is_some() || length.is_some(),
_ => false,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_needs_normalization() {
assert!(needs_normalization(&NaEdit::Deletion {
sequence: None,
length: None,
}));
assert!(needs_normalization(&NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
}));
assert!(!needs_normalization(&NaEdit::Inversion {
sequence: None,
length: None,
}));
}
#[test]
fn test_insertion_is_duplication() {
let ref_seq = b"ATGATGATG";
assert!(insertion_is_duplication(ref_seq, 3, b"ATG"));
assert!(!insertion_is_duplication(ref_seq, 3, b"TGA"));
assert!(insertion_is_duplication(ref_seq, 6, b"ATG"));
}
#[test]
fn test_insertion_is_duplication_pos_beyond_ref() {
let ref_seq = b"ATGATGATG";
assert!(!insertion_is_duplication(ref_seq, 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 0, b"A"));
}
#[test]
fn test_deletion_stays_deletion() {
let ref_seq = b"ATGATGATG";
let del_edit = NaEdit::Deletion {
sequence: None,
length: None,
};
assert_eq!(
get_canonical_form(&del_edit, ref_seq, 3, 6),
CanonicalForm::Deletion
);
}
#[test]
fn test_delins_is_identity() {
let ref_seq = b"ACGTACGT";
assert!(delins_is_identity(ref_seq, 3, 4, b"T"));
assert!(delins_is_identity(ref_seq, 1, 4, b"CGT"));
assert!(delins_is_identity(ref_seq, 0, 8, b"ACGTACGT"));
assert!(!delins_is_identity(ref_seq, 3, 4, b"A"));
assert!(!delins_is_identity(ref_seq, 1, 4, b"CGA"));
assert!(!delins_is_identity(ref_seq, 3, 4, b"TT"));
assert!(!delins_is_identity(ref_seq, 1, 3, b"C"));
assert!(!delins_is_identity(ref_seq, 3, 4, b""));
assert!(!delins_is_identity(ref_seq, 7, 9, b"TT"));
assert!(!delins_is_identity(ref_seq, 5, 3, b"CG"));
}
#[test]
fn test_delins_is_substitution() {
use crate::hgvs::edit::Base;
let ref_seq = b"ACGTACGT";
assert_eq!(
delins_is_substitution(ref_seq, 3, 4, b"A"),
Some((Base::T, Base::A))
);
assert_eq!(delins_is_substitution(ref_seq, 3, 4, b"T"), None);
assert_eq!(delins_is_substitution(ref_seq, 3, 5, b"A"), None);
assert_eq!(delins_is_substitution(ref_seq, 3, 4, b"AT"), None);
assert_eq!(delins_is_substitution(ref_seq, 8, 9, b"A"), None);
assert_eq!(delins_is_substitution(ref_seq, 3, 4, b""), None);
}
#[test]
fn test_insertion_becomes_dup() {
use crate::hgvs::edit::{InsertedSequence, Sequence};
use std::str::FromStr;
let ref_seq = b"ATGATGATG";
let ins_edit = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("ATG").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit, ref_seq, 3, 3),
CanonicalForm::Duplication
);
let ins_edit2 = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("TGA").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit2, ref_seq, 3, 3),
CanonicalForm::Insertion
);
}
#[test]
fn test_find_homopolymer_at() {
let ref_seq = b"GGGAAAAAGGG";
let result = find_homopolymer_at(ref_seq, 4);
assert!(result.is_some());
let analysis = result.unwrap();
assert!(analysis.is_homopolymer);
assert_eq!(analysis.base, Some(b'A'));
assert_eq!(analysis.ref_start, 3);
assert_eq!(analysis.ref_end, 8); assert_eq!(analysis.ref_count, 5);
let result = find_homopolymer_at(ref_seq, 1);
assert!(result.is_some());
let analysis = result.unwrap();
assert_eq!(analysis.base, Some(b'G'));
assert_eq!(analysis.ref_count, 3);
let single_seq = b"ATGC";
assert!(find_homopolymer_at(single_seq, 0).is_none());
}
#[test]
fn test_insertion_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = insertion_to_repeat(ref_seq, 7, b"AA");
assert!(result.is_some());
let (base, count, start, end, _unit) = result.unwrap();
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); assert_eq!(end, 8);
let result = insertion_to_repeat(ref_seq, 7, b"A");
assert!(result.is_none());
let result = insertion_to_repeat(ref_seq, 7, b"T");
assert!(result.is_none());
let result = insertion_to_repeat(ref_seq, 7, b"AT");
assert!(result.is_none());
let ref_ac = b"ACACGGG";
let result = insertion_to_repeat(ref_ac, 3, b"ACAC");
assert!(result.is_some());
let (base, count, start, end, unit) = result.unwrap();
assert_eq!(base, b'A');
assert_eq!(count, 4);
assert_eq!(start, 1); assert_eq!(end, 4); assert_eq!(unit, b"AC");
}
#[test]
fn test_duplication_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = duplication_to_repeat(ref_seq, 3, 5);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::Homopolymer {
base, count, start, ..
} => {
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); }
_ => panic!("Expected Homopolymer result"),
}
let non_repeat_seq = b"ATGCXYZ";
let result = duplication_to_repeat(non_repeat_seq, 0, 3);
assert!(result.is_none());
}
#[test]
fn test_duplication_to_tandem_repeat() {
let ref_seq = b"AAAAAGCAGCAGCAGCAGCAGCAGCAGCAAAAA";
let result = duplication_to_repeat(ref_seq, 5, 8);
assert!(result.is_none(), "Single-copy dup should not become repeat");
let result = duplication_to_repeat(ref_seq, 5, 11);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::TandemRepeat {
unit,
count,
start,
end,
} => {
assert_eq!(unit, b"GCA");
assert_eq!(count, 10); assert_eq!(start, 6); assert_eq!(end, 29); }
_ => panic!("Expected TandemRepeat result"),
}
}
#[test]
fn test_count_tandem_repeats_basic() {
let ref_seq = b"GGGCATCATCATGGG";
let result = count_tandem_repeats(ref_seq, 3, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
let result = count_tandem_repeats(ref_seq, 6, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
}
#[test]
fn test_count_tandem_repeats_single_base() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"A");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 6);
assert_eq!(start, 3);
assert_eq!(end, 9);
}
#[test]
fn test_count_tandem_repeats_no_match() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"XYZ");
assert!(result.is_none());
}
#[test]
fn test_normalize_repeat_to_deletion() {
let ref_seq = b"GGGCATCATCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 1);
match result {
RepeatNormResult::Repeat {
sequence, count, ..
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(count, 1, "Should reflect specified count of 1");
}
_ => panic!("Expected Repeat (B2), got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_to_duplication() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 3);
match result {
RepeatNormResult::Duplication {
start,
end,
sequence,
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(end - start + 1, 3, "Should duplicate 3 bases (1 CAT)");
}
_ => panic!("Expected Duplication, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_stays_repeat() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 5);
match result {
RepeatNormResult::Repeat {
count, sequence, ..
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(count, 5);
}
_ => panic!("Expected Repeat, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_unchanged() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 2);
assert!(matches!(result, RepeatNormResult::Unchanged));
}
#[test]
fn test_normalize_repeat_empty_unit_is_unchanged() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"", 1);
assert!(matches!(result, RepeatNormResult::Unchanged));
}
#[test]
fn test_normalize_repeat_canonicalizes_non_minimal_unit() {
let ref_seq = b"GGGATATATATGGG";
let result = normalize_repeat(ref_seq, 3, b"ATAT", 1);
match result {
RepeatNormResult::Repeat {
start,
end,
sequence,
count,
} => {
assert_eq!(sequence, b"AT", "Should emit canonical (smallest) unit");
assert_eq!(count, 2, "Specified ATAT[1] = 2 canonical AT copies");
assert_eq!(start, 4, "Canonical tract starts at HGVS pos 4");
assert_eq!(end, 11, "Canonical tract ends at HGVS pos 11");
}
_ => panic!("Expected canonical AT[2] Repeat, got {:?}", result),
}
}
#[test]
fn test_complement() {
assert_eq!(complement(b'A'), b'T');
assert_eq!(complement(b'T'), b'A');
assert_eq!(complement(b'G'), b'C');
assert_eq!(complement(b'C'), b'G');
assert_eq!(complement(b'N'), b'N'); }
#[test]
fn test_shorten_inversion_basic() {
let seq = b"ATGCAT";
let result = shorten_inversion(seq, 0, 6);
assert!(
result.is_none(),
"Fully complementary inversion should become identity"
);
}
#[test]
fn test_shorten_inversion_partial() {
let seq = b"ATGGAT";
let result = shorten_inversion(seq, 0, 6);
assert!(result.is_some());
let (s, e) = result.unwrap();
assert_eq!(s, 2);
assert_eq!(e, 4);
}
#[test]
fn test_shorten_inversion_no_change() {
let seq = b"GGCC";
let result = shorten_inversion(seq, 0, 4);
assert!(result.is_none());
let seq2 = b"GATT";
let result2 = shorten_inversion(seq2, 0, 4);
assert!(result2.is_some());
let (s, e) = result2.unwrap();
assert_eq!(s, 0);
assert_eq!(e, 4);
}
#[test]
fn test_extend_tandem_tract_homopolymer() {
let ref_seq = b"TTAAAATT";
let tract = extend_tandem_tract(ref_seq, 2..4, b"A").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 6);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_multi_base_unit() {
let ref_seq = b"TTGCAGCAGCATT";
let tract = extend_tandem_tract(ref_seq, 5..8, b"GCA").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 11);
assert_eq!(tract.ref_count, 3);
}
#[test]
fn test_extend_tandem_tract_anchor_at_5prime_edge() {
let ref_seq = b"AAAATT";
let tract = extend_tandem_tract(ref_seq, 0..2, b"A").unwrap();
assert_eq!(tract.start, 0);
assert_eq!(tract.end, 4);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_anchor_at_3prime_edge() {
let ref_seq = b"TTAAAA";
let tract = extend_tandem_tract(ref_seq, 4..6, b"A").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 6);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_zero_width_anchor() {
let ref_seq = b"TTAAAATT";
let tract = extend_tandem_tract(ref_seq, 4..4, b"A").unwrap();
assert_eq!(tract.start, 2);
assert_eq!(tract.end, 6);
assert_eq!(tract.ref_count, 4);
}
#[test]
fn test_extend_tandem_tract_anchor_not_unit_periodic() {
let ref_seq = b"TTAGAATT";
assert!(extend_tandem_tract(ref_seq, 2..4, b"A").is_none());
}
#[test]
fn test_extend_tandem_tract_anchor_length_not_multiple_of_unit() {
let ref_seq = b"AAA";
assert!(extend_tandem_tract(ref_seq, 0..3, b"AA").is_none());
}
#[test]
fn test_deletion_to_repeat_homopolymer_two_removed() {
let ref_seq = b"TTAAAAATT";
let r = deletion_to_repeat(ref_seq, 5, 7).expect("should fire");
assert_eq!(r.unit, b"A");
assert_eq!(r.count, 3);
assert_eq!(r.start, 3); assert_eq!(r.end, 7);
}
#[test]
fn test_deletion_to_repeat_multi_base_tandem_two_removed() {
let ref_seq = b"TTGCAGCAGCATT";
let r = deletion_to_repeat(ref_seq, 5, 11).expect("should fire");
assert_eq!(r.unit, b"GCA");
assert_eq!(r.count, 1);
assert_eq!(r.start, 3);
assert_eq!(r.end, 11);
}
#[test]
fn test_deletion_to_repeat_one_unit_removed_returns_none() {
let ref_seq = b"TTAAAAATT";
assert!(deletion_to_repeat(ref_seq, 6, 7).is_none());
}
#[test]
fn test_deletion_to_repeat_full_tract_removal_returns_none() {
let ref_seq = b"TTAATT";
assert!(deletion_to_repeat(ref_seq, 2, 4).is_none());
}
#[test]
fn test_deletion_to_repeat_non_tandem_returns_none() {
let ref_seq = b"TTGCATT";
assert!(deletion_to_repeat(ref_seq, 2, 5).is_none());
}
#[test]
fn test_deletion_to_repeat_finer_periodicity() {
let ref_seq = b"TTATATATATATT";
let r = deletion_to_repeat(ref_seq, 8, 12).expect("should fire");
assert_eq!(r.unit, b"AT");
assert_eq!(r.count, 3);
assert_eq!(r.start, 3); assert_eq!(r.end, 12);
}
#[test]
fn test_canonicalize_deletion_with_length() {
use crate::hgvs::edit::Sequence;
use std::str::FromStr;
let edit = NaEdit::Deletion {
sequence: None,
length: Some(12),
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Deletion {
sequence: None,
length: None
}
));
let edit = NaEdit::Deletion {
sequence: Some(Sequence::from_str("ATG").unwrap()),
length: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Deletion {
sequence: None,
length: None
}
));
}
#[test]
fn test_canonicalize_duplication_with_length() {
use crate::hgvs::edit::Sequence;
use std::str::FromStr;
let edit = NaEdit::Duplication {
sequence: None,
length: Some(12),
uncertain_extent: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Duplication {
sequence: None,
length: None,
..
}
));
let edit = NaEdit::Duplication {
sequence: Some(Sequence::from_str("ATG").unwrap()),
length: None,
uncertain_extent: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Duplication {
sequence: None,
length: None,
..
}
));
}
#[test]
fn test_should_canonicalize() {
assert!(should_canonicalize(&NaEdit::Deletion {
sequence: None,
length: Some(12)
}));
assert!(!should_canonicalize(&NaEdit::Deletion {
sequence: None,
length: None
}));
use crate::hgvs::edit::Base;
assert!(!should_canonicalize(&NaEdit::Substitution {
reference: Base::A,
alternative: Base::G
}));
}
}