use crate::coords::index_to_hgvs_pos;
use crate::hgvs::edit::NaEdit;
pub fn needs_normalization(edit: &NaEdit) -> bool {
matches!(
edit,
NaEdit::Deletion { .. }
| NaEdit::Insertion { .. }
| NaEdit::Duplication { .. }
| NaEdit::Delins { .. }
| NaEdit::Repeat { .. }
)
}
pub fn insertion_is_duplication(ref_seq: &[u8], pos: u64, inserted_seq: &[u8]) -> bool {
let ins_len = inserted_seq.len();
let pos_idx = pos as usize;
if ref_seq.is_empty() || pos_idx > ref_seq.len() {
return false;
}
if pos_idx >= ins_len && pos_idx <= ref_seq.len() {
let before_start = pos_idx - ins_len;
if ref_seq[before_start..pos_idx] == inserted_seq[..] {
return true;
}
}
if pos_idx + ins_len <= ref_seq.len() && ref_seq[pos_idx..pos_idx + ins_len] == inserted_seq[..]
{
return true;
}
false
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CanonicalForm {
Deletion,
Duplication,
Delins,
Insertion,
Repeat {
base: u8,
count: u64,
start: u64,
end: u64,
},
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepeatAnalysis {
pub is_homopolymer: bool,
pub base: Option<u8>,
pub ref_start: usize,
pub ref_end: usize,
pub ref_count: u64,
}
pub fn find_homopolymer_at(ref_seq: &[u8], pos: usize) -> Option<RepeatAnalysis> {
if pos >= ref_seq.len() {
return None;
}
let base = ref_seq[pos];
let mut start = pos;
while start > 0 && ref_seq[start - 1] == base {
start -= 1;
}
let mut end = pos + 1;
while end < ref_seq.len() && ref_seq[end] == base {
end += 1;
}
let count = (end - start) as u64;
if count < 2 {
return None;
}
Some(RepeatAnalysis {
is_homopolymer: true,
base: Some(base),
ref_start: start,
ref_end: end,
ref_count: count,
})
}
pub fn insertion_to_repeat(
ref_seq: &[u8],
pos: u64,
inserted_seq: &[u8],
) -> Option<(u8, u64, u64, u64)> {
if inserted_seq.is_empty() {
return None;
}
let first = inserted_seq[0];
if !inserted_seq.iter().all(|&b| b == first) {
return None;
}
let pos_idx = pos as usize;
let mut best_analysis: Option<RepeatAnalysis> = None;
if pos_idx > 0 && ref_seq.get(pos_idx - 1) == Some(&first) {
if let Some(analysis) = find_homopolymer_at(ref_seq, pos_idx - 1) {
if analysis.base == Some(first) {
best_analysis = Some(analysis);
}
}
}
if ref_seq.get(pos_idx) == Some(&first) {
if let Some(analysis) = find_homopolymer_at(ref_seq, pos_idx) {
if analysis.base == Some(first)
&& (best_analysis.is_none()
|| analysis.ref_count > best_analysis.as_ref().unwrap().ref_count)
{
best_analysis = Some(analysis);
}
}
}
if ref_seq.get(pos_idx + 1) == Some(&first) {
if let Some(analysis) = find_homopolymer_at(ref_seq, pos_idx + 1) {
if analysis.base == Some(first)
&& (best_analysis.is_none()
|| analysis.ref_count > best_analysis.as_ref().unwrap().ref_count)
{
best_analysis = Some(analysis);
}
}
}
let analysis = best_analysis;
if let Some(analysis) = analysis {
if analysis.base == Some(first) {
let total_count = analysis.ref_count + inserted_seq.len() as u64;
return Some((
first,
total_count,
index_to_hgvs_pos(analysis.ref_start),
index_to_hgvs_pos(analysis.ref_start + analysis.ref_count as usize - 1),
));
}
}
None
}
fn complement(base: u8) -> u8 {
match base {
b'A' | b'a' => b'T',
b'T' | b't' => b'A',
b'G' | b'g' => b'C',
b'C' | b'c' => b'G',
b'U' | b'u' => b'A', _ => base, }
}
pub fn shorten_inversion(ref_seq: &[u8], start: usize, end: usize) -> Option<(usize, usize)> {
if start >= end || end > ref_seq.len() {
return None;
}
let mut s = start;
let mut e = end;
while s < e {
let first = ref_seq[s];
let last = ref_seq[e - 1];
if complement(first) == last {
s += 1;
e -= 1;
} else {
break;
}
}
if e <= s + 1 {
return None; }
Some((s, e))
}
pub fn delins_is_duplication(
ref_seq: &[u8],
start: usize,
end: usize,
inserted_seq: &[u8],
) -> bool {
if start >= end || end > ref_seq.len() {
return false;
}
let deleted_len = end - start;
let deleted_seq = &ref_seq[start..end];
if inserted_seq.len() != 2 * deleted_len {
return false;
}
let (first_half, second_half) = inserted_seq.split_at(deleted_len);
first_half == deleted_seq && second_half == deleted_seq
}
#[derive(Debug, Clone)]
pub enum DupToRepeatResult {
Homopolymer {
base: u8,
count: u64,
start: u64, end: u64, },
TandemRepeat {
unit: Vec<u8>,
count: u64,
start: u64, end: u64, },
}
pub fn duplication_to_repeat(ref_seq: &[u8], start: u64, end: u64) -> Option<DupToRepeatResult> {
let start_idx = start as usize;
let end_idx = end as usize;
if start_idx >= ref_seq.len() || end_idx > ref_seq.len() || start_idx >= end_idx {
return None;
}
let dup_seq = &ref_seq[start_idx..end_idx];
if dup_seq.is_empty() {
return None;
}
let dup_len = dup_seq.len();
let first = dup_seq[0];
if dup_len >= 2 && dup_seq.iter().all(|&b| b == first) {
if let Some(analysis) = find_homopolymer_at(ref_seq, start_idx) {
if analysis.base == Some(first) {
let total_count = analysis.ref_count + dup_len as u64;
return Some(DupToRepeatResult::Homopolymer {
base: first,
count: total_count,
start: index_to_hgvs_pos(analysis.ref_start),
end: index_to_hgvs_pos(analysis.ref_start + analysis.ref_count as usize - 1),
});
}
}
}
for unit_len in 1..=dup_len / 2 {
if !dup_len.is_multiple_of(unit_len) {
continue;
}
let unit = &dup_seq[0..unit_len];
let copies_in_dup = dup_len / unit_len;
if copies_in_dup < 2 {
continue;
}
let is_repeat = (0..copies_in_dup).all(|i| {
let chunk = &dup_seq[i * unit_len..(i + 1) * unit_len];
chunk == unit
});
if !is_repeat {
continue;
}
if let Some((ref_count, rep_start, rep_end)) =
count_tandem_repeats(ref_seq, start_idx, unit)
{
let total_count = ref_count + copies_in_dup as u64;
return Some(DupToRepeatResult::TandemRepeat {
unit: unit.to_vec(),
count: total_count,
start: index_to_hgvs_pos(rep_start),
end: index_to_hgvs_pos(rep_end - 1),
});
}
}
None
}
pub fn count_tandem_repeats(
ref_seq: &[u8],
pos: usize,
repeat_unit: &[u8],
) -> Option<(u64, usize, usize)> {
if repeat_unit.is_empty() || pos >= ref_seq.len() {
return None;
}
let unit_len = repeat_unit.len();
if pos + unit_len > ref_seq.len() {
return None;
}
let mut start = pos;
while start >= unit_len {
let candidate = &ref_seq[start - unit_len..start];
if candidate == repeat_unit {
start -= unit_len;
} else {
break;
}
}
let mut end = start;
let mut count = 0u64;
while end + unit_len <= ref_seq.len() {
if &ref_seq[end..end + unit_len] == repeat_unit {
count += 1;
end += unit_len;
} else {
break;
}
}
if count >= 1 {
Some((count, start, end))
} else {
None
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RepeatNormResult {
Deletion {
start: u64, end: u64, },
Duplication {
start: u64, end: u64, sequence: Vec<u8>,
},
Repeat {
start: u64, end: u64, sequence: Vec<u8>,
count: u64,
},
Unchanged,
}
pub fn normalize_repeat(
ref_seq: &[u8],
pos: usize,
repeat_unit: &[u8],
specified_count: u64,
) -> RepeatNormResult {
let Some((ref_count, ref_start, ref_end)) = count_tandem_repeats(ref_seq, pos, repeat_unit)
else {
return RepeatNormResult::Unchanged;
};
let unit_len = repeat_unit.len() as u64;
if specified_count < ref_count {
let del_count = ref_count - specified_count;
let del_len = del_count * unit_len;
let del_end_idx = ref_end - 1;
let del_start_idx = ref_end - del_len as usize;
RepeatNormResult::Deletion {
start: index_to_hgvs_pos(del_start_idx),
end: index_to_hgvs_pos(del_end_idx),
}
} else if specified_count == ref_count + 1 {
let dup_end_idx = ref_end - 1;
let dup_start_idx = ref_end - repeat_unit.len();
RepeatNormResult::Duplication {
start: index_to_hgvs_pos(dup_start_idx),
end: index_to_hgvs_pos(dup_end_idx),
sequence: repeat_unit.to_vec(),
}
} else if specified_count == ref_count {
RepeatNormResult::Unchanged
} else {
RepeatNormResult::Repeat {
start: index_to_hgvs_pos(ref_start),
end: index_to_hgvs_pos(ref_end - 1),
sequence: repeat_unit.to_vec(),
count: specified_count,
}
}
}
pub fn get_canonical_form(edit: &NaEdit, ref_seq: &[u8], start: u64, _end: u64) -> CanonicalForm {
use crate::hgvs::edit::InsertedSequence;
match edit {
NaEdit::Deletion { .. } => {
CanonicalForm::Deletion
}
NaEdit::Insertion { sequence } => {
if let InsertedSequence::Literal(seq) = sequence {
let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
if insertion_is_duplication(ref_seq, start, &seq_bytes) {
return CanonicalForm::Duplication;
}
}
CanonicalForm::Insertion
}
NaEdit::Delins { .. } => CanonicalForm::Delins,
NaEdit::Duplication { .. } => CanonicalForm::Duplication,
_ => CanonicalForm::Deletion, }
}
pub fn canonicalize_edit(edit: &NaEdit) -> NaEdit {
match edit {
NaEdit::Deletion { .. } => NaEdit::Deletion {
sequence: None,
length: None,
},
NaEdit::Duplication {
uncertain_extent, ..
} => NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: uncertain_extent.clone(),
},
NaEdit::Delins { sequence } => {
NaEdit::Delins {
sequence: sequence.clone(),
}
}
_ => edit.clone(),
}
}
pub fn should_canonicalize(edit: &NaEdit) -> bool {
match edit {
NaEdit::Deletion { sequence, length } => sequence.is_some() || length.is_some(),
NaEdit::Duplication {
sequence, length, ..
} => sequence.is_some() || length.is_some(),
_ => false,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_needs_normalization() {
assert!(needs_normalization(&NaEdit::Deletion {
sequence: None,
length: None,
}));
assert!(needs_normalization(&NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
}));
assert!(!needs_normalization(&NaEdit::Inversion {
sequence: None,
length: None,
}));
}
#[test]
fn test_insertion_is_duplication() {
let ref_seq = b"ATGATGATG";
assert!(insertion_is_duplication(ref_seq, 3, b"ATG"));
assert!(!insertion_is_duplication(ref_seq, 3, b"TGA"));
assert!(insertion_is_duplication(ref_seq, 6, b"ATG"));
}
#[test]
fn test_insertion_is_duplication_pos_beyond_ref() {
let ref_seq = b"ATGATGATG";
assert!(!insertion_is_duplication(ref_seq, 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 95, b"TATTT"));
assert!(!insertion_is_duplication(b"", 0, b"A"));
}
#[test]
fn test_deletion_stays_deletion() {
let ref_seq = b"ATGATGATG";
let del_edit = NaEdit::Deletion {
sequence: None,
length: None,
};
assert_eq!(
get_canonical_form(&del_edit, ref_seq, 3, 6),
CanonicalForm::Deletion
);
}
#[test]
fn test_insertion_becomes_dup() {
use crate::hgvs::edit::{InsertedSequence, Sequence};
use std::str::FromStr;
let ref_seq = b"ATGATGATG";
let ins_edit = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("ATG").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit, ref_seq, 3, 3),
CanonicalForm::Duplication
);
let ins_edit2 = NaEdit::Insertion {
sequence: InsertedSequence::Literal(Sequence::from_str("TGA").unwrap()),
};
assert_eq!(
get_canonical_form(&ins_edit2, ref_seq, 3, 3),
CanonicalForm::Insertion
);
}
#[test]
fn test_find_homopolymer_at() {
let ref_seq = b"GGGAAAAAGGG";
let result = find_homopolymer_at(ref_seq, 4);
assert!(result.is_some());
let analysis = result.unwrap();
assert!(analysis.is_homopolymer);
assert_eq!(analysis.base, Some(b'A'));
assert_eq!(analysis.ref_start, 3);
assert_eq!(analysis.ref_end, 8); assert_eq!(analysis.ref_count, 5);
let result = find_homopolymer_at(ref_seq, 1);
assert!(result.is_some());
let analysis = result.unwrap();
assert_eq!(analysis.base, Some(b'G'));
assert_eq!(analysis.ref_count, 3);
let single_seq = b"ATGC";
assert!(find_homopolymer_at(single_seq, 0).is_none());
}
#[test]
fn test_insertion_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = insertion_to_repeat(ref_seq, 8, b"AA");
assert!(result.is_some());
let (base, count, start, end) = result.unwrap();
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); assert_eq!(end, 8);
let result = insertion_to_repeat(ref_seq, 8, b"T");
assert!(result.is_none());
let result = insertion_to_repeat(ref_seq, 8, b"AT");
assert!(result.is_none());
}
#[test]
fn test_duplication_to_repeat() {
let ref_seq = b"GGGAAAAAGGG";
let result = duplication_to_repeat(ref_seq, 3, 5);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::Homopolymer {
base, count, start, ..
} => {
assert_eq!(base, b'A');
assert_eq!(count, 7); assert_eq!(start, 4); }
_ => panic!("Expected Homopolymer result"),
}
let non_repeat_seq = b"ATGCXYZ";
let result = duplication_to_repeat(non_repeat_seq, 0, 3);
assert!(result.is_none());
}
#[test]
fn test_duplication_to_tandem_repeat() {
let ref_seq = b"AAAAAGCAGCAGCAGCAGCAGCAGCAGCAAAAA";
let result = duplication_to_repeat(ref_seq, 5, 8);
assert!(result.is_none(), "Single-copy dup should not become repeat");
let result = duplication_to_repeat(ref_seq, 5, 11);
assert!(result.is_some());
match result.unwrap() {
DupToRepeatResult::TandemRepeat {
unit,
count,
start,
end,
} => {
assert_eq!(unit, b"GCA");
assert_eq!(count, 10); assert_eq!(start, 6); assert_eq!(end, 29); }
_ => panic!("Expected TandemRepeat result"),
}
}
#[test]
fn test_count_tandem_repeats_basic() {
let ref_seq = b"GGGCATCATCATGGG";
let result = count_tandem_repeats(ref_seq, 3, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
let result = count_tandem_repeats(ref_seq, 6, b"CAT");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 3);
assert_eq!(start, 3);
assert_eq!(end, 12);
}
#[test]
fn test_count_tandem_repeats_single_base() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"A");
assert!(result.is_some());
let (count, start, end) = result.unwrap();
assert_eq!(count, 6);
assert_eq!(start, 3);
assert_eq!(end, 9);
}
#[test]
fn test_count_tandem_repeats_no_match() {
let ref_seq = b"GGGAAAAAAGGG";
let result = count_tandem_repeats(ref_seq, 5, b"XYZ");
assert!(result.is_none());
}
#[test]
fn test_normalize_repeat_to_deletion() {
let ref_seq = b"GGGCATCATCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 1);
match result {
RepeatNormResult::Deletion { start, end } => {
assert_eq!(end - start + 1, 9, "Should delete 9 bases (3 CATs)");
}
_ => panic!("Expected Deletion, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_to_duplication() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 3);
match result {
RepeatNormResult::Duplication {
start,
end,
sequence,
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(end - start + 1, 3, "Should duplicate 3 bases (1 CAT)");
}
_ => panic!("Expected Duplication, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_stays_repeat() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 5);
match result {
RepeatNormResult::Repeat {
count, sequence, ..
} => {
assert_eq!(sequence, b"CAT");
assert_eq!(count, 5);
}
_ => panic!("Expected Repeat, got {:?}", result),
}
}
#[test]
fn test_normalize_repeat_unchanged() {
let ref_seq = b"GGGCATCATGGG";
let result = normalize_repeat(ref_seq, 3, b"CAT", 2);
assert!(matches!(result, RepeatNormResult::Unchanged));
}
#[test]
fn test_complement() {
assert_eq!(complement(b'A'), b'T');
assert_eq!(complement(b'T'), b'A');
assert_eq!(complement(b'G'), b'C');
assert_eq!(complement(b'C'), b'G');
assert_eq!(complement(b'N'), b'N'); }
#[test]
fn test_shorten_inversion_basic() {
let seq = b"ATGCAT";
let result = shorten_inversion(seq, 0, 6);
assert!(
result.is_none(),
"Fully complementary inversion should become identity"
);
}
#[test]
fn test_shorten_inversion_partial() {
let seq = b"ATGGAT";
let result = shorten_inversion(seq, 0, 6);
assert!(result.is_some());
let (s, e) = result.unwrap();
assert_eq!(s, 2);
assert_eq!(e, 4);
}
#[test]
fn test_shorten_inversion_no_change() {
let seq = b"GGCC";
let result = shorten_inversion(seq, 0, 4);
assert!(result.is_none());
let seq2 = b"GATT";
let result2 = shorten_inversion(seq2, 0, 4);
assert!(result2.is_some());
let (s, e) = result2.unwrap();
assert_eq!(s, 0);
assert_eq!(e, 4);
}
#[test]
fn test_canonicalize_deletion_with_length() {
use crate::hgvs::edit::Sequence;
use std::str::FromStr;
let edit = NaEdit::Deletion {
sequence: None,
length: Some(12),
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Deletion {
sequence: None,
length: None
}
));
let edit = NaEdit::Deletion {
sequence: Some(Sequence::from_str("ATG").unwrap()),
length: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Deletion {
sequence: None,
length: None
}
));
}
#[test]
fn test_canonicalize_duplication_with_length() {
use crate::hgvs::edit::Sequence;
use std::str::FromStr;
let edit = NaEdit::Duplication {
sequence: None,
length: Some(12),
uncertain_extent: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Duplication {
sequence: None,
length: None,
..
}
));
let edit = NaEdit::Duplication {
sequence: Some(Sequence::from_str("ATG").unwrap()),
length: None,
uncertain_extent: None,
};
let canonical = canonicalize_edit(&edit);
assert!(matches!(
canonical,
NaEdit::Duplication {
sequence: None,
length: None,
..
}
));
}
#[test]
fn test_should_canonicalize() {
assert!(should_canonicalize(&NaEdit::Deletion {
sequence: None,
length: Some(12)
}));
assert!(!should_canonicalize(&NaEdit::Deletion {
sequence: None,
length: None
}));
use crate::hgvs::edit::Base;
assert!(!should_canonicalize(&NaEdit::Substitution {
reference: Base::A,
alternative: Base::G
}));
}
}