use crate::coords::index_to_hgvs_pos;
#[derive(Debug, Clone, PartialEq)]
pub struct RawEdit {
pub ref_start: u64,
pub ref_end: u64,
pub obs_start: u64,
pub obs_end: u64,
pub ref_seq: String,
pub obs_seq: String,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EditOp {
Match,
Replace,
Insert,
Delete,
}
pub fn align_sequences(reference: &str, observed: &str) -> Vec<RawEdit> {
if reference == observed {
return Vec::new();
}
let ref_bytes = reference.as_bytes();
let obs_bytes = observed.as_bytes();
find_edits(ref_bytes, obs_bytes)
}
fn find_edits(reference: &[u8], observed: &[u8]) -> Vec<RawEdit> {
let mut edits = Vec::new();
let mut ref_pos = 0usize;
let mut obs_pos = 0usize;
while ref_pos < reference.len() || obs_pos < observed.len() {
while ref_pos < reference.len()
&& obs_pos < observed.len()
&& reference[ref_pos] == observed[obs_pos]
{
ref_pos += 1;
obs_pos += 1;
}
if ref_pos >= reference.len() && obs_pos >= observed.len() {
break;
}
let edit_start_ref = ref_pos;
let edit_start_obs = obs_pos;
let (ref_end, obs_end) = find_resync_point(reference, observed, ref_pos, obs_pos);
let ref_seq: String = reference[edit_start_ref..ref_end]
.iter()
.map(|&b| b as char)
.collect();
let obs_seq: String = observed[edit_start_obs..obs_end]
.iter()
.map(|&b| b as char)
.collect();
let (norm_ref_start, norm_ref_end, norm_ref_seq, norm_obs_seq) =
normalize_3prime(reference, edit_start_ref, ref_end, &ref_seq, &obs_seq);
edits.push(RawEdit {
ref_start: index_to_hgvs_pos(norm_ref_start), ref_end: if norm_ref_end > norm_ref_start {
norm_ref_end as u64 } else {
index_to_hgvs_pos(norm_ref_start) },
obs_start: index_to_hgvs_pos(edit_start_obs), obs_end: if obs_end > edit_start_obs {
obs_end as u64 } else {
index_to_hgvs_pos(edit_start_obs) },
ref_seq: norm_ref_seq,
obs_seq: norm_obs_seq,
});
ref_pos = ref_end;
obs_pos = obs_end;
}
edits
}
fn find_resync_point(
reference: &[u8],
observed: &[u8],
ref_start: usize,
obs_start: usize,
) -> (usize, usize) {
let ref_remaining = reference.len() - ref_start;
let obs_remaining = observed.len() - obs_start;
if ref_remaining > 0 && obs_remaining > 0 {
if ref_remaining >= 1 && obs_remaining >= 1 {
let check_len = std::cmp::min(ref_remaining, obs_remaining);
for i in 1..=check_len {
if i < ref_remaining
&& i < obs_remaining
&& reference[ref_start + i] == observed[obs_start + i]
{
return (ref_start + i, obs_start + i);
}
}
}
for del_len in 1..=ref_remaining {
if ref_start + del_len < reference.len() {
let ref_after_del = ref_start + del_len;
if reference[ref_after_del..] == observed[obs_start..] {
return (ref_after_del, obs_start);
}
let remaining_ref = reference.len() - ref_after_del;
let remaining_obs = observed.len() - obs_start;
let check_len = std::cmp::min(remaining_ref, remaining_obs);
if check_len > 0
&& reference[ref_after_del..ref_after_del + check_len]
== observed[obs_start..obs_start + check_len]
{
return (ref_start + del_len, obs_start);
}
}
}
for ins_len in 1..=obs_remaining {
if obs_start + ins_len < observed.len() {
let obs_after_ins = obs_start + ins_len;
if reference[ref_start..] == observed[obs_after_ins..] {
return (ref_start, obs_after_ins);
}
let remaining_ref = reference.len() - ref_start;
let remaining_obs = observed.len() - obs_after_ins;
let check_len = std::cmp::min(remaining_ref, remaining_obs);
if check_len > 0
&& reference[ref_start..ref_start + check_len]
== observed[obs_after_ins..obs_after_ins + check_len]
{
return (ref_start, obs_start + ins_len);
}
}
}
}
(reference.len(), observed.len())
}
fn normalize_3prime(
reference: &[u8],
ref_start: usize,
ref_end: usize,
ref_seq: &str,
obs_seq: &str,
) -> (usize, usize, String, String) {
if !ref_seq.is_empty() && !obs_seq.is_empty() {
return (ref_start, ref_end, ref_seq.to_string(), obs_seq.to_string());
}
let seq_to_shift = if ref_seq.is_empty() { obs_seq } else { ref_seq };
if seq_to_shift.is_empty() {
return (ref_start, ref_end, ref_seq.to_string(), obs_seq.to_string());
}
let shift_bytes = seq_to_shift.as_bytes();
let mut new_start = ref_start;
let mut new_end = ref_end;
while new_end < reference.len() && reference[new_end] == shift_bytes[0] {
let first = shift_bytes[0];
if shift_bytes.iter().all(|&b| b == first) {
new_start += 1;
new_end += 1;
} else {
break;
}
}
if ref_seq.is_empty() {
(new_start, new_end, String::new(), obs_seq.to_string())
} else {
(new_start, new_end, ref_seq.to_string(), String::new())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_identical_sequences() {
let edits = align_sequences("ATGC", "ATGC");
assert!(edits.is_empty());
}
#[test]
fn test_single_substitution() {
let edits = align_sequences("ATGC", "ATAC");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_start, 3);
assert_eq!(edits[0].ref_seq, "G");
assert_eq!(edits[0].obs_seq, "A");
}
#[test]
fn test_single_deletion() {
let edits = align_sequences("ATGC", "ATC");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "G");
assert_eq!(edits[0].obs_seq, "");
}
#[test]
fn test_single_insertion() {
let edits = align_sequences("ATGC", "ATGGC");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "");
assert_eq!(edits[0].obs_seq, "G");
}
#[test]
fn test_multi_base_deletion() {
let edits = align_sequences("ATGCAT", "ATAT");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "GC");
assert_eq!(edits[0].obs_seq, "");
}
#[test]
fn test_delins() {
let edits = align_sequences("ATGC", "ATTC");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "G");
assert_eq!(edits[0].obs_seq, "T");
}
#[test]
fn test_multiple_substitutions() {
let edits = align_sequences("ATGCAT", "CTGCGT");
assert_eq!(edits.len(), 2);
assert_eq!(edits[0].ref_start, 1);
assert_eq!(edits[0].ref_seq, "A");
assert_eq!(edits[0].obs_seq, "C");
assert_eq!(edits[1].ref_seq, "A");
assert_eq!(edits[1].obs_seq, "G");
}
#[test]
fn test_complete_replacement() {
let edits = align_sequences("AAAA", "TTTT");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "AAAA");
assert_eq!(edits[0].obs_seq, "TTTT");
}
#[test]
fn test_insertion_at_end() {
let edits = align_sequences("ATG", "ATGC");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "");
assert_eq!(edits[0].obs_seq, "C");
}
#[test]
fn test_deletion_at_end() {
let edits = align_sequences("ATGC", "ATG");
assert_eq!(edits.len(), 1);
assert_eq!(edits[0].ref_seq, "C");
assert_eq!(edits[0].obs_seq, "");
}
}