use crate::error::{SeqError, SeqResult};
const WORD_BITS: usize = 64;
fn build_peq(pattern: &[u8], blocks: usize) -> Vec<u64> {
let mut peq = vec![0u64; 256 * blocks];
for (j, &c) in pattern.iter().enumerate() {
let block = j / WORD_BITS;
let bit = j % WORD_BITS;
peq[usize::from(c) * blocks + block] |= 1u64 << bit;
}
peq
}
fn single_word(pattern: &[u8], text: &[u8]) -> usize {
let m = pattern.len();
let peq = build_peq(pattern, 1);
let mut vp: u64 = if m == WORD_BITS {
u64::MAX
} else {
(1u64 << m) - 1
};
let mut vn: u64 = 0;
let mut score = m;
let high_bit: u64 = 1u64 << (m - 1);
for &c in text {
let eq = peq[usize::from(c)];
let xv = eq | vn;
let xh = (((eq & vp).wrapping_add(vp)) ^ vp) | eq;
let mut ph = vn | !(xh | vp);
let mut mh = vp & xh;
if ph & high_bit != 0 {
score += 1;
} else if mh & high_bit != 0 {
score -= 1;
}
ph = (ph << 1) | 1;
mh <<= 1;
vp = mh | !(xv | ph);
vn = ph & xv;
}
score
}
fn blocked(pattern: &[u8], text: &[u8]) -> usize {
let m = pattern.len();
let blocks = m.div_ceil(WORD_BITS);
let peq = build_peq(pattern, blocks);
let mut vp = vec![u64::MAX; blocks];
let mut vn = vec![0u64; blocks];
let last_bits = m - (blocks - 1) * WORD_BITS;
let high_bit: u64 = 1u64 << (last_bits - 1);
let mut score = m;
for &c in text {
let mut hp: u64 = 1;
let mut hn: u64 = 0;
for b in 0..blocks {
let eq = peq[usize::from(c) * blocks + b];
let eq = eq | hn;
let vp_b = vp[b];
let vn_b = vn[b];
let xv = eq | vn_b;
let xh = (((eq & vp_b).wrapping_add(vp_b)) ^ vp_b) | eq;
let mut ph = vn_b | !(xh | vp_b);
let mut mh = vp_b & xh;
let ph_out = (ph >> (WORD_BITS - 1)) & 1;
let mh_out = (mh >> (WORD_BITS - 1)) & 1;
if b + 1 == blocks {
if ph & high_bit != 0 {
score += 1;
} else if mh & high_bit != 0 {
score -= 1;
}
}
ph = (ph << 1) | hp;
mh = (mh << 1) | hn;
vp[b] = mh | !(xv | ph);
vn[b] = ph & xv;
hp = ph_out;
hn = mh_out;
}
}
score
}
pub fn myers_edit_distance(pattern: &[u8], text: &[u8]) -> usize {
let (short, long) = if pattern.len() <= text.len() {
(pattern, text)
} else {
(text, pattern)
};
let m = short.len();
let n = long.len();
if m == 0 {
return n;
}
if n == 0 {
return m;
}
if m <= WORD_BITS {
single_word(short, long)
} else {
blocked(short, long)
}
}
pub fn myers_edit_distance_checked(pattern: &[u8], text: &[u8]) -> SeqResult<usize> {
pattern.len().checked_add(text.len()).ok_or_else(|| {
SeqError::InvalidConfiguration("combined input length overflows usize".to_string())
})?;
Ok(myers_edit_distance(pattern, text))
}
pub fn myers_distance_str(a: &str, b: &str) -> usize {
myers_edit_distance(a.as_bytes(), b.as_bytes())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::handle::LcgRng;
use crate::metrics::metrics::edit_distance;
fn dp_levenshtein(a: &[u8], b: &[u8]) -> usize {
let m = a.len();
let n = b.len();
let mut prev: Vec<usize> = (0..=n).collect();
let mut curr = vec![0usize; n + 1];
for i in 1..=m {
curr[0] = i;
for j in 1..=n {
let cost = usize::from(a[i - 1] != b[j - 1]);
curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
}
std::mem::swap(&mut prev, &mut curr);
}
prev[n]
}
fn random_bytes(rng: &mut LcgRng, alphabet: &[u8], len: usize) -> Vec<u8> {
(0..len)
.map(|_| alphabet[rng.next_usize(alphabet.len())])
.collect()
}
#[test]
fn matches_reference_dp_random_and_edge_cases() {
let explicit: &[(&[u8], &[u8], usize)] = &[
(b"kitten", b"sitting", 3),
(b"flaw", b"lawn", 2),
(b"", b"", 0),
(b"abc", b"", 3),
(b"", b"abc", 3),
(b"book", b"back", 2),
];
for &(a, b, expected) in explicit {
let got = myers_edit_distance(a, b);
assert_eq!(got, expected, "explicit pair {a:?} vs {b:?}");
assert_eq!(got, edit_distance(a, b), "vs crate DP for {a:?} vs {b:?}");
assert_eq!(got, dp_levenshtein(a, b), "vs local DP for {a:?} vs {b:?}");
}
let mut rng = LcgRng::new(42);
let alphabet = b"ACGT";
for _ in 0..200 {
let la = rng.next_usize(40);
let lb = rng.next_usize(40);
let a = random_bytes(&mut rng, alphabet, la);
let b = random_bytes(&mut rng, alphabet, lb);
let got = myers_edit_distance(&a, &b);
assert_eq!(got, edit_distance(&a, &b), "crate DP: {a:?} vs {b:?}");
assert_eq!(got, dp_levenshtein(&a, &b), "local DP: {a:?} vs {b:?}");
assert_eq!(got, myers_edit_distance(&b, &a), "symmetry: {a:?} vs {b:?}");
}
}
#[test]
fn identical_strings_are_zero() {
for s in [b"abc".as_slice(), b"hello world", b"", b"AAAAAAAA"] {
assert_eq!(myers_edit_distance(s, s), 0);
}
}
#[test]
fn single_edits_cost_one() {
assert_eq!(myers_edit_distance(b"abc", b"abd"), 1, "substitution");
assert_eq!(myers_edit_distance(b"abc", b"abxc"), 1, "insertion");
assert_eq!(myers_edit_distance(b"abc", b"ac"), 1, "deletion");
}
#[test]
fn empty_inputs() {
assert_eq!(myers_edit_distance(b"", b"hello"), 5);
assert_eq!(myers_edit_distance(b"hello", b""), 5);
assert_eq!(myers_edit_distance(b"", b""), 0);
}
#[test]
fn blocked_path_matches_reference_dp() {
let mut rng = LcgRng::new(7);
let alphabet = b"ACGT";
let pattern = random_bytes(&mut rng, alphabet, 100);
let mut mutated = pattern.clone();
for pos in [3usize, 17, 41, 68, 95] {
let cur = mutated[pos];
let next = alphabet.iter().copied().find(|&x| x != cur).unwrap_or(b'X');
mutated[pos] = next;
}
let got = myers_edit_distance(&pattern, &mutated);
assert_eq!(got, 5, "five substitutions on length-100 pattern");
assert_eq!(got, edit_distance(&pattern, &mutated));
assert_eq!(got, dp_levenshtein(&pattern, &mutated));
let mut deleted = pattern.clone();
for &pos in [80usize, 50, 10].iter() {
deleted.remove(pos);
}
let got_del = myers_edit_distance(&pattern, &deleted);
assert_eq!(got_del, edit_distance(&pattern, &deleted));
assert_eq!(got_del, dp_levenshtein(&pattern, &deleted));
assert_eq!(got_del, 3, "three deletions");
for _ in 0..40 {
let la = 64 + rng.next_usize(64); let lb = 64 + rng.next_usize(64);
let a = random_bytes(&mut rng, alphabet, la);
let b = random_bytes(&mut rng, alphabet, lb);
let got = myers_edit_distance(&a, &b);
assert_eq!(got, edit_distance(&a, &b), "long random crate DP");
assert_eq!(got, dp_levenshtein(&a, &b), "long random local DP");
assert_eq!(got, myers_edit_distance(&b, &a), "long random symmetry");
}
for &len in &[63usize, 64, 65, 127, 128, 129, 192, 200] {
let a = random_bytes(&mut rng, alphabet, len);
let mut b = a.clone();
if !b.is_empty() {
let cur = b[len / 2];
b[len / 2] = if cur == b'A' { b'C' } else { b'A' };
}
let got = myers_edit_distance(&a, &b);
assert_eq!(got, edit_distance(&a, &b), "boundary len {len}");
assert_eq!(got, dp_levenshtein(&a, &b), "boundary len {len} local");
}
}
#[test]
fn transposition_costs_two() {
assert_eq!(myers_edit_distance(b"ab", b"ba"), 2);
assert_eq!(edit_distance(b"ab".as_slice(), b"ba".as_slice()), 2);
assert_eq!(myers_edit_distance(b"converse", b"covnerse"), 2);
assert_eq!(
myers_edit_distance(b"converse", b"covnerse"),
edit_distance(b"converse".as_slice(), b"covnerse".as_slice())
);
}
#[test]
fn prefix_suffix_relationships() {
assert_eq!(myers_edit_distance(b"hello", b"hello world"), 6);
assert_eq!(
myers_edit_distance(b"hello", b"hello world"),
edit_distance(b"hello".as_slice(), b"hello world".as_slice())
);
let base = b"abcdefghij";
for k in 0..=base.len() {
let prefix = &base[..k];
assert_eq!(myers_edit_distance(base, prefix), base.len() - k);
let suffix = &base[k..];
assert_eq!(myers_edit_distance(base, suffix), k);
}
}
#[test]
fn wrappers_agree() {
assert_eq!(myers_distance_str("kitten", "sitting"), 3);
assert_eq!(myers_distance_str("flaw", "lawn"), 2);
assert_eq!(
myers_edit_distance_checked(b"book", b"back").expect("never fails"),
2
);
}
}