use std::cmp::max;
use std::collections::VecDeque;
const DELETION_COST: usize = 2;
const INSERTION_COST: usize = 2;
const INITIAL_MISMATCH_PENALITY: usize = 1;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Operation {
NoOp,
Deletion,
Insertion,
}
use Operation::*;
#[derive(Clone, Debug)]
struct Cell {
parent: usize,
operation: Operation,
cost: usize,
}
#[derive(Debug)]
pub struct Alignment<'a> {
pub x: Vec<&'a str>,
pub y: Vec<&'a str>,
table: Vec<Cell>,
dim: [usize; 2],
}
impl<'a> Alignment<'a> {
pub fn new(x: Vec<&'a str>, y: Vec<&'a str>) -> Self {
let dim = [y.len() + 1, x.len() + 1];
let table = vec![
Cell {
parent: 0,
operation: NoOp,
cost: 0
};
dim[0] * dim[1]
];
let mut alignment = Self { x, y, table, dim };
alignment.fill();
alignment
}
pub fn fill(&mut self) {
for i in 1..self.dim[1] {
self.table[i] = Cell {
parent: 0,
operation: Deletion,
cost: i * DELETION_COST + INITIAL_MISMATCH_PENALITY,
};
}
for j in 1..self.dim[0] {
self.table[j * self.dim[1]] = Cell {
parent: 0,
operation: Insertion,
cost: j * INSERTION_COST + INITIAL_MISMATCH_PENALITY,
};
}
for (i, x_i) in self.x.iter().enumerate() {
for (j, y_j) in self.y.iter().enumerate() {
let (left, diag, up) =
(self.index(i, j + 1), self.index(i, j), self.index(i + 1, j));
let candidates = [
Cell {
parent: up,
operation: Insertion,
cost: self.mismatch_cost(up, INSERTION_COST),
},
Cell {
parent: left,
operation: Deletion,
cost: self.mismatch_cost(left, DELETION_COST),
},
Cell {
parent: diag,
operation: NoOp,
cost: if x_i == y_j {
self.table[diag].cost
} else {
usize::MAX
},
},
];
let index = self.index(i + 1, j + 1);
self.table[index] = candidates
.iter()
.min_by_key(|cell| cell.cost)
.unwrap()
.clone();
}
}
}
fn mismatch_cost(&self, parent: usize, basic_cost: usize) -> usize {
self.table[parent].cost
+ basic_cost
+ if self.table[parent].operation == NoOp {
INITIAL_MISMATCH_PENALITY
} else {
0
}
}
pub fn operations(&self) -> Vec<Operation> {
let mut ops = VecDeque::with_capacity(max(self.x.len(), self.y.len()));
let mut cell = &self.table[self.index(self.x.len(), self.y.len())];
loop {
ops.push_front(cell.operation);
if cell.parent == 0 {
break;
}
cell = &self.table[cell.parent];
}
Vec::from(ops)
}
pub fn coalesced_operations(&self) -> Vec<(Operation, usize)> {
run_length_encode(self.operations())
}
#[allow(dead_code)]
pub fn distance(&self) -> f64 {
let (numer, denom) = self.distance_parts();
(numer as f64) / (denom as f64)
}
#[allow(dead_code)]
pub fn distance_parts(&self) -> (usize, usize) {
let (mut numer, mut denom) = (0, 0);
for op in self.operations() {
if op != NoOp {
numer += 1;
}
denom += 1;
}
(numer, denom)
}
#[allow(dead_code)]
pub fn levenshtein_distance(&self) -> usize {
self.table[self.index(self.x.len(), self.y.len())].cost
}
fn index(&self, i: usize, j: usize) -> usize {
j * self.dim[1] + i
}
#[allow(dead_code)]
fn format_cell(&self, cell: &Cell) -> String {
let parent = &self.table[cell.parent];
let op = match cell.operation {
Deletion => "-",
Insertion => "+",
NoOp => ".",
};
format!("{}{}{}", parent.cost, op, cell.cost)
}
#[allow(dead_code)]
fn print(&self) {
println!("x: {:?}", self.x);
println!("y: {:?}", self.y);
println!();
print!(" ");
for j in 0..self.dim[1] {
print!("{} ", if j > 0 { self.x[j - 1] } else { " " })
}
println!();
for i in 0..self.dim[0] {
for j in 0..self.dim[1] {
if j == 0 {
print!("{} ", if i > 0 { self.y[i - 1] } else { " " })
}
let cell = &self.table[self.index(j, i)];
print!("{} ", self.format_cell(cell));
}
println!();
}
println!();
}
}
fn run_length_encode<T>(sequence: Vec<T>) -> Vec<(T, usize)>
where
T: Copy,
T: PartialEq,
{
let mut encoded = Vec::with_capacity(sequence.len());
if sequence.is_empty() {
return encoded;
}
let end = sequence.len();
let (mut i, mut j) = (0, 1);
let mut curr = &sequence[i];
loop {
if j == end || sequence[j] != *curr {
encoded.push((*curr, j - i));
if j == end {
return encoded;
} else {
curr = &sequence[j];
i = j;
}
}
j += 1;
}
}
#[cfg(test)]
mod tests {
use super::*;
use unicode_segmentation::UnicodeSegmentation;
#[test]
fn test_run_length_encode() {
assert_eq!(run_length_encode::<usize>(vec![]), vec![]);
assert_eq!(run_length_encode(vec![0]), vec![(0, 1)]);
assert_eq!(run_length_encode(vec!["0", "0"]), vec![("0", 2)]);
assert_eq!(
run_length_encode(vec![0, 0, 1, 2, 2, 2, 3, 4, 4, 4]),
vec![(0, 2), (1, 1), (2, 3), (3, 1), (4, 3)]
);
}
#[test]
fn test_0() {
TestCase {
before: "aaa",
after: "aba",
distance: 5,
parts: (2, 4),
operations: vec![NoOp, Deletion, Insertion, NoOp],
}
.run();
}
#[test]
fn test_0_nonascii() {
TestCase {
before: "ááb",
after: "áaa",
distance: 9,
parts: (4, 5),
operations: vec![NoOp, Deletion, Deletion, Insertion, Insertion],
}
.run();
}
#[test]
fn test_1() {
TestCase {
before: "kitten",
after: "sitting",
distance: 13,
parts: (5, 9),
operations: vec![
Deletion, Insertion, NoOp, NoOp, NoOp, Deletion, Insertion, NoOp, Insertion, ],
}
.run();
}
#[test]
fn test_2() {
TestCase {
before: "saturday",
after: "sunday",
distance: 10,
parts: (4, 9),
operations: vec![
NoOp, Deletion, Deletion, NoOp, Deletion, Insertion, NoOp, NoOp, NoOp, ],
}
.run();
}
#[test]
fn test_3() {
TestCase {
before: "ab",
after: "ba",
distance: 6,
parts: (2, 3),
operations: vec![
Deletion, NoOp, Insertion, ],
}
.run();
}
#[test]
fn test_4() {
TestCase {
before: "AABB",
after: "AB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, Deletion, Deletion, NoOp, ],
}
.run();
}
#[test]
fn test_5() {
TestCase {
before: "AB",
after: "AABB",
distance: 5,
parts: (2, 4),
operations: vec![
NoOp, Insertion, Insertion, NoOp, ],
}
.run();
}
#[test]
fn test_6() {
TestCase {
before: "AAABBB",
after: "ACB",
distance: 11,
parts: (5, 7),
operations: vec![
NoOp, Deletion, Deletion, Deletion, Deletion, Insertion, NoOp, ],
}
.run();
}
struct TestCase<'a> {
before: &'a str,
after: &'a str,
distance: usize,
parts: (usize, usize),
operations: Vec<Operation>,
}
impl<'a> TestCase<'a> {
pub fn run(&self) -> () {
self.assert_string_distance_parts();
assert_eq!(operations(self.before, self.after), self.operations);
}
fn assert_string_distance_parts(&self) {
self.assert_string_levenshtein_distance();
assert_eq!(string_distance_parts(self.before, self.after), self.parts);
assert_eq!(string_distance_parts(self.after, self.before), self.parts);
}
fn assert_string_levenshtein_distance(&self) {
assert_eq!(
string_levenshtein_distance(self.before, self.after),
self.distance
);
assert_eq!(
string_levenshtein_distance(self.after, self.before),
self.distance
);
}
}
fn string_distance_parts(x: &str, y: &str) -> (usize, usize) {
let (x, y) = (
x.graphemes(true).collect::<Vec<&str>>(),
y.graphemes(true).collect::<Vec<&str>>(),
);
Alignment::new(x, y).distance_parts()
}
fn string_levenshtein_distance(x: &str, y: &str) -> usize {
let (x, y) = (
x.graphemes(true).collect::<Vec<&str>>(),
y.graphemes(true).collect::<Vec<&str>>(),
);
Alignment::new(x, y).levenshtein_distance()
}
fn operations<'a>(x: &'a str, y: &'a str) -> Vec<Operation> {
let (x, y) = (
x.graphemes(true).collect::<Vec<&str>>(),
y.graphemes(true).collect::<Vec<&str>>(),
);
Alignment::new(x, y).operations()
}
}