levenshtein_diff/
edit.rs

1use std::cmp::min;
2use std::error::Error;
3use std::fmt;
4
5use crate::util::DistanceMatrix;
6
7/// Represents an error specific to working with the Levenshtein distance, or the generated
8/// distance matrix
9#[derive(Debug)]
10pub enum LevenshteinError {
11    // The supplied distance matrix is invalid
12    InvalidDistanceMatrixError,
13}
14
15impl fmt::Display for LevenshteinError {
16    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
17        let error = match self {
18            LevenshteinError::InvalidDistanceMatrixError => "Invalid matrix error",
19        };
20
21        write!(f, "{}", error)
22    }
23}
24
25impl Error for LevenshteinError {}
26
27/// Represents an Edit applied on a source sequence.
28#[derive(Clone, PartialEq)]
29pub enum Edit<T: PartialEq> {
30    Delete(usize),        // Delete item at index
31    Insert(usize, T),     // Insert item T at index
32    Substitute(usize, T), // Substitute item at index with T
33}
34
35/// Applies a sequence of edits on the source sequence, and returns a vector representing the
36/// target sequence.
37///
38/// # Arguments
39///
40/// * `source` - The source sequence
41/// * `edits` - A reference to a vector of edits of the same type as elements of source
42///
43/// # Examples
44///
45/// ```
46/// use levenshtein_diff as levenshtein;
47///
48/// let s1 = "FLOWER";
49/// let expected_s2 = "FOLLOWER";
50///
51/// let (_, matrix) = levenshtein::distance(s1.as_bytes(), expected_s2.as_bytes());
52///
53/// let edits = levenshtein::generate_edits(s1.as_bytes(), expected_s2.as_bytes(), &matrix).unwrap();
54///
55/// let target = levenshtein::apply_edits(s1.as_bytes(), &edits);
56///
57/// let s2 = match std::str::from_utf8(&target) {
58///     Ok(v) => v,
59///     Err(_) => panic!("Not a valid UTF-8 sequence!"),
60/// };
61///
62/// assert_eq!(s2, expected_s2);
63/// ```
64pub fn apply_edits<T: Clone + PartialEq>(source: &[T], edits: &[Edit<T>]) -> Vec<T> {
65    // Convert each item of source into Some(item)
66    let mut target_constructor: Vec<Option<T>> =
67        source.iter().map(|item| Some(item.clone())).collect();
68
69    let mut inserts = Vec::<Edit<T>>::with_capacity(source.len());
70
71    // We iterate in the reverse order because we want to populate the inserts vector in the
72    // reverse order of indices. This ensures that we don't need any operational transforms on the
73    // inserts.
74    for edit in edits.iter().rev() {
75        match edit {
76            Edit::Substitute(idx, val) => target_constructor[idx - 1] = Some(val.clone()),
77            Edit::Delete(idx) => target_constructor[idx - 1] = None,
78            Edit::Insert(idx, val) => inserts.push(Edit::Insert(*idx, val.clone())),
79        }
80    }
81
82    for i in &inserts {
83        if let Edit::Insert(idx, val) = i {
84            target_constructor.insert(*idx, Some(val.clone()));
85        }
86    }
87
88    let mut target = Vec::<T>::new();
89    for i in &target_constructor {
90        match i {
91            Some(val) => target.push(val.clone()),
92            None => (),
93        }
94    }
95
96    target
97}
98
99/// Generate a vector of edits that, when applied to the source sequence, transform it into the
100/// target sequence.
101///
102/// # Arguments
103///
104/// * `source` - The source sequence
105/// * `target` - The target sequence
106/// * `distances` - A reference to the `DistanceMatrix` for converting source to target
107///
108/// # Examples
109///
110/// ```
111/// use levenshtein_diff as levenshtein;
112///
113/// let s1 = "SATURDAY";
114/// let s2 = "SUNDAY";
115///
116/// let (_, matrix) = levenshtein::distance(s1.as_bytes(), s2.as_bytes());
117///
118/// // This can be used with the `apply_edits` function to transform source to target
119/// let edits = levenshtein::generate_edits(s1.as_bytes(), s2.as_bytes(), &matrix).unwrap();
120/// ```
121pub fn generate_edits<T: Clone + PartialEq>(
122    source: &[T],
123    target: &[T],
124    distances: &DistanceMatrix,
125) -> Result<Vec<Edit<T>>, LevenshteinError> {
126    let mut source_idx = source.len();
127    let mut target_idx = target.len();
128
129    if source_idx + 1 != distances.len() || target_idx + 1 != distances[0].len() {
130        return Err(LevenshteinError::InvalidDistanceMatrixError);
131    }
132
133    let mut edits = Vec::<Edit<T>>::new();
134
135    // When both source and target indices are 0, we have succesfully computed all the edits
136    // required to transform the source into the target
137    while source_idx != 0 || target_idx != 0 {
138        let current_item = distances[source_idx][target_idx];
139
140        // These represent the options we have: substitute, insert and delete
141        let substitute = if source_idx > 0 && target_idx > 0 {
142            distances[source_idx - 1][target_idx - 1]
143        } else {
144            usize::MAX
145        };
146
147        let delete = if source_idx > 0 {
148            distances[source_idx - 1][target_idx]
149        } else {
150            usize::MAX
151        };
152
153        let insert = if target_idx > 0 {
154            distances[source_idx][target_idx - 1]
155        } else {
156            usize::MAX
157        };
158
159        let min = min(min(insert, delete), substitute);
160
161        if min == current_item {
162            source_idx = source_idx - 1;
163            target_idx = target_idx - 1;
164        } else if min == current_item - 1 {
165            if min == insert {
166                // The edits are expected to be 1-indexed, but the slices obviously aren't
167                // Hence we do target_idx - 1 to access the right value
168                edits.push(Edit::Insert(source_idx, target[target_idx - 1].clone()));
169                target_idx = target_idx - 1;
170            } else if min == delete {
171                edits.push(Edit::Delete(source_idx));
172                source_idx = source_idx - 1;
173            } else if min == substitute {
174                edits.push(Edit::Substitute(source_idx, target[target_idx - 1].clone()));
175                source_idx = source_idx - 1;
176                target_idx = target_idx - 1;
177            } else {
178                return Err(LevenshteinError::InvalidDistanceMatrixError);
179            };
180        } else {
181            return Err(LevenshteinError::InvalidDistanceMatrixError);
182        };
183    }
184
185    Ok(edits)
186}
187
188#[cfg(test)]
189mod tests {
190    use crate::edit::*;
191
192    // Copied verbatim from
193    // https://stackoverflow.com/questions/29504514/whats-the-best-way-to-compare-2-vectors-or-strings-element-by-element
194    fn do_vecs_match<T: PartialEq>(a: &Vec<T>, b: &Vec<T>) -> bool {
195        let matching = a.iter().zip(b.iter()).filter(|&(a, b)| a == b).count();
196        matching == a.len() && matching == b.len()
197    }
198
199    #[test]
200    fn edit_list_is_correct() {
201        let s1 = "SATURDAY";
202        let s2 = "SUNDAY";
203
204        // This is the distance matrix for the strings
205        // SATURDAY and SUNDAY
206        let distances = vec![
207            vec![0, 1, 2, 3, 4, 5, 6],
208            vec![1, 0, 1, 2, 3, 4, 5],
209            vec![2, 1, 1, 2, 3, 3, 4],
210            vec![3, 2, 2, 2, 3, 4, 4],
211            vec![4, 3, 2, 3, 3, 4, 5],
212            vec![5, 4, 3, 3, 4, 4, 5],
213            vec![6, 5, 4, 4, 3, 4, 5],
214            vec![7, 6, 5, 5, 4, 3, 4],
215            vec![8, 7, 6, 6, 5, 4, 3],
216        ];
217
218        let expected_edits = vec![
219            Edit::<u8>::Substitute(5, 78),
220            Edit::<u8>::Delete(3),
221            Edit::<u8>::Delete(2),
222        ];
223
224        let edits = generate_edits(s1.as_bytes(), s2.as_bytes(), &distances).unwrap();
225
226        assert_eq!(do_vecs_match(&edits, &expected_edits), true);
227    }
228
229    #[test]
230    fn edits_are_applied_correctly() {
231        let s1 = "SATURDAY";
232        let expected_s2 = "SUNDAY";
233
234        // Edits that convert SATURDAY to SUNDAY
235        let mut edits = vec![
236            Edit::<u8>::Substitute(5, 78),
237            Edit::<u8>::Delete(3),
238            Edit::<u8>::Delete(2),
239        ];
240
241        let s2_bytes_vec = apply_edits(s1.as_bytes(), &mut edits);
242
243        let s2 = match std::str::from_utf8(&s2_bytes_vec) {
244            Ok(v) => v,
245            Err(_) => panic!("Not a valid UTF-8 sequence!"),
246        };
247
248        assert_eq!(s2, expected_s2);
249    }
250}