sdiff/
lib.rs

1//! Find the differences of two sequences.
2//!
3//! A diff function that finds the longest common subsequence (LCS). The output
4//! can easily be transformed to a shortest edit script (SES).
5//!
6//! The implementation is base on the [difference algorithm by Eugene W. Myers].
7//!
8//! [difference algorithm by Eugene W. Myers]: http://www.xmailserver.org/diff2.pdf
9
10#![cfg_attr(not(feature = "std"), no_std)]
11
12#[cfg(not(feature = "std"))]
13mod std {
14    extern crate alloc;
15    pub use alloc::*;
16    pub use core::*;
17}
18
19#[cfg(feature = "std")]
20mod std {
21    pub use std::*;
22}
23
24use crate::std::{
25    boxed::Box,
26    ops::{Index, IndexMut},
27    vec,
28    vec::Vec,
29};
30
31// workaround for false positive 'unused extern crate' warnings until
32// Rust issue [#95513](https://github.com/rust-lang/rust/issues/95513) is fixed
33#[cfg(test)]
34mod dummy_extern_uses {
35    use proptest as _;
36}
37
38/// Max length of the sequences that is supported.
39#[must_use]
40pub fn max_sequence_length() -> usize {
41    Trace::max_sequence_length()
42}
43
44/// Find the common subsequences and differences between two strings.
45///
46/// Each of the two strings must not be longer than the max supported length
47/// [`max_sequence_length()`].
48#[must_use]
49pub fn diff_str(left: &str, right: &str) -> Vec<Diff> {
50    diff(
51        &left.chars().collect::<Vec<_>>(),
52        &right.chars().collect::<Vec<_>>(),
53    )
54}
55
56/// Find the common subsequences and differences between two slices.
57///
58/// Each of the two slices must not be longer than the max supported length
59/// [`max_sequence_length()`].
60#[must_use]
61pub fn diff<T>(left: &[T], right: &[T]) -> Vec<Diff>
62where
63    T: PartialEq,
64{
65    let trace = find_shortest_trace(left, right);
66    list_diffs(left, right, &trace)
67}
68
69/// A subsequence that is present in either of two sequences or in both.
70#[derive(Debug, Clone, Copy, PartialEq, Eq)]
71pub enum Diff {
72    /// A subsequence that is only present in the left sequence. It starts at
73    /// the specified index [`Diff::Left::index`] into the left sequence and
74    /// has a length of [`Diff::Left::length`].
75    ///
76    /// This is equivalent to a 'remove' in an edit script.
77    Left {
78        /// The index into the left sequence where the subsequence starts.
79        index: usize,
80        /// The length of the subsequence.
81        length: usize,
82    },
83
84    /// A common subsequence of both sequences. This subsequence is present in
85    /// both, the left and the right sequence.
86    Both {
87        /// The index into the left sequence where the common subsequence
88        /// starts.
89        left_index: usize,
90        /// The index into the right sequence where the common subsequence
91        /// starts.
92        right_index: usize,
93        /// The length of the common subsequence.
94        length: usize,
95    },
96
97    /// A subsequence that is only present in the right sequence. It starts at
98    /// the specified index [`Diff::Left::index`] into the right sequence and
99    /// has a length of [`Diff::Left::length`].
100    ///
101    /// This is equivalent to an 'insert' in an edit script.
102    Right {
103        /// The index into the right sequence where the subsequence starts.
104        index: usize,
105        /// The length of the subsequence.
106        length: usize,
107    },
108}
109
110/// The shortest trace found in the edit space.
111///
112/// The index *k* is calculated as *k = x - y*. *d* is the depth in the graph
113/// that is examined. The values stored in the matrix are the best *x* value
114/// that can be achieved at each point.
115///
116/// # Layout
117///
118/// ```text
119///     |                k
120///     |-5 -4 -3 -2 -1  0  1  2  3  4  5
121/// ----+---------------------------------
122///   0 |                o
123///   1 |             o  o  o
124/// d 2 |          o  o  o  o  o
125///   3 |       o  o  o  o  o  o  o
126///   4 |    o  o  o  o  o  o  o  o  o
127///   5 | o  o  o  o  o  o  o  o  o  o  o
128/// ```
129///
130/// # Example
131///
132/// Trace for diff of sequences 'ABCABBA' and 'CBABAC':
133///
134/// ```text
135///     |                k
136///     |-5 -4 -3 -2 -1  0  1  2  3  4  5
137/// ----+---------------------------------
138///   0 |                0
139///   1 |             0  0  1
140/// d 2 |          2  0  2  1  3
141///   3 |       3  2  4  2  5  3  5
142///   4 |       3  4  4  5  5  7  5  7
143///   5 |       3  4  5  5  7  7  5  7
144/// ```
145#[derive(Debug, Clone, PartialEq, Eq)]
146pub struct ShortestTrace {
147    data: Box<[isize]>,
148    len: isize,
149}
150
151impl ShortestTrace {
152    /// The length of the found shortest trace.
153    #[must_use]
154    #[allow(clippy::cast_sign_loss, clippy::len_without_is_empty)]
155    pub const fn len(&self) -> usize {
156        self.len as usize
157    }
158
159    /// A slice of the 2D-matrix containing the recorded trace.
160    #[must_use]
161    pub const fn data(&self) -> &[isize] {
162        &self.data
163    }
164
165    /// Get a shared reference to an element in the recorded trace.
166    #[must_use]
167    pub fn get(&self, d: isize, k: isize) -> &isize {
168        let idx = Trace::calculate_index(d, k);
169        &self.data[idx]
170    }
171
172    /// Get a mutable reference to an element in the recorded trace.
173    #[must_use]
174    pub fn get_mut(&mut self, d: isize, k: isize) -> &mut isize {
175        let idx = Trace::calculate_index(d, k);
176        &mut self.data[idx]
177    }
178}
179
180impl Index<(isize, isize)> for ShortestTrace {
181    type Output = isize;
182
183    fn index(&self, (d, k): (isize, isize)) -> &Self::Output {
184        self.get(d, k)
185    }
186}
187
188impl IndexMut<(isize, isize)> for ShortestTrace {
189    fn index_mut(&mut self, (d, k): (isize, isize)) -> &mut Self::Output {
190        self.get_mut(d, k)
191    }
192}
193
194/// Recorded path through the edit space.
195///
196/// The index *k* is calculated as *k = x - y*. *d* is the depth in the graph
197/// that is examined. The values stored in the matrix are the best *x* value
198/// that can be achieved at each point.
199///
200/// # Layout
201///
202/// ```text
203///     |                k
204///     |-5 -4 -3 -2 -1  0  1  2  3  4  5
205/// ----+---------------------------------
206///   0 |                o
207///   1 |             o  o  o
208/// d 2 |          o  o  o  o  o
209///   3 |       o  o  o  o  o  o  o
210///   4 |    o  o  o  o  o  o  o  o  o
211///   5 | o  o  o  o  o  o  o  o  o  o  o
212/// ```
213///
214/// # Example
215///
216/// Trace for diff of sequences 'ABCABBA' and 'CBABAC':
217///
218/// ```text
219///     |                k
220///     |-5 -4 -3 -2 -1  0  1  2  3  4  5
221/// ----+---------------------------------
222///   0 |                0
223///   1 |             0  0  1
224/// d 2 |          2  0  2  1  3
225///   3 |       3  2  4  2  5  3  5
226///   4 |       3  4  4  5  5  7  5  7
227///   5 |       3  4  5  5  7  7  5  7
228/// ```
229struct Trace {
230    data: Box<[isize]>,
231}
232
233impl Trace {
234    /// Max length of the sequences that is supported.
235    #[must_use]
236    #[allow(
237        clippy::cast_precision_loss,
238        clippy::cast_possible_truncation,
239        clippy::cast_sign_loss
240    )]
241    pub fn max_sequence_length() -> usize {
242        2 * (libm::sqrt(isize::MAX as f64) as usize - 2)
243    }
244
245    /// Constructs a new `Trace` with pre-allocated slots.
246    ///
247    /// * *d* is iterated from *0* to max depth
248    /// * For each value of *d* we need *1 + d* slots
249    /// * sum of integers is *n * (n + 1) / 2*
250    /// * *k* is iterated from *-d* to *+d* on every other.
251    pub fn new(left_len: usize, right_len: usize) -> Self {
252        let max_sequence_length = Self::max_sequence_length();
253        assert!(
254            left_len <= max_sequence_length,
255            "the left sequence is longer than the max supported length of {max_sequence_length}",
256        );
257        assert!(
258            right_len <= max_sequence_length,
259            "the right sequence is longer than the max supported length of {max_sequence_length}",
260        );
261
262        let max_depth = left_len + right_len;
263        let num_slots = (max_depth + 1) * (max_depth + 2) / 2;
264
265        Self {
266            data: vec![0; num_slots].into(),
267        }
268    }
269
270    /// Calculates the index into the internal matrix for *(d, k)*.
271    #[inline]
272    #[allow(clippy::cast_sign_loss)]
273    fn calculate_index(d: isize, k: isize) -> usize {
274        debug_assert!(k >= -d && k <= d, "invalid index in matrix {:?}", (d, k));
275        let k_offset = d * (d + 1) / 2;
276        // *k* goes from *-d* to *d* so we need to map [-d, d] -> [0, 2d]
277        let unsigned_k = k + d;
278        (unsigned_k / 2 + k_offset) as usize
279    }
280
281    #[must_use]
282    pub fn get(&self, d: isize, k: isize) -> &isize {
283        let idx = Self::calculate_index(d, k);
284        &self.data[idx]
285    }
286
287    #[must_use]
288    pub fn get_mut(&mut self, d: isize, k: isize) -> &mut isize {
289        let idx = Self::calculate_index(d, k);
290        &mut self.data[idx]
291    }
292}
293
294impl Index<(isize, isize)> for Trace {
295    type Output = isize;
296
297    fn index(&self, (d, k): (isize, isize)) -> &Self::Output {
298        self.get(d, k)
299    }
300}
301
302impl IndexMut<(isize, isize)> for Trace {
303    fn index_mut(&mut self, (d, k): (isize, isize)) -> &mut Self::Output {
304        self.get_mut(d, k)
305    }
306}
307
308/// Find the shortest path from *(0,0)* till the end of the edit graph.
309#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
310fn find_shortest_trace<T>(left: &[T], right: &[T]) -> ShortestTrace
311where
312    T: PartialEq,
313{
314    let left_len = left.len();
315    let right_len = right.len();
316
317    let max_depth = left_len + right_len;
318
319    let mut trace = Trace::new(left_len, right_len);
320
321    let max_depth = max_depth as isize;
322    let left_len = left_len as isize;
323    let right_len = right_len as isize;
324
325    for d in 0..=max_depth {
326        for k in (-d..=d).step_by(2) {
327            let mut x = if d == 0 {
328                0
329            } else if k == -d {
330                trace[(d - 1, k + 1)]
331            } else if k == d {
332                trace[(d - 1, k - 1)] + 1
333            } else {
334                let left = trace[(d - 1, k - 1)];
335                let right = trace[(d - 1, k + 1)];
336                if left < right {
337                    right
338                } else {
339                    left + 1
340                }
341            };
342
343            let mut y = x - k;
344            debug_assert!(
345                y >= 0,
346                "y should always be greater than or equal to 0, but is: {y:?}"
347            );
348
349            #[allow(clippy::suspicious_operation_groupings)]
350            while x < left_len && y < right_len && left[x as usize] == right[y as usize] {
351                x += 1;
352                y += 1;
353            }
354
355            trace[(d, k)] = x;
356
357            if x >= left_len && y >= right_len {
358                return ShortestTrace {
359                    data: trace.data,
360                    len: d,
361                };
362            }
363        }
364    }
365
366    panic!("length of a trace is longer than the maximum, which is `left.len() + right.len()`")
367}
368
369/// List common subsequences and differences between two sequences by
370/// backtracking the given trace.
371#[allow(clippy::cast_possible_wrap)]
372fn list_diffs<T>(left: &[T], right: &[T], trace: &ShortestTrace) -> Vec<Diff> {
373    if left.len() + right.len() == 0 {
374        return vec![Diff::Both {
375            left_index: 0,
376            right_index: 0,
377            length: 0,
378        }];
379    }
380
381    let mut x = left.len() as isize;
382    let mut y = right.len() as isize;
383
384    let mut diffs = Vec::new();
385
386    for d in (0..=trace.len).rev() {
387        let k = x - y;
388
389        let prev_k = if d == 0 {
390            0
391        } else if k == -d {
392            k + 1
393        } else if k == d {
394            k - 1
395        } else {
396            let left = trace[(d - 1, k - 1)];
397            let right = trace[(d - 1, k + 1)];
398            if left < right {
399                k + 1
400            } else {
401                k - 1
402            }
403        };
404
405        let prev_x = if d == 0 { 0 } else { trace[(d - 1, prev_k)] };
406        let prev_y = prev_x - prev_k;
407
408        while x > prev_x && y > prev_y {
409            x -= 1;
410            y -= 1;
411            if y < 0 {
412                y = 0;
413            }
414            if let Some(Diff::Both {
415                left_index,
416                right_index,
417                length,
418            }) = diffs.last_mut()
419            {
420                *left_index -= 1;
421                *right_index -= 1;
422                *length += 1;
423            } else {
424                #[allow(clippy::cast_sign_loss)]
425                diffs.push(Diff::Both {
426                    left_index: x as usize,
427                    right_index: y as usize,
428                    length: 1,
429                });
430            }
431        }
432
433        if d > 0 {
434            if prev_y == y {
435                if let Some(Diff::Left { index, length }) = diffs.last_mut() {
436                    *index -= 1;
437                    *length += 1;
438                } else {
439                    #[allow(clippy::cast_sign_loss)]
440                    diffs.push(Diff::Left {
441                        index: prev_x as usize,
442                        length: 1,
443                    });
444                }
445            } else if prev_x == x {
446                if let Some(Diff::Right { index, length }) = diffs.last_mut() {
447                    *index -= 1;
448                    *length += 1;
449                } else {
450                    #[allow(clippy::cast_sign_loss)]
451                    diffs.push(Diff::Right {
452                        index: prev_y as usize,
453                        length: 1,
454                    });
455                }
456            } else {
457                unreachable!("we should not come here!")
458            }
459        }
460
461        x = prev_x;
462        y = prev_y;
463    }
464
465    diffs.reverse();
466    diffs
467}
468
469#[cfg(test)]
470mod tests;