sdiff/lib.rs
1//! Find the differences of two sequences.
2//!
3//! A diff function that finds the longest common subsequence (LCS). The output
4//! can easily be transformed to a shortest edit script (SES).
5//!
6//! The implementation is base on the [difference algorithm by Eugene W. Myers].
7//!
8//! [difference algorithm by Eugene W. Myers]: http://www.xmailserver.org/diff2.pdf
9
10#![cfg_attr(not(feature = "std"), no_std)]
11
12#[cfg(not(feature = "std"))]
13mod std {
14 extern crate alloc;
15 pub use alloc::*;
16 pub use core::*;
17}
18
19#[cfg(feature = "std")]
20mod std {
21 pub use std::*;
22}
23
24use crate::std::{
25 boxed::Box,
26 ops::{Index, IndexMut},
27 vec,
28 vec::Vec,
29};
30
31// workaround for false positive 'unused extern crate' warnings until
32// Rust issue [#95513](https://github.com/rust-lang/rust/issues/95513) is fixed
33#[cfg(test)]
34mod dummy_extern_uses {
35 use proptest as _;
36}
37
38/// Max length of the sequences that is supported.
39#[must_use]
40pub fn max_sequence_length() -> usize {
41 Trace::max_sequence_length()
42}
43
44/// Find the common subsequences and differences between two strings.
45///
46/// Each of the two strings must not be longer than the max supported length
47/// [`max_sequence_length()`].
48#[must_use]
49pub fn diff_str(left: &str, right: &str) -> Vec<Diff> {
50 diff(
51 &left.chars().collect::<Vec<_>>(),
52 &right.chars().collect::<Vec<_>>(),
53 )
54}
55
56/// Find the common subsequences and differences between two slices.
57///
58/// Each of the two slices must not be longer than the max supported length
59/// [`max_sequence_length()`].
60#[must_use]
61pub fn diff<T>(left: &[T], right: &[T]) -> Vec<Diff>
62where
63 T: PartialEq,
64{
65 let trace = find_shortest_trace(left, right);
66 list_diffs(left, right, &trace)
67}
68
69/// A subsequence that is present in either of two sequences or in both.
70#[derive(Debug, Clone, Copy, PartialEq, Eq)]
71pub enum Diff {
72 /// A subsequence that is only present in the left sequence. It starts at
73 /// the specified index [`Diff::Left::index`] into the left sequence and
74 /// has a length of [`Diff::Left::length`].
75 ///
76 /// This is equivalent to a 'remove' in an edit script.
77 Left {
78 /// The index into the left sequence where the subsequence starts.
79 index: usize,
80 /// The length of the subsequence.
81 length: usize,
82 },
83
84 /// A common subsequence of both sequences. This subsequence is present in
85 /// both, the left and the right sequence.
86 Both {
87 /// The index into the left sequence where the common subsequence
88 /// starts.
89 left_index: usize,
90 /// The index into the right sequence where the common subsequence
91 /// starts.
92 right_index: usize,
93 /// The length of the common subsequence.
94 length: usize,
95 },
96
97 /// A subsequence that is only present in the right sequence. It starts at
98 /// the specified index [`Diff::Left::index`] into the right sequence and
99 /// has a length of [`Diff::Left::length`].
100 ///
101 /// This is equivalent to an 'insert' in an edit script.
102 Right {
103 /// The index into the right sequence where the subsequence starts.
104 index: usize,
105 /// The length of the subsequence.
106 length: usize,
107 },
108}
109
110/// The shortest trace found in the edit space.
111///
112/// The index *k* is calculated as *k = x - y*. *d* is the depth in the graph
113/// that is examined. The values stored in the matrix are the best *x* value
114/// that can be achieved at each point.
115///
116/// # Layout
117///
118/// ```text
119/// | k
120/// |-5 -4 -3 -2 -1 0 1 2 3 4 5
121/// ----+---------------------------------
122/// 0 | o
123/// 1 | o o o
124/// d 2 | o o o o o
125/// 3 | o o o o o o o
126/// 4 | o o o o o o o o o
127/// 5 | o o o o o o o o o o o
128/// ```
129///
130/// # Example
131///
132/// Trace for diff of sequences 'ABCABBA' and 'CBABAC':
133///
134/// ```text
135/// | k
136/// |-5 -4 -3 -2 -1 0 1 2 3 4 5
137/// ----+---------------------------------
138/// 0 | 0
139/// 1 | 0 0 1
140/// d 2 | 2 0 2 1 3
141/// 3 | 3 2 4 2 5 3 5
142/// 4 | 3 4 4 5 5 7 5 7
143/// 5 | 3 4 5 5 7 7 5 7
144/// ```
145#[derive(Debug, Clone, PartialEq, Eq)]
146pub struct ShortestTrace {
147 data: Box<[isize]>,
148 len: isize,
149}
150
151impl ShortestTrace {
152 /// The length of the found shortest trace.
153 #[must_use]
154 #[allow(clippy::cast_sign_loss, clippy::len_without_is_empty)]
155 pub const fn len(&self) -> usize {
156 self.len as usize
157 }
158
159 /// A slice of the 2D-matrix containing the recorded trace.
160 #[must_use]
161 pub const fn data(&self) -> &[isize] {
162 &self.data
163 }
164
165 /// Get a shared reference to an element in the recorded trace.
166 #[must_use]
167 pub fn get(&self, d: isize, k: isize) -> &isize {
168 let idx = Trace::calculate_index(d, k);
169 &self.data[idx]
170 }
171
172 /// Get a mutable reference to an element in the recorded trace.
173 #[must_use]
174 pub fn get_mut(&mut self, d: isize, k: isize) -> &mut isize {
175 let idx = Trace::calculate_index(d, k);
176 &mut self.data[idx]
177 }
178}
179
180impl Index<(isize, isize)> for ShortestTrace {
181 type Output = isize;
182
183 fn index(&self, (d, k): (isize, isize)) -> &Self::Output {
184 self.get(d, k)
185 }
186}
187
188impl IndexMut<(isize, isize)> for ShortestTrace {
189 fn index_mut(&mut self, (d, k): (isize, isize)) -> &mut Self::Output {
190 self.get_mut(d, k)
191 }
192}
193
194/// Recorded path through the edit space.
195///
196/// The index *k* is calculated as *k = x - y*. *d* is the depth in the graph
197/// that is examined. The values stored in the matrix are the best *x* value
198/// that can be achieved at each point.
199///
200/// # Layout
201///
202/// ```text
203/// | k
204/// |-5 -4 -3 -2 -1 0 1 2 3 4 5
205/// ----+---------------------------------
206/// 0 | o
207/// 1 | o o o
208/// d 2 | o o o o o
209/// 3 | o o o o o o o
210/// 4 | o o o o o o o o o
211/// 5 | o o o o o o o o o o o
212/// ```
213///
214/// # Example
215///
216/// Trace for diff of sequences 'ABCABBA' and 'CBABAC':
217///
218/// ```text
219/// | k
220/// |-5 -4 -3 -2 -1 0 1 2 3 4 5
221/// ----+---------------------------------
222/// 0 | 0
223/// 1 | 0 0 1
224/// d 2 | 2 0 2 1 3
225/// 3 | 3 2 4 2 5 3 5
226/// 4 | 3 4 4 5 5 7 5 7
227/// 5 | 3 4 5 5 7 7 5 7
228/// ```
229struct Trace {
230 data: Box<[isize]>,
231}
232
233impl Trace {
234 /// Max length of the sequences that is supported.
235 #[must_use]
236 #[allow(
237 clippy::cast_precision_loss,
238 clippy::cast_possible_truncation,
239 clippy::cast_sign_loss
240 )]
241 pub fn max_sequence_length() -> usize {
242 2 * (libm::sqrt(isize::MAX as f64) as usize - 2)
243 }
244
245 /// Constructs a new `Trace` with pre-allocated slots.
246 ///
247 /// * *d* is iterated from *0* to max depth
248 /// * For each value of *d* we need *1 + d* slots
249 /// * sum of integers is *n * (n + 1) / 2*
250 /// * *k* is iterated from *-d* to *+d* on every other.
251 pub fn new(left_len: usize, right_len: usize) -> Self {
252 let max_sequence_length = Self::max_sequence_length();
253 assert!(
254 left_len <= max_sequence_length,
255 "the left sequence is longer than the max supported length of {max_sequence_length}",
256 );
257 assert!(
258 right_len <= max_sequence_length,
259 "the right sequence is longer than the max supported length of {max_sequence_length}",
260 );
261
262 let max_depth = left_len + right_len;
263 let num_slots = (max_depth + 1) * (max_depth + 2) / 2;
264
265 Self {
266 data: vec![0; num_slots].into(),
267 }
268 }
269
270 /// Calculates the index into the internal matrix for *(d, k)*.
271 #[inline]
272 #[allow(clippy::cast_sign_loss)]
273 fn calculate_index(d: isize, k: isize) -> usize {
274 debug_assert!(k >= -d && k <= d, "invalid index in matrix {:?}", (d, k));
275 let k_offset = d * (d + 1) / 2;
276 // *k* goes from *-d* to *d* so we need to map [-d, d] -> [0, 2d]
277 let unsigned_k = k + d;
278 (unsigned_k / 2 + k_offset) as usize
279 }
280
281 #[must_use]
282 pub fn get(&self, d: isize, k: isize) -> &isize {
283 let idx = Self::calculate_index(d, k);
284 &self.data[idx]
285 }
286
287 #[must_use]
288 pub fn get_mut(&mut self, d: isize, k: isize) -> &mut isize {
289 let idx = Self::calculate_index(d, k);
290 &mut self.data[idx]
291 }
292}
293
294impl Index<(isize, isize)> for Trace {
295 type Output = isize;
296
297 fn index(&self, (d, k): (isize, isize)) -> &Self::Output {
298 self.get(d, k)
299 }
300}
301
302impl IndexMut<(isize, isize)> for Trace {
303 fn index_mut(&mut self, (d, k): (isize, isize)) -> &mut Self::Output {
304 self.get_mut(d, k)
305 }
306}
307
308/// Find the shortest path from *(0,0)* till the end of the edit graph.
309#[allow(clippy::cast_possible_wrap, clippy::cast_sign_loss)]
310fn find_shortest_trace<T>(left: &[T], right: &[T]) -> ShortestTrace
311where
312 T: PartialEq,
313{
314 let left_len = left.len();
315 let right_len = right.len();
316
317 let max_depth = left_len + right_len;
318
319 let mut trace = Trace::new(left_len, right_len);
320
321 let max_depth = max_depth as isize;
322 let left_len = left_len as isize;
323 let right_len = right_len as isize;
324
325 for d in 0..=max_depth {
326 for k in (-d..=d).step_by(2) {
327 let mut x = if d == 0 {
328 0
329 } else if k == -d {
330 trace[(d - 1, k + 1)]
331 } else if k == d {
332 trace[(d - 1, k - 1)] + 1
333 } else {
334 let left = trace[(d - 1, k - 1)];
335 let right = trace[(d - 1, k + 1)];
336 if left < right {
337 right
338 } else {
339 left + 1
340 }
341 };
342
343 let mut y = x - k;
344 debug_assert!(
345 y >= 0,
346 "y should always be greater than or equal to 0, but is: {y:?}"
347 );
348
349 #[allow(clippy::suspicious_operation_groupings)]
350 while x < left_len && y < right_len && left[x as usize] == right[y as usize] {
351 x += 1;
352 y += 1;
353 }
354
355 trace[(d, k)] = x;
356
357 if x >= left_len && y >= right_len {
358 return ShortestTrace {
359 data: trace.data,
360 len: d,
361 };
362 }
363 }
364 }
365
366 panic!("length of a trace is longer than the maximum, which is `left.len() + right.len()`")
367}
368
369/// List common subsequences and differences between two sequences by
370/// backtracking the given trace.
371#[allow(clippy::cast_possible_wrap)]
372fn list_diffs<T>(left: &[T], right: &[T], trace: &ShortestTrace) -> Vec<Diff> {
373 if left.len() + right.len() == 0 {
374 return vec![Diff::Both {
375 left_index: 0,
376 right_index: 0,
377 length: 0,
378 }];
379 }
380
381 let mut x = left.len() as isize;
382 let mut y = right.len() as isize;
383
384 let mut diffs = Vec::new();
385
386 for d in (0..=trace.len).rev() {
387 let k = x - y;
388
389 let prev_k = if d == 0 {
390 0
391 } else if k == -d {
392 k + 1
393 } else if k == d {
394 k - 1
395 } else {
396 let left = trace[(d - 1, k - 1)];
397 let right = trace[(d - 1, k + 1)];
398 if left < right {
399 k + 1
400 } else {
401 k - 1
402 }
403 };
404
405 let prev_x = if d == 0 { 0 } else { trace[(d - 1, prev_k)] };
406 let prev_y = prev_x - prev_k;
407
408 while x > prev_x && y > prev_y {
409 x -= 1;
410 y -= 1;
411 if y < 0 {
412 y = 0;
413 }
414 if let Some(Diff::Both {
415 left_index,
416 right_index,
417 length,
418 }) = diffs.last_mut()
419 {
420 *left_index -= 1;
421 *right_index -= 1;
422 *length += 1;
423 } else {
424 #[allow(clippy::cast_sign_loss)]
425 diffs.push(Diff::Both {
426 left_index: x as usize,
427 right_index: y as usize,
428 length: 1,
429 });
430 }
431 }
432
433 if d > 0 {
434 if prev_y == y {
435 if let Some(Diff::Left { index, length }) = diffs.last_mut() {
436 *index -= 1;
437 *length += 1;
438 } else {
439 #[allow(clippy::cast_sign_loss)]
440 diffs.push(Diff::Left {
441 index: prev_x as usize,
442 length: 1,
443 });
444 }
445 } else if prev_x == x {
446 if let Some(Diff::Right { index, length }) = diffs.last_mut() {
447 *index -= 1;
448 *length += 1;
449 } else {
450 #[allow(clippy::cast_sign_loss)]
451 diffs.push(Diff::Right {
452 index: prev_y as usize,
453 length: 1,
454 });
455 }
456 } else {
457 unreachable!("we should not come here!")
458 }
459 }
460
461 x = prev_x;
462 y = prev_y;
463 }
464
465 diffs.reverse();
466 diffs
467}
468
469#[cfg(test)]
470mod tests;