diffr_lib/
lib.rs

1//! Algorithms to compute diffs.
2//!
3//! This module implements various algorithms described in E. Myers
4//! paper: [An O(ND) Difference Algorithm and Its
5//! Variations](http://www.xmailserver.org/diff2.pdf).
6//!
//! The main entrypoint is `diff`, which computes the longest
//! common subsequence between two sequences of byte slices.
9
10use std::collections::hash_map::DefaultHasher;
11use std::convert::TryFrom;
12use std::fmt::Debug;
13use std::fmt::{Error as FmtErr, Formatter};
14use std::hash::{Hash, Hasher};
15
/// A span of bytes and a hash of the content it refers.
#[derive(Clone, Copy, Debug)]
pub struct HashedSpan {
    /// Start offset (inclusive) into the underlying byte buffer.
    pub lo: usize,
    /// End offset (exclusive) into the underlying byte buffer.
    pub hi: usize,
    /// Hash of the bytes in `lo..hi` (computed by `hash_slice`).
    pub hash: u64,
}
23
/// A wrapper around a token, optimized for equality comparison.
///
/// Equality (derived) compares both the precomputed hash and the bytes;
/// hashing feeds only the precomputed hash, so it stays consistent with
/// equality.
#[derive(PartialEq, Eq)]
pub struct HashedSlice<'a> {
    /// Precomputed hash of `data`.
    pub hash: u64,
    /// The token's bytes.
    pub data: &'a [u8],
}

impl<'a> Debug for HashedSlice<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), FmtErr> {
        // Render the bytes as (lossy) UTF-8 for readability.
        let text = String::from_utf8_lossy(self.data);
        f.debug_tuple("HashedSlice").field(&text).finish()
    }
}

impl<'a> Hash for HashedSlice<'a> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // The hash was computed once when tokenizing; just replay it.
        state.write_u64(self.hash)
    }
}
43
/// A tokenized slice of bytes.
pub struct Tokenization<'a> {
    /// The underlying bytes.
    data: &'a [u8],
    /// Spans of `data`, one per token.
    tokens: &'a [HashedSpan],
    /// Index into `tokens` of the first token of this view (inclusive).
    start_index: isize,
    /// Index into `tokens` one past the last token of this view (exclusive).
    one_past_end_index: isize,
}
51
52impl<'a> Debug for Tokenization<'a> {
53    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), FmtErr> {
54        let Self {
55            data,
56            tokens,
57            start_index,
58            one_past_end_index,
59        } = self;
60        let data_pp = String::from_utf8_lossy(data);
61        let tokens_pp = tokens[to_usize(*start_index)..to_usize(*one_past_end_index)]
62            .iter()
63            .map(|sref| String::from_utf8_lossy(&data[sref.lo..sref.hi]))
64            .collect::<Vec<_>>();
65        f.debug_struct("Tokenization")
66            .field("data", &data_pp)
67            .field("tokens", &tokens_pp)
68            .finish()
69    }
70}
71
72impl<'a> Tokenization<'a> {
73    pub fn new(data: &'a [u8], tokens: &'a [HashedSpan]) -> Self {
74        Tokenization {
75            data,
76            tokens,
77            start_index: 0,
78            one_past_end_index: to_isize(tokens.len()),
79        }
80    }
81
82    pub fn data(&self) -> &'a [u8] {
83        self.data
84    }
85
86    /// Split `self` in two tokenizations:
87    /// * the first one from the start to `lo`;
88    /// * the second one from `hi` to the end.
89    pub fn split_at(&self, lo: isize, hi: isize) -> (Self, Self) {
90        let start = self.start_index;
91        let end = self.one_past_end_index;
92        assert!(start <= lo);
93        assert!(lo <= hi);
94        assert!(hi <= end);
95        (
96            Tokenization {
97                one_past_end_index: lo,
98                ..*self
99            },
100            Tokenization {
101                start_index: hi,
102                ..*self
103            },
104        )
105    }
106
107    /// Get `self`'s number of tokens.
108    pub fn nb_tokens(&self) -> usize {
109        to_usize(self.one_past_end_index - self.start_index)
110    }
111
112    /// Get `self`'s `n`th token.
113    pub fn nth_token(&self, n: isize) -> HashedSlice {
114        let HashedSpan { lo, hi, hash } = self.nth_span(n);
115        HashedSlice {
116            hash,
117            data: &self.data[lo..hi],
118        }
119    }
120
121    /// Get the span corresponding to `self`'s `n`th token.
122    pub fn nth_span(&self, n: isize) -> HashedSpan {
123        self.tokens[to_usize(self.start_index + n)]
124    }
125}
126
/// A pair of [Tokenization]s to compare.
#[derive(Debug)]
pub struct DiffInput<'a> {
    /// The "new" side of the diff.
    pub added: Tokenization<'a>,
    /// The "old" side of the diff.
    pub removed: Tokenization<'a>,
}
133
134impl<'a> DiffInput<'a> {
135    fn split_at(&self, (x0, y0): (isize, isize), (x1, y1): (isize, isize)) -> (Self, Self) {
136        let (removed1, removed2) = self.removed.split_at(x0, x1);
137        let (added1, added2) = self.added.split_at(y0, y1);
138
139        (
140            DiffInput {
141                added: added1,
142                removed: removed1,
143            },
144            DiffInput {
145                added: added2,
146                removed: removed2,
147            },
148        )
149    }
150
151    fn n(&self) -> usize {
152        self.removed.nb_tokens()
153    }
154
155    fn m(&self) -> usize {
156        self.added.nb_tokens()
157    }
158
159    fn seq_a(&self, index: isize) -> HashedSlice {
160        self.removed.nth_token(index)
161    }
162
163    fn seq_b(&self, index: isize) -> HashedSlice {
164        self.added.nth_token(index)
165    }
166}
167
/// Hash the given bytes with the standard library's default hasher.
fn hash_slice(data: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::new();
    hasher.write(data);
    hasher.finish()
}
174
/// Working state for one direction of the diff traversal.
struct DiffTraversal<'a> {
    /// The `V` array from Myers' paper: best `x` reached per diagonal
    /// `k`, stored at slot `k + max` (see `v`/`v_mut`).
    v: &'a mut [isize],
    /// Maximum number of steps; also the offset applied to diagonal
    /// indices so negative diagonals map into `v`.
    max: usize,
    /// Coordinates at which the traversal terminates (compared against
    /// `(x, y)` in the kernels).
    end: (isize, isize),
}
180
impl<'a> DiffTraversal<'a> {
    /// Build a traversal context over the scratch slice `v`.
    ///
    /// `forward` picks the direction: the forward traversal starts at the
    /// views' start indices and ends at their one-past-end indices; the
    /// backward traversal swaps the two.
    fn from_slice(input: &'a DiffInput<'a>, v: &'a mut [isize], forward: bool, max: usize) -> Self {
        let start = (input.removed.start_index, input.added.start_index);
        let end = (
            input.removed.one_past_end_index,
            input.added.one_past_end_index,
        );
        // `v` needs one slot for each diagonal in -max..=max.
        assert!(max * 2 + 1 <= v.len());
        let (start, end) = if forward { (start, end) } else { (end, start) };
        let mut res = DiffTraversal { v, max, end };
        if max != 0 {
            // Seed diagonal 1 with the view-relative x of the starting
            // corner: 0 when forward, the removed-side token count when
            // backward.
            *res.v_mut(1) = start.0 - input.removed.start_index
        }
        res
    }

    /// Resize `v` to hold every diagonal, then build the traversal.
    fn from_vector(
        input: &'a DiffInput<'a>,
        v: &'a mut Vec<isize>,
        forward: bool,
        max: usize,
    ) -> Self {
        v.resize(max * 2 + 1, 0);
        Self::from_slice(input, v, forward, max)
    }

    /// Best `x` recorded for diagonal `index` (possibly negative; `max`
    /// is added to map it into the slice).
    fn v(&self, index: isize) -> isize {
        self.v[to_usize(index + to_isize(self.max))]
    }

    /// Mutable access to the slot for diagonal `index`.
    fn v_mut(&mut self, index: isize) -> &mut isize {
        &mut self.v[to_usize(index + to_isize(self.max))]
    }
}
215
/// One forward step (`d` edits) of Myers' greedy algorithm.
///
/// For each diagonal `k` in `-d..=d` (stepping by 2), extends the
/// furthest-reaching path from the better neighboring diagonal, then
/// follows matching tokens (a "snake"), recording the best `x` per
/// diagonal in `ctx`. Returns `Some(d)` once the traversal reaches
/// `ctx.end`.
fn diff_sequences_kernel_forward(
    input: &DiffInput,
    ctx: &mut DiffTraversal,
    d: usize,
) -> Option<usize> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    assert!(d < ctx.max);
    let d = to_isize(d);
    for k in (-d..=d).step_by(2) {
        // Extend from diagonal k+1 or k-1, whichever reached further,
        // per the paper's greedy rule (edge cases at k == ±d).
        let mut x = if k == -d || k != d && ctx.v(k - 1) < ctx.v(k + 1) {
            ctx.v(k + 1)
        } else {
            ctx.v(k - 1) + 1
        };
        let mut y = x - k;
        // Follow the snake: advance while tokens match.
        while x < n && y < m && input.seq_a(x) == input.seq_b(y) {
            x += 1;
            y += 1;
        }
        *ctx.v_mut(k) = x;
        if ctx.end == (x, y) {
            return Some(to_usize(d));
        }
    }
    None
}
243
/// One backward step (`d` edits) of Myers' greedy algorithm.
///
/// Mirror of `diff_sequences_kernel_forward`: diagonals are shifted by
/// `delta = n - m` and matching tokens are followed from the end of both
/// views toward their start. Returns `Some(d)` once the traversal
/// reaches `ctx.end`.
fn diff_sequences_kernel_backward(
    input: &DiffInput,
    ctx: &mut DiffTraversal,
    d: usize,
) -> Option<usize> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    let delta = n - m;
    assert!(d < ctx.max);
    let d = to_isize(d);
    for k in (-d..=d).step_by(2) {
        // Extend from the better neighboring diagonal; note the
        // comparison is reversed w.r.t. the forward kernel.
        let mut x = if k == -d || k != d && ctx.v(k + 1) < ctx.v(k - 1) {
            ctx.v(k + 1)
        } else {
            ctx.v(k - 1) + 1
        };
        let mut y = x - (k + delta);
        // Follow matching tokens backward.
        while 0 < x && 0 < y && input.seq_a(x - 1) == input.seq_b(y - 1) {
            x -= 1;
            y -= 1;
        }
        *ctx.v_mut(k) = x - 1;
        if ctx.end == (x, y) {
            return Some(to_usize(d));
        }
    }
    None
}
272
/// A wrapper around a vector of bytes that keeps track of end of lines.
#[derive(Debug, Default)]
pub struct LineSplit {
    /// The concatenated bytes of all appended content.
    data: Vec<u8>,
    /// Length, in bytes, of each stored line.
    line_lengths: Vec<usize>,
}

impl LineSplit {
    /// Iterate over the `(start, end)` byte ranges of each stored line.
    pub fn iter(&self) -> LineSplitIter {
        LineSplitIter {
            // `self` is already a reference; `&self` was a redundant borrow.
            line_split: self,
            index: 0,
            start_of_slice: 0,
        }
    }

    /// The underlying bytes.
    pub fn data<'a>(&'a self) -> &'a [u8] {
        &self.data
    }

    /// Append `line` to the buffer.
    ///
    /// If the data so far does not end with a newline, the bytes are
    /// merged into the last stored line instead of starting a new one.
    pub fn append_line(&mut self, line: &[u8]) {
        if self.data.last().copied() == Some(b'\n') {
            self.line_lengths.push(line.len());
        } else {
            match self.line_lengths.last_mut() {
                Some(len) => *len += line.len(),
                // First line ever appended.
                None => self.line_lengths.push(line.len()),
            }
        }
        self.data.extend_from_slice(line)
    }

    /// Remove all content (allocations are kept for reuse).
    pub fn clear(&mut self) {
        self.data.clear();
        self.line_lengths.clear();
    }

    /// Total number of stored bytes.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Whether no bytes are stored (companion to `len`, per the
    /// `len_without_is_empty` convention).
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}

/// Iterator over the `(start, end)` byte ranges of a [LineSplit]'s lines.
pub struct LineSplitIter<'a> {
    line_split: &'a LineSplit,
    start_of_slice: usize,
    index: usize,
}

impl<'a> Iterator for LineSplitIter<'a> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // `line_split` is a `Copy` reference with lifetime 'a, so reading
        // it does not keep `self` borrowed.
        let lengths = &self.line_split.line_lengths;
        if self.index < lengths.len() {
            let len = lengths[self.index];
            let start = self.start_of_slice;
            self.index += 1;
            self.start_of_slice += len;
            Some((start, start + len))
        } else {
            None
        }
    }
}
343
/// A pair of spans with the same content in two different slices.
#[derive(Clone, Debug, Default)]
pub struct Snake {
    /// The start of the span in the removed bytes.
    pub x0: isize,

    /// The start of the span in the added bytes.
    pub y0: isize,

    /// The length of the span.
    pub len: isize,
}

impl Snake {
    /// Builder-style setter for the starting coordinates.
    fn from(self, x0: isize, y0: isize) -> Self {
        Snake { x0, y0, ..self }
    }

    /// Builder-style setter for the span length.
    fn len(self, len: isize) -> Self {
        Snake { len, ..self }
    }
}
369
/// One step of the bidirectional (middle snake) search: a forward pass
/// followed by a backward pass, each allowing `d` edits.
///
/// Returns the middle snake and the edit-script length as soon as the
/// two traversals overlap. Per Myers' linear-space refinement, overlap
/// is checked during the forward pass when `delta` is odd (yielding
/// length `2d - 1`) and during the backward pass when it is even
/// (yielding length `2d`).
fn diff_sequences_kernel_bidirectional(
    input: &DiffInput,
    ctx_fwd: &mut DiffTraversal,
    ctx_bwd: &mut DiffTraversal,
    d: usize,
) -> Option<(Snake, isize)> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    let delta = n - m;
    let odd = delta % 2 != 0;
    assert!(d < ctx_fwd.max);
    assert!(d < ctx_bwd.max);
    let d = to_isize(d);
    // Forward pass.
    for k in (-d..=d).step_by(2) {
        let mut x = if k == -d || k != d && ctx_fwd.v(k - 1) < ctx_fwd.v(k + 1) {
            ctx_fwd.v(k + 1)
        } else {
            ctx_fwd.v(k - 1) + 1
        };
        let mut y = x - k;
        // Remember where the snake starts before following it.
        let (x0, y0) = (x, y);
        while x < n && y < m && input.seq_a(x) == input.seq_b(y) {
            x += 1;
            y += 1;
        }
        // Overlap with the backward traversal on diagonal k - delta?
        if odd && (k - delta).abs() <= d - 1 && x > ctx_bwd.v(k - delta) {
            return Some((Snake::default().from(x0, y0).len(x - x0), 2 * d - 1));
        }
        *ctx_fwd.v_mut(k) = x;
    }
    // Backward pass.
    for k in (-d..=d).step_by(2) {
        let mut x = if k == -d || k != d && ctx_bwd.v(k + 1) < ctx_bwd.v(k - 1) {
            ctx_bwd.v(k + 1)
        } else {
            ctx_bwd.v(k - 1) + 1
        };
        let mut y = x - (k + delta);
        // Remember where the snake ends before following it backward.
        let x1 = x;
        while 0 < x && 0 < y && input.seq_a(x - 1) == input.seq_b(y - 1) {
            x -= 1;
            y -= 1;
        }
        // Overlap with the forward traversal on diagonal k + delta?
        if !odd && (k + delta).abs() <= d && x - 1 < ctx_fwd.v(k + delta) {
            return Some((Snake::default().from(x, y).len(x1 - x), 2 * d));
        }
        *ctx_bwd.v_mut(k) = x - 1;
    }
    None
}
419
/// Compute the length of the edit script for `input`.
/// This is the forward version.
///
/// `v` is scratch storage for the traversal's `V` array.
pub fn diff_sequences_simple_forward(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    diff_sequences_simple(input, v, true)
}
425
/// Compute the length of the edit script for `input`.
/// This is the backward version.
///
/// `v` is scratch storage for the traversal's `V` array.
pub fn diff_sequences_simple_backward(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    diff_sequences_simple(input, v, false)
}
431
432fn diff_sequences_simple(input: &DiffInput, v: &mut Vec<isize>, forward: bool) -> usize {
433    let max_result = input.n() + input.m();
434    let ctx = &mut DiffTraversal::from_vector(input, v, forward, max_result);
435    (0..max_result)
436        .filter_map(|d| {
437            if forward {
438                diff_sequences_kernel_forward(input, ctx, d)
439            } else {
440                diff_sequences_kernel_backward(input, ctx, d)
441            }
442        })
443        .next()
444        .unwrap_or(max_result)
445}
446
/// Compute the longest common subsequence for `input` into `dst`.
///
/// `dst` is cleared first and then filled with the common [Snake]s;
/// `v` is scratch storage reused across the recursive calls.
pub fn diff(input: &DiffInput, v: &mut Vec<isize>, dst: &mut Vec<Snake>) {
    dst.clear();
    diff_rec(input, v, dst)
}
452
/// Recursive worker for `diff`: divide-and-conquer on the middle snake.
fn diff_rec(input: &DiffInput, v: &mut Vec<isize>, dst: &mut Vec<Snake>) {
    let n = to_isize(input.n());
    // A side whose view contains no tokens cannot contribute a snake.
    fn trivial_diff(tok: &Tokenization) -> bool {
        tok.one_past_end_index <= tok.start_index
    }

    if trivial_diff(&input.removed) || trivial_diff(&input.added) {
        return;
    }

    let (snake, d) = diff_sequences_bidirectional_snake(input, v);
    let &Snake { x0, y0, len } = &snake;
    if 1 < d {
        // More than one edit: recurse on both sides of the middle snake.
        let (input1, input2) = input.split_at((x0, y0), (x0 + len, y0 + len));
        diff_rec(&input1, v, dst);
        if len != 0 {
            dst.push(snake);
        }
        diff_rec(&input2, v, dst);
    } else {
        // Edit distance 0 or 1: the sides differ by at most one token,
        // so one splitting point describes the common parts directly.
        let SplittingPoint { sp, dx, dy } = find_splitting_point(&input);
        let x0 = input.removed.start_index;
        let y0 = input.added.start_index;
        if sp != 0 {
            // Common prefix before the splitting point.
            dst.push(Snake::default().from(x0, y0).len(sp));
        }
        let len = n - sp - dx;
        if len != 0 {
            // Common suffix after the extra token (if any).
            dst.push(Snake::default().from(x0 + sp + dx, y0 + sp + dy).len(len));
        }
    }
}
485
/// Result of `find_splitting_point`.
struct SplittingPoint {
    /// Number of leading tokens shared by both sides.
    sp: isize,
    /// 1 when the extra token is on the removed side, else 0.
    dx: isize,
    /// 1 when the extra token is on the added side, else 0.
    dy: isize,
}
491
492// Find the splitting point when two sequences differ by one element.
493fn find_splitting_point(input: &DiffInput) -> SplittingPoint {
494    let n = to_isize(input.n());
495    let m = to_isize(input.m());
496    let (short, long, nb_tokens, dx, dy) = if n < m {
497        (&input.removed, &input.added, n, 0, 1)
498    } else if m < n {
499        (&input.added, &input.removed, m, 1, 0)
500    } else {
501        (&input.added, &input.removed, m, 0, 0)
502    };
503    let mut sp = nb_tokens;
504    for i in 0..nb_tokens {
505        if long.nth_token(i) != short.nth_token(i) {
506            sp = i;
507            break;
508        }
509    }
510    SplittingPoint { sp, dx, dy }
511}
512
/// Compute the length of the edit script for `input`.
/// This is the bidirectional version.
pub fn diff_sequences_bidirectional(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    // Empty inputs have an empty edit script; bail out before the snake
    // search, which panics when no snake exists.
    if input.n() + input.m() == 0 {
        return 0;
    }
    to_usize(diff_sequences_bidirectional_snake(input, v).1)
}
521
/// Find the middle snake of `input` and its edit-script length using
/// the bidirectional search.
fn diff_sequences_bidirectional_snake(input: &DiffInput, v: &mut Vec<isize>) -> (Snake, isize) {
    // Upper bound on the number of steps in each direction.
    let max = (input.n() + input.m() + 1) / 2 + 1;
    let iter_len = 2 * max + 1;
    // One `V` array per direction, carved out of the same scratch vector.
    v.resize(2 * iter_len, 0);

    let (v1, v2) = v.split_at_mut(iter_len);
    let ctx_fwd = &mut DiffTraversal::from_slice(input, v1, true, max);
    let ctx_bwd = &mut DiffTraversal::from_slice(input, v2, false, max);
    let mut result = (0..max)
        .filter_map(|d| diff_sequences_kernel_bidirectional(input, ctx_fwd, ctx_bwd, d))
        .next()
        .expect("snake not found");
    // The kernel's coordinates are relative to the views (seq_a/seq_b
    // index from start_index); shift them back to absolute indices.
    result.0.x0 += input.removed.start_index;
    result.0.y0 += input.added.start_index;
    result
}
538
/// Checked `usize` → `isize` conversion; panics if the value overflows.
fn to_isize(input: usize) -> isize {
    isize::try_from(input).unwrap()
}

/// Checked `isize` → `usize` conversion; panics if the value is negative.
fn to_usize(input: isize) -> usize {
    usize::try_from(input).unwrap()
}
546
/// Classification of a byte, used by `tokenize` to find token
/// boundaries.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum TokenKind {
    /// Neither a word byte nor horizontal whitespace; each such byte
    /// forms its own single-byte token.
    Other,
    /// ASCII alphanumeric or underscore.
    Word,
    /// Space or tab.
    Spaces,
}
553
554/// Tokenize data from `src` from the position `ofs` into `tokens`.
555pub fn tokenize(src: &[u8], ofs: usize, tokens: &mut Vec<HashedSpan>) {
556    let mut push = |lo: usize, hi: usize| {
557        if lo < hi {
558            tokens.push(HashedSpan {
559                lo,
560                hi,
561                hash: hash_slice(&src[lo..hi]),
562            })
563        }
564    };
565    let mut lo = ofs;
566    let mut kind = TokenKind::Other;
567    for hi in ofs..src.len() {
568        let oldkind = kind;
569        kind = classify_byte(src[hi]);
570        if kind != oldkind || oldkind == TokenKind::Other {
571            push(lo, hi);
572            lo = hi
573        }
574    }
575    push(lo, src.len());
576}
577
578fn classify_byte(b: u8) -> TokenKind {
579    match b {
580        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' => TokenKind::Word,
581        b'\t' | b' ' => TokenKind::Spaces,
582        _ => TokenKind::Other,
583    }
584}
585
586mod best_projection;
587pub use crate::best_projection::optimize_partition;
588pub use crate::best_projection::{NormalizationResult, SharedSegments};
589
590#[cfg(test)]
591mod test;