dissimilar/
lib.rs

1//! [![github]](https://github.com/dtolnay/dissimilar) [![crates-io]](https://crates.io/crates/dissimilar) [![docs-rs]](https://docs.rs/dissimilar)
2//!
3//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
4//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
5//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
6//!
7//! <br>
8//!
9//! ## Diff library with semantic cleanup, based on Google's diff-match-patch
10//!
11//! This library is a port of the Diff component of [Diff Match Patch] to Rust.
12//! The diff implementation is based on [Myers' diff algorithm] but includes
13//! some [semantic cleanups] to increase human readability by factoring out
14//! commonalities which are likely to be coincidental.
15//!
16//! Diff Match Patch was originally built in 2006 to power Google Docs.
17//!
18//! # Interface
19//!
20//! Here is the entire API of the Rust implementation. It operates on borrowed
21//! strings and the return value of the diff algorithm is a vector of chunks
22//! pointing into slices of those input strings.
23//!
24//! ```
25//! pub enum Chunk<'a> {
26//!     Equal(&'a str),
27//!     Delete(&'a str),
28//!     Insert(&'a str),
29//! }
30//!
31//! # const IGNORE: &str = stringify! {
32//! pub fn diff(text1: &str, text2: &str) -> Vec<Chunk>;
33//! # };
34//! ```
35//!
36//! [Diff Match Patch]: https://github.com/google/diff-match-patch
37//! [Myers' diff algorithm]: https://neil.fraser.name/writing/diff/myers.pdf
38//! [semantic cleanups]: https://neil.fraser.name/writing/diff/
39
40#![doc(html_root_url = "https://docs.rs/dissimilar/1.0.10")]
41#![allow(
42    clippy::blocks_in_conditions,
43    clippy::bool_to_int_with_if,
44    clippy::cast_possible_wrap,
45    clippy::cast_sign_loss,
46    clippy::cloned_instead_of_copied, // https://github.com/rust-lang/rust-clippy/issues/7127
47    clippy::collapsible_else_if,
48    clippy::comparison_chain,
49    clippy::implied_bounds_in_impls,
50    clippy::items_after_test_module, // https://github.com/rust-lang/rust-clippy/issues/10713
51    clippy::let_underscore_untyped,
52    clippy::match_same_arms,
53    clippy::module_name_repetitions,
54    clippy::must_use_candidate,
55    clippy::new_without_default,
56    clippy::octal_escapes,
57    clippy::shadow_unrelated,
58    clippy::similar_names,
59    clippy::too_many_lines,
60    clippy::unseparated_literal_suffix,
61    unused_parens, // false positive on Some(&(mut diff)) pattern
62)]
63
64mod find;
65mod range;
66
67#[cfg(test)]
68mod tests;
69
70use crate::range::{slice, Range};
71use std::cmp;
72use std::collections::VecDeque;
73use std::fmt::{self, Debug, Display, Write};
74
75#[derive(Copy, Clone, PartialEq, Eq)]
76pub enum Chunk<'a> {
77    Equal(&'a str),
78    Delete(&'a str),
79    Insert(&'a str),
80}
81
82#[derive(Copy, Clone)]
83enum Diff<'a, 'b> {
84    Equal(Range<'a>, Range<'b>),
85    Delete(Range<'a>),
86    Insert(Range<'b>),
87}
88
89impl<'tmp, 'a: 'tmp, 'b: 'tmp> Diff<'a, 'b> {
90    fn text(&self) -> Range<'tmp> {
91        match *self {
92            Diff::Equal(range, _) | Diff::Delete(range) | Diff::Insert(range) => range,
93        }
94    }
95
96    fn grow_left(&mut self, increment: usize) {
97        self.for_each(|range| {
98            range.offset -= increment;
99            range.len += increment;
100        });
101    }
102
103    fn grow_right(&mut self, increment: usize) {
104        self.for_each(|range| range.len += increment);
105    }
106
107    fn shift_left(&mut self, increment: usize) {
108        self.for_each(|range| range.offset -= increment);
109    }
110
111    fn shift_right(&mut self, increment: usize) {
112        self.for_each(|range| range.offset += increment);
113    }
114
115    fn for_each(&mut self, f: impl Fn(&mut Range)) {
116        match self {
117            Diff::Equal(range1, range2) => {
118                f(range1);
119                f(range2);
120            }
121            Diff::Delete(range) => f(range),
122            Diff::Insert(range) => f(range),
123        }
124    }
125}
126
127pub fn diff<'a>(text1: &'a str, text2: &'a str) -> Vec<Chunk<'a>> {
128    let chars1: Vec<char> = text1.chars().collect();
129    let chars2: Vec<char> = text2.chars().collect();
130    let range1 = Range::new(&chars1, ..);
131    let range2 = Range::new(&chars2, ..);
132
133    let mut solution = main(range1, range2);
134    cleanup_char_boundary(&mut solution);
135    cleanup_semantic(&mut solution);
136    cleanup_merge(&mut solution);
137
138    let mut chunks = Vec::new();
139    let mut pos1 = 0;
140    let mut pos2 = 0;
141    for diff in solution.diffs {
142        chunks.push(match diff {
143            Diff::Equal(range, _) => {
144                let len = range.len_bytes();
145                let chunk = Chunk::Equal(&text1[pos1..pos1 + len]);
146                pos1 += len;
147                pos2 += len;
148                chunk
149            }
150            Diff::Delete(range) => {
151                let len = range.len_bytes();
152                let chunk = Chunk::Delete(&text1[pos1..pos1 + len]);
153                pos1 += len;
154                chunk
155            }
156            Diff::Insert(range) => {
157                let len = range.len_bytes();
158                let chunk = Chunk::Insert(&text2[pos2..pos2 + len]);
159                pos2 += len;
160                chunk
161            }
162        });
163    }
164    chunks
165}
166
167struct Solution<'a, 'b> {
168    text1: Range<'a>,
169    text2: Range<'b>,
170    diffs: Vec<Diff<'a, 'b>>,
171}
172
173fn main<'a, 'b>(mut text1: Range<'a>, mut text2: Range<'b>) -> Solution<'a, 'b> {
174    let whole1 = text1;
175    let whole2 = text2;
176
177    // Trim off common prefix.
178    let common_prefix_len = common_prefix(text1, text2);
179    let common_prefix = Diff::Equal(
180        text1.substring(..common_prefix_len),
181        text2.substring(..common_prefix_len),
182    );
183    text1 = text1.substring(common_prefix_len..);
184    text2 = text2.substring(common_prefix_len..);
185
186    // Trim off common suffix.
187    let common_suffix_len = common_suffix(text1, text2);
188    let common_suffix = Diff::Equal(
189        text1.substring(text1.len - common_suffix_len..),
190        text2.substring(text2.len - common_suffix_len..),
191    );
192    text1 = text1.substring(..text1.len - common_suffix_len);
193    text2 = text2.substring(..text2.len - common_suffix_len);
194
195    // Compute the diff on the middle block.
196    let mut solution = Solution {
197        text1: whole1,
198        text2: whole2,
199        diffs: compute(text1, text2),
200    };
201
202    // Restore the prefix and suffix.
203    if common_prefix_len > 0 {
204        solution.diffs.insert(0, common_prefix);
205    }
206    if common_suffix_len > 0 {
207        solution.diffs.push(common_suffix);
208    }
209
210    cleanup_merge(&mut solution);
211
212    solution
213}
214
215// Find the differences between two texts. Assumes that the texts do not have
216// any common prefix or suffix.
217fn compute<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
218    match (text1.is_empty(), text2.is_empty()) {
219        (true, true) => return Vec::new(),
220        (true, false) => return vec![Diff::Insert(text2)],
221        (false, true) => return vec![Diff::Delete(text1)],
222        (false, false) => {}
223    }
224
225    // Check for entire shorter text inside the longer text.
226    if text1.len > text2.len {
227        if let Some(i) = text1.find(text2) {
228            return vec![
229                Diff::Delete(text1.substring(..i)),
230                Diff::Equal(text1.substring(i..i + text2.len), text2),
231                Diff::Delete(text1.substring(i + text2.len..)),
232            ];
233        }
234    } else {
235        if let Some(i) = text2.find(text1) {
236            return vec![
237                Diff::Insert(text2.substring(..i)),
238                Diff::Equal(text1, text2.substring(i..i + text1.len)),
239                Diff::Insert(text2.substring(i + text1.len..)),
240            ];
241        }
242    }
243
244    if text1.len == 1 || text2.len == 1 {
245        // Single character string.
246        // After the previous check, the character can't be an equality.
247        return vec![Diff::Delete(text1), Diff::Insert(text2)];
248    }
249
250    bisect(text1, text2)
251}
252
253// Find the 'middle snake' of a diff, split the problem in two and return the
254// recursively constructed diff.
255//
256// See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
257fn bisect<'a, 'b>(text1: Range<'a>, text2: Range<'b>) -> Vec<Diff<'a, 'b>> {
258    let max_d = (text1.len + text2.len + 1) / 2;
259    let v_offset = max_d;
260    let v_len = 2 * max_d;
261    let mut v1 = vec![-1isize; v_len];
262    let mut v2 = vec![-1isize; v_len];
263    v1[v_offset + 1] = 0;
264    v2[v_offset + 1] = 0;
265    let delta = text1.len as isize - text2.len as isize;
266    // If the total number of characters is odd, then the front path will
267    // collide with the reverse path.
268    let front = delta % 2 != 0;
269    // Offsets for start and end of k loop.
270    // Prevents mapping of space beyond the grid.
271    let mut k1start = 0;
272    let mut k1end = 0;
273    let mut k2start = 0;
274    let mut k2end = 0;
275    for d in 0..max_d as isize {
276        // Walk the front path one step.
277        let mut k1 = -d + k1start;
278        while k1 <= d - k1end {
279            let k1_offset = (v_offset as isize + k1) as usize;
280            let mut x1 = if k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1]) {
281                v1[k1_offset + 1]
282            } else {
283                v1[k1_offset - 1] + 1
284            } as usize;
285            let mut y1 = (x1 as isize - k1) as usize;
286            if let (Some(s1), Some(s2)) = (text1.get(x1..), text2.get(y1..)) {
287                let advance = common_prefix(s1, s2);
288                x1 += advance;
289                y1 += advance;
290            }
291            v1[k1_offset] = x1 as isize;
292            if x1 > text1.len {
293                // Ran off the right of the graph.
294                k1end += 2;
295            } else if y1 > text2.len {
296                // Ran off the bottom of the graph.
297                k1start += 2;
298            } else if front {
299                let k2_offset = v_offset as isize + delta - k1;
300                if k2_offset >= 0 && k2_offset < v_len as isize && v2[k2_offset as usize] != -1 {
301                    // Mirror x2 onto top-left coordinate system.
302                    let x2 = text1.len as isize - v2[k2_offset as usize];
303                    if x1 as isize >= x2 {
304                        // Overlap detected.
305                        return bisect_split(text1, text2, x1, y1);
306                    }
307                }
308            }
309            k1 += 2;
310        }
311
312        // Walk the reverse path one step.
313        let mut k2 = -d + k2start;
314        while k2 <= d - k2end {
315            let k2_offset = (v_offset as isize + k2) as usize;
316            let mut x2 = if k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) {
317                v2[k2_offset + 1]
318            } else {
319                v2[k2_offset - 1] + 1
320            } as usize;
321            let mut y2 = (x2 as isize - k2) as usize;
322            if x2 < text1.len && y2 < text2.len {
323                let advance = common_suffix(
324                    text1.substring(..text1.len - x2),
325                    text2.substring(..text2.len - y2),
326                );
327                x2 += advance;
328                y2 += advance;
329            }
330            v2[k2_offset] = x2 as isize;
331            if x2 > text1.len {
332                // Ran off the left of the graph.
333                k2end += 2;
334            } else if y2 > text2.len {
335                // Ran off the top of the graph.
336                k2start += 2;
337            } else if !front {
338                let k1_offset = v_offset as isize + delta - k2;
339                if k1_offset >= 0 && k1_offset < v_len as isize && v1[k1_offset as usize] != -1 {
340                    let x1 = v1[k1_offset as usize] as usize;
341                    let y1 = v_offset + x1 - k1_offset as usize;
342                    // Mirror x2 onto top-left coordinate system.
343                    x2 = text1.len - x2;
344                    if x1 >= x2 {
345                        // Overlap detected.
346                        return bisect_split(text1, text2, x1, y1);
347                    }
348                }
349            }
350            k2 += 2;
351        }
352    }
353    // Number of diffs equals number of characters, no commonality at all.
354    vec![Diff::Delete(text1), Diff::Insert(text2)]
355}
356
357// Given the location of the 'middle snake', split the diff in two parts and
358// recurse.
359fn bisect_split<'a, 'b>(
360    text1: Range<'a>,
361    text2: Range<'b>,
362    x: usize,
363    y: usize,
364) -> Vec<Diff<'a, 'b>> {
365    let (text1a, text1b) = text1.split_at(x);
366    let (text2a, text2b) = text2.split_at(y);
367
368    // Compute both diffs serially.
369    let mut diffs = main(text1a, text2a).diffs;
370    diffs.extend(main(text1b, text2b).diffs);
371
372    diffs
373}
374
375// Determine the length of the common prefix of two strings.
376fn common_prefix(text1: Range, text2: Range) -> usize {
377    for (i, (b1, b2)) in text1.chars().zip(text2.chars()).enumerate() {
378        if b1 != b2 {
379            return i;
380        }
381    }
382    cmp::min(text1.len, text2.len)
383}
384
385// Determine the length of the common suffix of two strings.
386fn common_suffix(text1: Range, text2: Range) -> usize {
387    for (i, (b1, b2)) in text1.chars().rev().zip(text2.chars().rev()).enumerate() {
388        if b1 != b2 {
389            return i;
390        }
391    }
392    cmp::min(text1.len, text2.len)
393}
394
395// Determine if the suffix of one string is the prefix of another.
396//
397// Returns the number of characters common to the end of the first string and
398// the start of the second string.
399fn common_overlap(mut text1: Range, mut text2: Range) -> usize {
400    // Eliminate the null case.
401    if text1.is_empty() || text2.is_empty() {
402        return 0;
403    }
404    // Truncate the longer string.
405    if text1.len > text2.len {
406        text1 = text1.substring(text1.len - text2.len..);
407    } else if text1.len < text2.len {
408        text2 = text2.substring(..text1.len);
409    }
410    // Quick check for the worst case.
411    if slice(text1) == slice(text2) {
412        return text1.len;
413    }
414
415    // Start by looking for a single character match
416    // and increase length until no match is found.
417    // Performance analysis: https://neil.fraser.name/news/2010/11/04/
418    let mut best = 0;
419    let mut length = 1;
420    loop {
421        let pattern = text1.substring(text1.len - length..);
422        let found = match text2.find(pattern) {
423            Some(found) => found,
424            None => return best,
425        };
426        length += found;
427        if found == 0
428            || slice(text1.substring(text1.len - length..)) == slice(text2.substring(..length))
429        {
430            best = length;
431            length += 1;
432        }
433    }
434}
435
436fn cleanup_char_boundary(solution: &mut Solution) {
437    fn is_segmentation_boundary(doc: &[char], pos: usize) -> bool {
438        // FIXME: use unicode-segmentation crate?
439        let _ = doc;
440        let _ = pos;
441        true
442    }
443
444    fn boundary_down(doc: &[char], pos: usize) -> usize {
445        let mut adjust = 0;
446        while !is_segmentation_boundary(doc, pos - adjust) {
447            adjust += 1;
448        }
449        adjust
450    }
451
452    fn boundary_up(doc: &[char], pos: usize) -> usize {
453        let mut adjust = 0;
454        while !is_segmentation_boundary(doc, pos + adjust) {
455            adjust += 1;
456        }
457        adjust
458    }
459
460    fn skip_overlap<'a>(prev: &Range<'a>, range: &mut Range<'a>) {
461        let prev_end = prev.offset + prev.len;
462        if prev_end > range.offset {
463            let delta = cmp::min(prev_end - range.offset, range.len);
464            range.offset += delta;
465            range.len -= delta;
466        }
467    }
468
469    let mut read = 0;
470    let mut retain = 0;
471    let mut last_delete = Range::empty();
472    let mut last_insert = Range::empty();
473    while let Some(&(mut diff)) = solution.diffs.get(read) {
474        read += 1;
475        match &mut diff {
476            Diff::Equal(range1, range2) => {
477                let adjust = boundary_up(range1.doc, range1.offset);
478                // If the whole range is sub-character, skip it.
479                if range1.len <= adjust {
480                    continue;
481                }
482                range1.offset += adjust;
483                range1.len -= adjust;
484                range2.offset += adjust;
485                range2.len -= adjust;
486                let adjust = boundary_down(range1.doc, range1.offset + range1.len);
487                range1.len -= adjust;
488                range2.len -= adjust;
489                last_delete = Range::empty();
490                last_insert = Range::empty();
491            }
492            Diff::Delete(range) => {
493                skip_overlap(&last_delete, range);
494                if range.len == 0 {
495                    continue;
496                }
497                let adjust = boundary_down(range.doc, range.offset);
498                range.offset -= adjust;
499                range.len += adjust;
500                let adjust = boundary_up(range.doc, range.offset + range.len);
501                range.len += adjust;
502                last_delete = *range;
503            }
504            Diff::Insert(range) => {
505                skip_overlap(&last_insert, range);
506                if range.len == 0 {
507                    continue;
508                }
509                let adjust = boundary_down(range.doc, range.offset);
510                range.offset -= adjust;
511                range.len += adjust;
512                let adjust = boundary_up(range.doc, range.offset + range.len);
513                range.len += adjust;
514                last_insert = *range;
515            }
516        }
517        solution.diffs[retain] = diff;
518        retain += 1;
519    }
520
521    solution.diffs.truncate(retain);
522}
523
524// Reduce the number of edits by eliminating semantically trivial equalities.
525fn cleanup_semantic(solution: &mut Solution) {
526    let mut diffs = &mut solution.diffs;
527    if diffs.is_empty() {
528        return;
529    }
530
531    let mut changes = false;
532    let mut equalities = VecDeque::new(); // Double-ended queue of equalities.
533    let mut last_equality = None; // Always equal to equalities.peek().text
534    let mut pointer = 0;
535    // Number of characters that changed prior to the equality.
536    let mut len_insertions1 = 0;
537    let mut len_deletions1 = 0;
538    // Number of characters that changed after the equality.
539    let mut len_insertions2 = 0;
540    let mut len_deletions2 = 0;
541    while let Some(&this_diff) = diffs.get(pointer) {
542        match this_diff {
543            Diff::Equal(text1, text2) => {
544                equalities.push_back(pointer);
545                len_insertions1 = len_insertions2;
546                len_deletions1 = len_deletions2;
547                len_insertions2 = 0;
548                len_deletions2 = 0;
549                last_equality = Some((text1, text2));
550                pointer += 1;
551                continue;
552            }
553            Diff::Delete(text) => len_deletions2 += text.len,
554            Diff::Insert(text) => len_insertions2 += text.len,
555        }
556        // Eliminate an equality that is smaller or equal to the edits on both
557        // sides of it.
558        if last_equality.map_or(false, |(last_equality, _)| {
559            last_equality.len <= cmp::max(len_insertions1, len_deletions1)
560                && last_equality.len <= cmp::max(len_insertions2, len_deletions2)
561        }) {
562            // Jump back to offending equality.
563            pointer = equalities.pop_back().unwrap();
564
565            // Replace equality with a delete.
566            diffs[pointer] = Diff::Delete(last_equality.unwrap().0);
567            // Insert a corresponding insert.
568            diffs.insert(pointer + 1, Diff::Insert(last_equality.unwrap().1));
569
570            len_insertions1 = 0; // Reset the counters.
571            len_insertions2 = 0;
572            len_deletions1 = 0;
573            len_deletions2 = 0;
574            last_equality = None;
575            changes = true;
576
577            // Throw away the previous equality (it needs to be reevaluated).
578            equalities.pop_back();
579            if let Some(back) = equalities.back() {
580                // There is a safe equality we can fall back to.
581                pointer = *back;
582            } else {
583                // There are no previous equalities, jump back to the start.
584                pointer = 0;
585                continue;
586            }
587        }
588        pointer += 1;
589    }
590
591    // Normalize the diff.
592    if changes {
593        cleanup_merge(solution);
594    }
595    cleanup_semantic_lossless(solution);
596    diffs = &mut solution.diffs;
597
598    // Find any overlaps between deletions and insertions.
599    // e.g: <del>abcxxx</del><ins>xxxdef</ins>
600    //   -> <del>abc</del>xxx<ins>def</ins>
601    // e.g: <del>xxxabc</del><ins>defxxx</ins>
602    //   -> <ins>def</ins>xxx<del>abc</del>
603    // Only extract an overlap if it is as big as the edit ahead or behind it.
604    let mut pointer = 1;
605    while let Some(&this_diff) = diffs.get(pointer) {
606        let prev_diff = diffs[pointer - 1];
607        if let (Diff::Delete(deletion), Diff::Insert(insertion)) = (prev_diff, this_diff) {
608            let overlap_len1 = common_overlap(deletion, insertion);
609            let overlap_len2 = common_overlap(insertion, deletion);
610            let overlap_min = cmp::min(deletion.len, insertion.len);
611            if overlap_len1 >= overlap_len2 && 2 * overlap_len1 >= overlap_min {
612                // Overlap found. Insert an equality and trim the surrounding edits.
613                diffs.insert(
614                    pointer,
615                    Diff::Equal(
616                        deletion.substring(deletion.len - overlap_len1..deletion.len),
617                        insertion.substring(..overlap_len1),
618                    ),
619                );
620                diffs[pointer - 1] =
621                    Diff::Delete(deletion.substring(..deletion.len - overlap_len1));
622                diffs[pointer + 1] = Diff::Insert(insertion.substring(overlap_len1..));
623            } else if overlap_len1 < overlap_len2 && 2 * overlap_len2 >= overlap_min {
624                // Reverse overlap found.
625                // Insert an equality and swap and trim the surrounding edits.
626                diffs.insert(
627                    pointer,
628                    Diff::Equal(
629                        deletion.substring(..overlap_len2),
630                        insertion.substring(insertion.len - overlap_len2..insertion.len),
631                    ),
632                );
633                diffs[pointer - 1] =
634                    Diff::Insert(insertion.substring(..insertion.len - overlap_len2));
635                diffs[pointer + 1] = Diff::Delete(deletion.substring(overlap_len2..));
636            }
637            pointer += 1;
638        }
639        pointer += 1;
640    }
641}
642
643// Look for single edits surrounded on both sides by equalities which can be
644// shifted sideways to align the edit to a word boundary.
645//
646// e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
647fn cleanup_semantic_lossless(solution: &mut Solution) {
648    let diffs = &mut solution.diffs;
649    let mut pointer = 1;
650    while let Some(&next_diff) = diffs.get(pointer + 1) {
651        let prev_diff = diffs[pointer - 1];
652        if let (
653            Diff::Equal(mut prev_equal1, mut prev_equal2),
654            Diff::Equal(mut next_equal1, mut next_equal2),
655        ) = (prev_diff, next_diff)
656        {
657            // This is a single edit surrounded by equalities.
658            let mut edit = diffs[pointer];
659
660            // First, shift the edit as far left as possible.
661            let common_offset = common_suffix(prev_equal1, edit.text());
662            let original_prev_len = prev_equal1.len;
663            prev_equal1.len -= common_offset;
664            prev_equal2.len -= common_offset;
665            edit.shift_left(common_offset);
666            next_equal1.offset -= common_offset;
667            next_equal1.len += common_offset;
668            next_equal2.offset -= common_offset;
669            next_equal2.len += common_offset;
670
671            // Second, step character by character right, looking for the best fit.
672            let mut best_prev_equal = (prev_equal1, prev_equal2);
673            let mut best_edit = edit;
674            let mut best_next_equal = (next_equal1, next_equal2);
675            let mut best_score = cleanup_semantic_score(prev_equal1, edit.text())
676                + cleanup_semantic_score(edit.text(), next_equal1);
677            while !edit.text().is_empty()
678                && !next_equal1.is_empty()
679                && edit.text().chars().next().unwrap() == next_equal1.chars().next().unwrap()
680            {
681                prev_equal1.len += 1;
682                prev_equal2.len += 1;
683                edit.shift_right(1);
684                next_equal1.offset += 1;
685                next_equal1.len -= 1;
686                next_equal2.offset += 1;
687                next_equal2.len -= 1;
688                let score = cleanup_semantic_score(prev_equal1, edit.text())
689                    + cleanup_semantic_score(edit.text(), next_equal1);
690                // The >= encourages trailing rather than leading whitespace on edits.
691                if score >= best_score {
692                    best_score = score;
693                    best_prev_equal = (prev_equal1, prev_equal2);
694                    best_edit = edit;
695                    best_next_equal = (next_equal1, next_equal2);
696                }
697            }
698
699            if original_prev_len != best_prev_equal.0.len {
700                // We have an improvement, save it back to the diff.
701                if best_next_equal.0.is_empty() {
702                    diffs.remove(pointer + 1);
703                } else {
704                    diffs[pointer + 1] = Diff::Equal(best_next_equal.0, best_next_equal.1);
705                }
706                diffs[pointer] = best_edit;
707                if best_prev_equal.0.is_empty() {
708                    diffs.remove(pointer - 1);
709                    pointer -= 1;
710                } else {
711                    diffs[pointer - 1] = Diff::Equal(best_prev_equal.0, best_prev_equal.1);
712                }
713            }
714        }
715        pointer += 1;
716    }
717}
718
719// Given two strings, compute a score representing whether the internal boundary
720// falls on logical boundaries.
721//
722// Scores range from 6 (best) to 0 (worst).
723fn cleanup_semantic_score(one: Range, two: Range) -> usize {
724    if one.is_empty() || two.is_empty() {
725        // Edges are the best.
726        return 6;
727    }
728
729    // Each port of this function behaves slightly differently due to subtle
730    // differences in each language's definition of things like 'whitespace'.
731    // Since this function's purpose is largely cosmetic, the choice has been
732    // made to use each language's native features rather than force total
733    // conformity.
734    let char1 = one.chars().next_back().unwrap();
735    let char2 = two.chars().next().unwrap();
736    let non_alphanumeric1 = !char1.is_ascii_alphanumeric();
737    let non_alphanumeric2 = !char2.is_ascii_alphanumeric();
738    let whitespace1 = non_alphanumeric1 && char1.is_ascii_whitespace();
739    let whitespace2 = non_alphanumeric2 && char2.is_ascii_whitespace();
740    let line_break1 = whitespace1 && char1.is_control();
741    let line_break2 = whitespace2 && char2.is_control();
742    let blank_line1 =
743        line_break1 && (one.ends_with(['\n', '\n']) || one.ends_with(['\n', '\r', '\n']));
744    let blank_line2 =
745        line_break2 && (two.starts_with(['\n', '\n']) || two.starts_with(['\r', '\n', '\r', '\n']));
746
747    if blank_line1 || blank_line2 {
748        // Five points for blank lines.
749        5
750    } else if line_break1 || line_break2 {
751        // Four points for line breaks.
752        4
753    } else if non_alphanumeric1 && !whitespace1 && whitespace2 {
754        // Three points for end of sentences.
755        3
756    } else if whitespace1 || whitespace2 {
757        // Two points for whitespace.
758        2
759    } else if non_alphanumeric1 || non_alphanumeric2 {
760        // One point for non-alphanumeric.
761        1
762    } else {
763        0
764    }
765}
766
767// Reorder and merge like edit sections. Merge equalities. Any edit section can
768// move as long as it doesn't cross an equality.
769fn cleanup_merge(solution: &mut Solution) {
770    let diffs = &mut solution.diffs;
771    while !diffs.is_empty() {
772        diffs.push(Diff::Equal(
773            solution.text1.substring(solution.text1.len..),
774            solution.text2.substring(solution.text2.len..),
775        )); // Add a dummy entry at the end.
776        let mut pointer = 0;
777        let mut count_delete = 0;
778        let mut count_insert = 0;
779        let mut text_delete = Range::empty();
780        let mut text_insert = Range::empty();
781        while let Some(&this_diff) = diffs.get(pointer) {
782            match this_diff {
783                Diff::Insert(text) => {
784                    count_insert += 1;
785                    if text_insert.is_empty() {
786                        text_insert = text;
787                    } else {
788                        text_insert.len += text.len;
789                    }
790                }
791                Diff::Delete(text) => {
792                    count_delete += 1;
793                    if text_delete.is_empty() {
794                        text_delete = text;
795                    } else {
796                        text_delete.len += text.len;
797                    }
798                }
799                Diff::Equal(text, _) => {
800                    let count_both = count_delete + count_insert;
801                    if count_both > 1 {
802                        let both_types = count_delete != 0 && count_insert != 0;
803                        // Delete the offending records.
804                        diffs.drain(pointer - count_both..pointer);
805                        pointer -= count_both;
806                        if both_types {
807                            // Factor out any common prefix.
808                            let common_length = common_prefix(text_insert, text_delete);
809                            if common_length != 0 {
810                                if pointer > 0 {
811                                    match &mut diffs[pointer - 1] {
812                                        Diff::Equal(this_diff1, this_diff2) => {
813                                            this_diff1.len += common_length;
814                                            this_diff2.len += common_length;
815                                        }
816                                        _ => unreachable!(
817                                            "previous diff should have been an equality"
818                                        ),
819                                    }
820                                } else {
821                                    diffs.insert(
822                                        pointer,
823                                        Diff::Equal(
824                                            text_delete.substring(..common_length),
825                                            text_insert.substring(..common_length),
826                                        ),
827                                    );
828                                    pointer += 1;
829                                }
830                                text_insert = text_insert.substring(common_length..);
831                                text_delete = text_delete.substring(common_length..);
832                            }
833                            // Factor out any common suffix.
834                            let common_length = common_suffix(text_insert, text_delete);
835                            if common_length != 0 {
836                                diffs[pointer].grow_left(common_length);
837                                text_insert.len -= common_length;
838                                text_delete.len -= common_length;
839                            }
840                        }
841                        // Insert the merged records.
842                        if !text_delete.is_empty() {
843                            diffs.insert(pointer, Diff::Delete(text_delete));
844                            pointer += 1;
845                        }
846                        if !text_insert.is_empty() {
847                            diffs.insert(pointer, Diff::Insert(text_insert));
848                            pointer += 1;
849                        }
850                    } else if pointer > 0 {
851                        if let Some(Diff::Equal(prev_equal1, prev_equal2)) =
852                            diffs.get_mut(pointer - 1)
853                        {
854                            // Merge this equality with the previous one.
855                            prev_equal1.len += text.len;
856                            prev_equal2.len += text.len;
857                            diffs.remove(pointer);
858                            pointer -= 1;
859                        }
860                    }
861                    count_insert = 0;
862                    count_delete = 0;
863                    text_delete = Range::empty();
864                    text_insert = Range::empty();
865                }
866            }
867            pointer += 1;
868        }
869        if diffs.last().unwrap().text().is_empty() {
870            diffs.pop(); // Remove the dummy entry at the end.
871        }
872
873        // Second pass: look for single edits surrounded on both sides by equalities
874        // which can be shifted sideways to eliminate an equality.
875        // e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
876        let mut changes = false;
877        let mut pointer = 1;
878        // Intentionally ignore the first and last element (don't need checking).
879        while let Some(&next_diff) = diffs.get(pointer + 1) {
880            let prev_diff = diffs[pointer - 1];
881            let this_diff = diffs[pointer];
882            if let (Diff::Equal(prev_diff, _), Diff::Equal(next_diff, _)) = (prev_diff, next_diff) {
883                // This is a single edit surrounded by equalities.
884                if this_diff.text().ends_with(prev_diff) {
885                    // Shift the edit over the previous equality.
886                    diffs[pointer].shift_left(prev_diff.len);
887                    diffs[pointer + 1].grow_left(prev_diff.len);
888                    diffs.remove(pointer - 1); // Delete prev_diff.
889                    changes = true;
890                } else if this_diff.text().starts_with(next_diff) {
891                    // Shift the edit over the next equality.
892                    diffs[pointer - 1].grow_right(next_diff.len);
893                    diffs[pointer].shift_right(next_diff.len);
894                    diffs.remove(pointer + 1); // Delete next_diff.
895                    changes = true;
896                }
897            }
898            pointer += 1;
899        }
900        // If shifts were made, the diff needs reordering and another shift sweep.
901        if !changes {
902            return;
903        }
904    }
905}
906
907impl Debug for Chunk<'_> {
908    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
909        let (name, text) = match *self {
910            Chunk::Equal(text) => ("Equal", text),
911            Chunk::Delete(text) => ("Delete", text),
912            Chunk::Insert(text) => ("Insert", text),
913        };
914        write!(formatter, "{}({:?})", name, text)
915    }
916}
917
918impl Debug for Diff<'_, '_> {
919    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
920        let (name, range) = match *self {
921            Diff::Equal(range, _) => ("Equal", range),
922            Diff::Delete(range) => ("Delete", range),
923            Diff::Insert(range) => ("Insert", range),
924        };
925        formatter.write_str(name)?;
926        formatter.write_str("(\"")?;
927        for ch in range.chars() {
928            if ch == '\'' {
929                // escape_debug turns this into "\'" which is unnecessary.
930                formatter.write_char(ch)?;
931            } else {
932                Display::fmt(&ch.escape_debug(), formatter)?;
933            }
934        }
935        formatter.write_str("\")")?;
936        Ok(())
937    }
938}