Skip to main content

tldr_cli/commands/remaining/difftastic/
lcs_diff.rs

1//! A fast diff for linear content, particularly lines of text.
2//!
3//! This file uses the Wu algorithm, using the `wu-diff` crate.
4//!
5//! Difftastic has the files huge_cpp_1.cpp and huge_cpp_2.cpp in the
6//! sample_files directory for a performance stress test. These files
7//! are 22 MiB and 590,000 lines.
8
9use std::hash::Hash;
10
11use super::hash::{DftHashMap, DftHashSet};
12
13#[derive(Debug, PartialEq)]
14pub enum DiffResult<T> {
15    Left(T),
16    Both(T, T),
17    Right(T),
18}
19
20/// Compute a linear diff between `lhs` and `rhs`.
21pub fn slice<'a, T: PartialEq + Clone>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
22    wu_diff::diff(lhs, rhs)
23        .into_iter()
24        .map(|result| match result {
25            wu_diff::DiffResult::Removed(r) => DiffResult::Left(&lhs[r.old_index.unwrap()]),
26            wu_diff::DiffResult::Common(c) => {
27                let lhs_id = c.old_index.unwrap();
28                let rhs_id = c.new_index.unwrap();
29                DiffResult::Both(&lhs[lhs_id], &rhs[rhs_id])
30            }
31            wu_diff::DiffResult::Added(a) => DiffResult::Right(&rhs[a.new_index.unwrap()]),
32        })
33        .collect::<Vec<_>>()
34}
35
36/// Compute a linear diff between `lhs` and `rhs`, but use hashed
37/// values internally.
38///
39/// This is faster when equality checks on `T` are expensive, such as
40/// large strings.
41pub fn slice_by_hash<'a, T: Eq + Hash>(lhs: &'a [T], rhs: &'a [T]) -> Vec<DiffResult<&'a T>> {
42    // Compute a unique numeric value for each item, use that for
43    // diffing, then return diff results in terms of the original
44    // type.
45    //
46    // This is the decorate-sort-undecorate pattern, or Schwartzian
47    // transform, for diffing.
48    let mut value_ids: DftHashMap<&T, u32> = DftHashMap::default();
49    let mut id_values: DftHashMap<u32, &T> = DftHashMap::default();
50
51    let mut lhs_ids = Vec::with_capacity(lhs.len());
52    for value in lhs {
53        let id: u32 = match value_ids.get(value) {
54            Some(id) => *id,
55            None => {
56                let new_id = value_ids.len() as u32;
57                value_ids.insert(value, new_id);
58                id_values.insert(new_id, value);
59                new_id
60            }
61        };
62        lhs_ids.push(id);
63    }
64
65    let mut rhs_ids = Vec::with_capacity(rhs.len());
66    for value in rhs {
67        let id = match value_ids.get(value) {
68            Some(id) => *id,
69            None => {
70                let new_id = value_ids.len() as u32;
71                value_ids.insert(value, new_id);
72                id_values.insert(new_id, value);
73                new_id
74            }
75        };
76        rhs_ids.push(id);
77    }
78
79    slice(&lhs_ids[..], &rhs_ids[..])
80        .into_iter()
81        .map(|result| match result {
82            DiffResult::Left(id) => DiffResult::Left(*id_values.get(id).unwrap()),
83            DiffResult::Both(lhs_id, rhs_id) => DiffResult::Both(
84                *id_values.get(lhs_id).unwrap(),
85                *id_values.get(rhs_id).unwrap(),
86            ),
87            DiffResult::Right(id) => DiffResult::Right(*id_values.get(id).unwrap()),
88        })
89        .collect::<Vec<_>>()
90}
91
92/// Compute the linear diff between `lhs` and `rhs`. If there are
93/// items that only occur on a single side, mark them as novel without
94/// processing them with Myer's diff.
95///
96/// This is substantially faster than `slice`, when `lhs` and `rhs`
97/// have few items in common.
98///
99/// (This heuristic is used in traditional diff tools too, such as GNU
100/// diff.)
101pub fn slice_unique_by_hash<'a, T: Eq + Clone + Hash>(
102    lhs: &'a [T],
103    rhs: &'a [T],
104) -> Vec<DiffResult<&'a T>> {
105    let mut lhs_set = DftHashSet::default();
106    for item in lhs {
107        lhs_set.insert(item);
108    }
109    let mut rhs_set = DftHashSet::default();
110    for item in rhs {
111        rhs_set.insert(item);
112    }
113
114    let lhs_without_unique: Vec<&'a T> = lhs.iter().filter(|n| rhs_set.contains(n)).collect();
115    let rhs_without_unique: Vec<&'a T> = rhs.iter().filter(|n| lhs_set.contains(n)).collect();
116
117    let mut res: Vec<DiffResult<&'a T>> = Vec::with_capacity(lhs.len());
118    let mut lhs_i = 0;
119    let mut rhs_i = 0;
120
121    for item in slice_by_hash(&lhs_without_unique, &rhs_without_unique) {
122        match item {
123            DiffResult::Left(lhs_item) => {
124                while lhs_i < lhs.len() {
125                    if &lhs[lhs_i] != *lhs_item {
126                        res.push(DiffResult::Left(&lhs[lhs_i]));
127                        lhs_i += 1;
128                    } else {
129                        break;
130                    }
131                }
132
133                res.push(DiffResult::Left(*lhs_item));
134                lhs_i += 1;
135            }
136            DiffResult::Both(lhs_item, rhs_item) => {
137                while lhs_i < lhs.len() {
138                    if &lhs[lhs_i] != *lhs_item {
139                        res.push(DiffResult::Left(&lhs[lhs_i]));
140                        lhs_i += 1;
141                    } else {
142                        break;
143                    }
144                }
145
146                while rhs_i < rhs.len() {
147                    if &rhs[rhs_i] != *rhs_item {
148                        res.push(DiffResult::Right(&rhs[rhs_i]));
149                        rhs_i += 1;
150                    } else {
151                        break;
152                    }
153                }
154
155                res.push(DiffResult::Both(*lhs_item, *rhs_item));
156                lhs_i += 1;
157                rhs_i += 1;
158            }
159            DiffResult::Right(rhs_item) => {
160                while rhs_i < rhs.len() {
161                    if &rhs[rhs_i] != *rhs_item {
162                        res.push(DiffResult::Right(&rhs[rhs_i]));
163                        rhs_i += 1;
164                    } else {
165                        break;
166                    }
167                }
168
169                res.push(DiffResult::Right(*rhs_item));
170                rhs_i += 1;
171            }
172        }
173    }
174
175    while lhs_i < lhs.len() {
176        res.push(DiffResult::Left(&lhs[lhs_i]));
177        lhs_i += 1;
178    }
179    while rhs_i < rhs.len() {
180        res.push(DiffResult::Right(&rhs[rhs_i]));
181        rhs_i += 1;
182    }
183
184    res
185}