superdiff/
comp.rs

1use crate::cli::Cli;
2use crate::types::{CompFile, ComparisonFn, Match, Matches, MatchesLookup};
3
4use std::sync::mpsc;
5
6const INSERTION_COST: usize = 1;
7const DELETION_COST: usize = 1;
8const SUBSTITUTION_COST: usize = 1;
9
10/// Create a comparison function based on the given threshold.
11///
12/// If the threshold is 0, we use string comparison. If not, we use Levenshtein distance.
13pub fn comparison_lambda(args: &Cli) -> ComparisonFn {
14    let threshold = args.lev_threshold;
15    if threshold == 0 {
16        Box::new(move |x, y| x == y)
17    } else {
18        Box::new(move |x, y| levenshtein_distance(x, y, threshold) <= threshold)
19    }
20}
21
22/// Find block length of the matching code block.
23///
24/// Stops comparison when we reach the end of the file, or if the files are the same and the
25/// original index hits the occurrance index. This stops code blocks from "eating" the other code
26/// block (i.e. no nested overlapping blocks that are similar).
27fn get_max_block_size(comp: &ComparisonFn, f1: &CompFile, f2: &CompFile) -> usize {
28    let mut block_length = 1;
29
30    loop {
31        let i1 = f1.start + block_length;
32        let i2 = f2.start + block_length;
33
34        if f1.file == f2.file && i1 == f2.start {
35            return block_length;
36        }
37
38        if i1 >= f1.lines.len() || i2 >= f2.lines.len() {
39            return block_length;
40        }
41
42        if comp(&f1.lines[i1], &f2.lines[i2]) {
43            block_length += 1;
44        } else {
45            return block_length;
46        }
47    }
48}
49
50/// Add or remove entries from lookup and matches based on given pair of matches.
51///
52/// There are 4 situations that we prepare for.
53///
54/// 1. **The pair of matches both exist in the same bucket.** We don't need to do anything.
55/// 2. **The pair of matches both exist in different buckets.** We combine both buckets.
56/// 3. **One of the pair of matches exist in a bucket.** We put the other one in that bucket.
57/// 4. **None of the matches exist in any bucket.** We put one of them in a bucket labelled with
58///    the other.
59pub fn update_matches(
60    (a, b): (Match, Match),
61    (where_is_match, matches_hash): (&mut MatchesLookup, &mut Matches),
62) {
63    let mut where_is_match_to_insert = Vec::new();
64    let (refa, refb) = (where_is_match.0.get(&a), where_is_match.0.get(&b));
65    match (refa, refb, &a, &b) {
66        (Some(refa), Some(refb), _, _) if refa != refb => {
67            // Reassign all of refb references to refa
68            let mut refb_v = matches_hash.0.remove(refb).unwrap();
69            refb_v.push(refb.clone());
70            for block in &refb_v {
71                where_is_match_to_insert.push((block.clone(), refa.clone()));
72            }
73
74            // Append all of the refb into refa's bucket
75            matches_hash
76                .0
77                .entry(refa.clone())
78                .and_modify(|v| v.append(&mut refb_v));
79        }
80        (Some(refblock), None, _, b) | (None, Some(refblock), b, _) => {
81            matches_hash
82                .0
83                .entry(refblock.clone())
84                .and_modify(|v| v.push(b.clone()));
85
86            where_is_match_to_insert.push((b.clone(), refblock.clone()));
87        }
88        (None, None, a, b) => {
89            matches_hash.0.insert(b.clone(), vec![a.clone()]);
90
91            where_is_match_to_insert.push((a.clone(), b.clone()));
92            where_is_match_to_insert.push((b.clone(), b.clone()));
93        }
94        _ => {}
95    }
96
97    for (key, val) in where_is_match_to_insert {
98        where_is_match.0.insert(key, val);
99    }
100}
101
102pub fn get_matches_from_2_files(
103    args: &Cli,
104    tx: &mpsc::Sender<(Match, Match)>,
105    comp: &ComparisonFn,
106    (mut f1, mut f2): (CompFile, CompFile),
107) {
108    f1.start = 0;
109
110    while f1.start < f1.lines.len() {
111        // Don't consider line lengths below the threshold
112        if f1.current_line().len() < args.line_threshold {
113            f1.start += 1;
114            continue;
115        }
116
117        f2.start = if f1.file == f2.file { f1.start + 1 } else { 0 };
118        let mut max_block_length = 1;
119
120        while f2.start < f2.lines.len() {
121            if comp(f1.current_line(), f2.current_line()) {
122                let block_length = get_max_block_size(comp, &f1, &f2);
123
124                if block_length < args.block_threshold {
125                    f2.start += block_length;
126                    continue;
127                }
128
129                let matches = Match::from_compfiles(&f1, &f2, block_length);
130                tx.send(matches).unwrap_or(());
131
132                f2.start += block_length;
133                max_block_length = std::cmp::max(max_block_length, block_length);
134            } else {
135                f2.start += 1;
136            }
137        }
138
139        f1.start += max_block_length;
140    }
141}
142
143/// Make a `Vec<char>`.
144///
145/// We use a preallocated `Vec` instead of `.collect()` to avoid allocation penalties.
146fn to_char_vec(s: &str) -> Vec<char> {
147    let mut v = Vec::with_capacity(s.len());
148
149    for c in s.chars() {
150        v.push(c);
151    }
152
153    v
154}
155
156/// Compute the edit distance of 2 strings, with shortcuts.
157///
158/// Modified from wikipedia pseudocode for matrix approach (no recursion).
159///
160/// For strings x and y with length m and n respectively, we create an m+1 by n+1 matrix
161/// (represented by 1d array) of costs where moving to the right constitutes as inserting a
162/// character from y; moving down constitutes as deleting a character from y; moving diagonally
163/// across constitutes as substituting a character from y into a.
164///
165/// We stop computing if we find that nothing of our current row is under the threshold, in which
166/// case we would exit early.
167///
168/// We can also stop computing if we know that the threshold is greater than m + n, which is the
169/// maximum.
170///
171/// This algorithm runs at a time complexity of O(mn).
172#[allow(clippy::needless_range_loop)]
173pub fn levenshtein_distance(x: &str, y: &str, threshold: usize) -> usize {
174    let (x, y) = (to_char_vec(x), to_char_vec(y));
175    let (m, n) = (x.len(), y.len());
176    let mut d = vec![0usize; (m + 1) * (n + 1)];
177    let size = m + 1;
178
179    // Distance is at most the length of the longer string
180    if threshold >= std::cmp::max(m, n) {
181        return threshold;
182    }
183
184    // Distance is at least the absolute value of the difference in sizes of the two strings
185    if threshold < m.abs_diff(n) {
186        return threshold + 1;
187    }
188
189    for i in 1..(m + 1) {
190        d[i] = i;
191    }
192
193    for j in 1..(n + 1) {
194        d[j * size] = j;
195    }
196
197    for j in 1..(n + 1) {
198        let mut has_changed_row = false;
199
200        for i in 1..(m + 1) {
201            let sub_cost = if x[i - 1] == y[j - 1] {
202                0
203            } else {
204                SUBSTITUTION_COST
205            };
206            d[i + j * size] = std::cmp::min(
207                d[(i - 1) + j * size] + INSERTION_COST,
208                std::cmp::min(
209                    d[i + (j - 1) * size] + DELETION_COST,
210                    d[(i - 1) + (j - 1) * size] + sub_cost,
211                ),
212            );
213
214            if d[i + j * size] <= threshold {
215                has_changed_row = true;
216            }
217        }
218
219        // Guarantee to not pass the threshold check
220        if !has_changed_row {
221            return threshold + 1;
222        }
223    }
224
225    d[m + n * size]
226}
227
228#[cfg(test)]
229mod tests {
230    use super::levenshtein_distance;
231
232    macro_rules! check_lev {
233        ( $a:literal, $b:literal, $t:literal ) => {{
234            check_lev!($a, $b, $t, $t);
235        }};
236
237        ( $a:literal, $b:literal, $t:literal, $e:literal ) => {{
238            let dist = levenshtein_distance($a, $b, $t);
239            assert_eq!(
240                dist, $e,
241                "levenshtein_distance({}, {}, {}) = {}, expected {}",
242                $a, $b, $t, dist, $e
243            );
244        }};
245    }
246
247    #[test]
248    fn test_lev_distance() {
249        // Normal use of function
250        check_lev!("the same the same", "the same the same", 10, 0);
251        check_lev!("kitten", "sitting", 3);
252        check_lev!("train", "shine", 4);
253        check_lev!("a", "aaa", 2);
254        // Maximum threshold
255        check_lev!("arst", "zxcv", 4);
256        // Short circuit at the end
257        check_lev!("ieanrstien", "            ", 5, 6);
258        // Short circuit at the start
259        check_lev!("arstarst", "zxcv", 100, 100);
260        // A bit tight
261        check_lev!("the same", "the same", 0);
262    }
263}