1use crate::cli::Cli;
2use crate::types::{CompFile, ComparisonFn, Match, Matches, MatchesLookup};
3
4use std::sync::mpsc;
5
6const INSERTION_COST: usize = 1;
7const DELETION_COST: usize = 1;
8const SUBSTITUTION_COST: usize = 1;
9
10pub fn comparison_lambda(args: &Cli) -> ComparisonFn {
14 let threshold = args.lev_threshold;
15 if threshold == 0 {
16 Box::new(move |x, y| x == y)
17 } else {
18 Box::new(move |x, y| levenshtein_distance(x, y, threshold) <= threshold)
19 }
20}
21
22fn get_max_block_size(comp: &ComparisonFn, f1: &CompFile, f2: &CompFile) -> usize {
28 let mut block_length = 1;
29
30 loop {
31 let i1 = f1.start + block_length;
32 let i2 = f2.start + block_length;
33
34 if f1.file == f2.file && i1 == f2.start {
35 return block_length;
36 }
37
38 if i1 >= f1.lines.len() || i2 >= f2.lines.len() {
39 return block_length;
40 }
41
42 if comp(&f1.lines[i1], &f2.lines[i2]) {
43 block_length += 1;
44 } else {
45 return block_length;
46 }
47 }
48}
49
50pub fn update_matches(
60 (a, b): (Match, Match),
61 (where_is_match, matches_hash): (&mut MatchesLookup, &mut Matches),
62) {
63 let mut where_is_match_to_insert = Vec::new();
64 let (refa, refb) = (where_is_match.0.get(&a), where_is_match.0.get(&b));
65 match (refa, refb, &a, &b) {
66 (Some(refa), Some(refb), _, _) if refa != refb => {
67 let mut refb_v = matches_hash.0.remove(refb).unwrap();
69 refb_v.push(refb.clone());
70 for block in &refb_v {
71 where_is_match_to_insert.push((block.clone(), refa.clone()));
72 }
73
74 matches_hash
76 .0
77 .entry(refa.clone())
78 .and_modify(|v| v.append(&mut refb_v));
79 }
80 (Some(refblock), None, _, b) | (None, Some(refblock), b, _) => {
81 matches_hash
82 .0
83 .entry(refblock.clone())
84 .and_modify(|v| v.push(b.clone()));
85
86 where_is_match_to_insert.push((b.clone(), refblock.clone()));
87 }
88 (None, None, a, b) => {
89 matches_hash.0.insert(b.clone(), vec![a.clone()]);
90
91 where_is_match_to_insert.push((a.clone(), b.clone()));
92 where_is_match_to_insert.push((b.clone(), b.clone()));
93 }
94 _ => {}
95 }
96
97 for (key, val) in where_is_match_to_insert {
98 where_is_match.0.insert(key, val);
99 }
100}
101
102pub fn get_matches_from_2_files(
103 args: &Cli,
104 tx: &mpsc::Sender<(Match, Match)>,
105 comp: &ComparisonFn,
106 (mut f1, mut f2): (CompFile, CompFile),
107) {
108 f1.start = 0;
109
110 while f1.start < f1.lines.len() {
111 if f1.current_line().len() < args.line_threshold {
113 f1.start += 1;
114 continue;
115 }
116
117 f2.start = if f1.file == f2.file { f1.start + 1 } else { 0 };
118 let mut max_block_length = 1;
119
120 while f2.start < f2.lines.len() {
121 if comp(f1.current_line(), f2.current_line()) {
122 let block_length = get_max_block_size(comp, &f1, &f2);
123
124 if block_length < args.block_threshold {
125 f2.start += block_length;
126 continue;
127 }
128
129 let matches = Match::from_compfiles(&f1, &f2, block_length);
130 tx.send(matches).unwrap_or(());
131
132 f2.start += block_length;
133 max_block_length = std::cmp::max(max_block_length, block_length);
134 } else {
135 f2.start += 1;
136 }
137 }
138
139 f1.start += max_block_length;
140 }
141}
142
143fn to_char_vec(s: &str) -> Vec<char> {
147 let mut v = Vec::with_capacity(s.len());
148
149 for c in s.chars() {
150 v.push(c);
151 }
152
153 v
154}
155
156#[allow(clippy::needless_range_loop)]
173pub fn levenshtein_distance(x: &str, y: &str, threshold: usize) -> usize {
174 let (x, y) = (to_char_vec(x), to_char_vec(y));
175 let (m, n) = (x.len(), y.len());
176 let mut d = vec![0usize; (m + 1) * (n + 1)];
177 let size = m + 1;
178
179 if threshold >= std::cmp::max(m, n) {
181 return threshold;
182 }
183
184 if threshold < m.abs_diff(n) {
186 return threshold + 1;
187 }
188
189 for i in 1..(m + 1) {
190 d[i] = i;
191 }
192
193 for j in 1..(n + 1) {
194 d[j * size] = j;
195 }
196
197 for j in 1..(n + 1) {
198 let mut has_changed_row = false;
199
200 for i in 1..(m + 1) {
201 let sub_cost = if x[i - 1] == y[j - 1] {
202 0
203 } else {
204 SUBSTITUTION_COST
205 };
206 d[i + j * size] = std::cmp::min(
207 d[(i - 1) + j * size] + INSERTION_COST,
208 std::cmp::min(
209 d[i + (j - 1) * size] + DELETION_COST,
210 d[(i - 1) + (j - 1) * size] + sub_cost,
211 ),
212 );
213
214 if d[i + j * size] <= threshold {
215 has_changed_row = true;
216 }
217 }
218
219 if !has_changed_row {
221 return threshold + 1;
222 }
223 }
224
225 d[m + n * size]
226}
227
228#[cfg(test)]
229mod tests {
230 use super::levenshtein_distance;
231
232 macro_rules! check_lev {
233 ( $a:literal, $b:literal, $t:literal ) => {{
234 check_lev!($a, $b, $t, $t);
235 }};
236
237 ( $a:literal, $b:literal, $t:literal, $e:literal ) => {{
238 let dist = levenshtein_distance($a, $b, $t);
239 assert_eq!(
240 dist, $e,
241 "levenshtein_distance({}, {}, {}) = {}, expected {}",
242 $a, $b, $t, dist, $e
243 );
244 }};
245 }
246
247 #[test]
248 fn test_lev_distance() {
249 check_lev!("the same the same", "the same the same", 10, 0);
251 check_lev!("kitten", "sitting", 3);
252 check_lev!("train", "shine", 4);
253 check_lev!("a", "aaa", 2);
254 check_lev!("arst", "zxcv", 4);
256 check_lev!("ieanrstien", " ", 5, 6);
258 check_lev!("arstarst", "zxcv", 100, 100);
260 check_lev!("the same", "the same", 0);
262 }
263}