1use edit_distance::edit_distance;
2use rayon::prelude::*;
3use std::collections::HashMap;
4
5use crate::matching::min_edit_errors;
6use crate::qgram::*;
7
8type RightError = usize;
9type SuffixSumArray = Vec<(Loc, RightError)>;
10
11fn compare_qgrams(
26 x: &PosQGramArray,
27 y: &PosQGramArray,
28 invert: &InvertedIndex,
29 tau: usize,
30) -> (PosQGramArray, usize) {
31 let mut i: usize = 0;
32 let mut j: usize = 0;
33 let mut epsilon: usize = 0;
34 let mut loose_mismatch: PosQGramArray = PosQGramArray::new();
35
36 let comparator = |x: &PosQGramArray,
37 y: &PosQGramArray,
38 i: &mut usize,
39 j: usize,
40 epsilon: &mut usize,
41 loose_mismatch: &mut PosQGramArray| {
42 if ((*i >= 1) && (x[*i].token != x[*i - 1].token))
43 || ((j >= 1) && (x[*i].token != y[j - 1].token))
44 || ((j >= 1) && ((x[*i].loc as isize - y[j - 1].loc as isize).abs() > tau as isize))
45 {
46 loose_mismatch.push(x[*i].clone());
47 }
48 *i += 1;
49 *epsilon += 1;
50 };
51
52 let get_len = |token_x: &Token| invert.get(token_x).unwrap().1;
53
54 while i < x.len() && j < y.len() {
55 if x[i].token == y[j].token {
56 if (x[i].loc as isize - y[j].loc as isize).abs() <= tau as isize {
57 i += 1;
58 j += 1;
59 } else if x[i].loc < y[j].loc {
60 comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
61 } else {
62 j += 1;
63 }
64 } else if (get_len(&x[i].token) < get_len(&y[j].token))
65 || ((get_len(&x[i].token) == get_len(&y[j].token))
66 & ((x[i].token.as_bytes()) < (y[j].token.as_bytes())))
67 {
68 comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
69 } else {
70 j += 1;
71 }
72 }
73 while i < x.len() {
74 comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
75 }
76
77 loose_mismatch.sort_by_location();
78
79 (loose_mismatch, epsilon)
80}
81
82fn sum_right_errors(qgram_array: &mut PosQGramArray, q: usize) -> Option<SuffixSumArray> {
94 if qgram_array.len() == 0 {
95 None
96 } else {
97 qgram_array.reverse();
98 let mut cnt: usize = 0;
99 let mut loc: usize = qgram_array[0].loc + 1;
100
101 let mut suffix_sum: SuffixSumArray = Vec::new();
102
103 qgram_array.iter().for_each(|qgram| {
104 if qgram.loc < loc {
105 cnt += 1;
106 suffix_sum.push((qgram.loc, cnt));
107 if qgram.loc + 1 >= q {
108 loc = qgram.loc + 1 - q;
109 } else {
110 loc = 0;
111 }
112 }
113 });
114
115 qgram_array.reverse();
116 Some(suffix_sum)
117 }
118}
119
120fn frequency_histogram(s: &str) -> HashMap<char, usize> {
121 let mut map: HashMap<char, usize> = HashMap::new();
122
123 s.chars().for_each(|c| {
124 map.entry(c).and_modify(|v| *v += 1).or_insert(1);
125 });
126
127 map
128}
129
130fn l1_distance(s: &str, t: &str, lo: usize, hi: usize) -> usize {
142 let h_s: HashMap<char, usize> = frequency_histogram(&s[lo..hi]);
143 let h_t: HashMap<char, usize> = frequency_histogram(&t[lo..hi]);
144
145 let mut keys: Vec<&char> = h_s.keys().collect::<Vec<&char>>();
146 keys.append(&mut h_t.keys().collect::<Vec<&char>>());
147 keys.par_sort();
148 keys.dedup();
149
150 let mut v_s: Vec<usize> = Vec::with_capacity(keys.len());
151 let mut v_t: Vec<usize> = Vec::with_capacity(keys.len());
152
153 keys.iter().for_each(|k| {
154 v_s.push(*h_s.get(k).unwrap_or(&0));
155 v_t.push(*h_t.get(k).unwrap_or(&0));
156 });
157
158 let distance: usize = v_s
159 .par_iter()
160 .zip_eq(v_t.par_iter())
161 .map(|(a, b)| (*a as isize - *b as isize).abs() as usize)
162 .sum();
163 distance
164}
165
166fn content_filter(
181 from: &str,
182 to: &str,
183 mismatch: PosQGramArray,
184 suffix_sum: SuffixSumArray,
185 q: usize,
186 tau: usize,
187) -> Option<usize> {
188 let mut i: usize = 1;
189 let mut j: usize = 0;
190 let mut epsilon: usize;
191
192 let epsi = |s, t, mismatch: &PosQGramArray, q, ii: usize, jj: usize| {
193 let l1 = l1_distance(s, t, mismatch[jj].loc, mismatch[ii - 1].loc + q - 1);
194 let right_error = suffix_sum
195 .par_iter()
196 .find_first(|e| e.0 >= mismatch[ii - 1].loc + q) .unwrap_or(&(0, 0)) .1; l1 / 2 + right_error };
201
202 if mismatch.len() >= 2 {
204 while i < mismatch.len() {
205 if mismatch[i].loc - mismatch[i - 1].loc > 1 {
206 epsilon = epsi(from, to, &mismatch, q, i, j);
207 if epsilon > tau {
208 return Some(2 * tau + 1);
209 }
210 j = i;
211 }
212 i += 1;
213 }
214
215 let epsilon = epsi(from, to, &mismatch, q, i, j);
216 Some(epsilon)
217 } else {
218 None
219 }
220}
221
222pub fn verify(
240 x: Vec<PosQGram>,
241 line_id: usize,
242 line_content: &str,
243 y: &mut PosQGramArray,
244 candidate_id: usize,
245 candidate_content: &str,
246 inverted: &InvertedIndex,
247 q: usize,
248 tau: usize,
249) -> Option<(ID, Vec<(ID, usize)>)> {
250 #[cfg(feature = "cli")]
251 debug!(
252 "Verify `{}: {}` against `{}: {}`",
253 line_id, line_content, candidate_id, candidate_content
254 );
255 let mut out: Vec<(ID, usize)> = Vec::new();
256
257 let mut x = PosQGramArray { inner: x };
259 x.sort_by_frequency(inverted);
260 y.sort_by_frequency(inverted);
261
262 let (mut loose_mismatch, epsilon_1) = compare_qgrams(&x, &y, &inverted, q);
263 #[cfg(feature = "cli")]
264 trace!(
265 "x: {}\n y: {}\n Loosely-Mismatch: {}\n # of Strongly Mismatch: {}",
266 x,
267 y,
268 loose_mismatch,
269 epsilon_1
270 );
271
272 #[cfg(feature = "cli")]
274 trace!(
275 "Count filtering on `{}: {}`: epsilon_1 = {}",
276 candidate_id,
277 candidate_content,
278 epsilon_1
279 );
280 if epsilon_1 <= q * tau {
281 loose_mismatch.par_sort_by_key(|qgram| qgram.loc);
284 let epsilon_2 = min_edit_errors(&loose_mismatch, q);
285
286 #[cfg(feature = "cli")]
288 trace!(
289 "Location-based filtering on `{}: {}`: epsilon_2 = {}",
290 candidate_id,
291 candidate_content,
292 epsilon_2
293 );
294 if epsilon_2 <= tau {
295 if let Some(right_error) = sum_right_errors(&mut loose_mismatch, q) {
296 let suffix_sum_array: SuffixSumArray = right_error;
297 #[cfg(feature = "cli")]
298 trace!("Suffix Sum Array: {:?}", suffix_sum_array);
299 let epsilon_3 = content_filter(
300 line_content,
301 candidate_content,
302 loose_mismatch,
303 suffix_sum_array,
304 q,
305 tau,
306 );
307
308 if let Some(v) = epsilon_3 {
310 #[cfg(feature = "cli")]
311 trace!(
312 "Content-based filtering on `{}: {}`: epsilon_3 = {}",
313 candidate_id,
314 candidate_content,
315 v,
316 );
317 if v <= tau {
319 let ed: usize = edit_distance(line_content, candidate_content);
320 #[cfg(feature = "cli")]
321 trace!(
322 "Ed of `{}: {}` against `{}: {}`",
323 line_id,
324 line_content,
325 candidate_id,
326 candidate_content
327 );
328 if ed <= tau {
329 #[cfg(feature = "cli")]
330 trace!(
331 "Add `{}: {}` to matched set of `{}: {}`",
332 line_id,
333 line_content,
334 candidate_id,
335 candidate_content
336 );
337 out.push((candidate_id, ed));
338 }
339 }
340 } else {
341 let ed: usize = edit_distance(line_content, candidate_content);
343 #[cfg(feature = "cli")]
344 trace!(
345 "Ed of `{}: {}` against `{}: {}`",
346 line_id,
347 line_content,
348 candidate_id,
349 candidate_content
350 );
351 if ed <= tau {
352 #[cfg(feature = "cli")]
353 trace!(
354 "Add `{}: {}` to matched set of `{}: {}`",
355 line_id,
356 line_content,
357 candidate_id,
358 candidate_content
359 );
360 out.push((candidate_id, ed));
361 }
362 }
363 } else {
364 let ed: usize = edit_distance(line_content, candidate_content);
366 #[cfg(feature = "cli")]
367 trace!(
368 "Ed of `{}: {}` against `{}: {}`",
369 line_id,
370 line_content,
371 candidate_id,
372 candidate_content
373 );
374 if ed <= tau {
375 #[cfg(feature = "cli")]
376 trace!(
377 "Add `{}: {}` to matched set of `{}: {}`",
378 line_id,
379 line_content,
380 candidate_id,
381 candidate_content
382 );
383 out.push((candidate_id, ed));
384 }
385 }
386 }
387 }
388
389 if !out.is_empty() {
390 Some((line_id, out))
391 } else {
392 None
393 }
394}