ed_join/
verification.rs

1use edit_distance::edit_distance;
2use rayon::prelude::*;
3use std::collections::HashMap;
4
5use crate::matching::min_edit_errors;
6use crate::qgram::*;
7
8type RightError = usize;
9type SuffixSumArray = Vec<(Loc, RightError)>;
10
11// Algorithm 8
12/// Given two sorted q-gram arrays, in increasing order of location,
13/// find the set of loosely mismatching q-grams and the number of strictly mismatching q-grams.
14///
15/// # Parameters
16///
17///  * `x` and `y`: PosQGramArrays as `source` and `target` for matching.
18///  * `invert`: The inverted index.
19///  * `tau`: A positive integer as the tuning parameter for threshold for matching.
20///
21/// # Return
22///
23///  * A set of loosely mismatching q-grams from `x` to `y`.
24///  * The number of strictly mismatching q-grams from `x` to `y`.
25fn compare_qgrams(
26    x: &PosQGramArray,
27    y: &PosQGramArray,
28    invert: &InvertedIndex,
29    tau: usize,
30) -> (PosQGramArray, usize) {
31    let mut i: usize = 0;
32    let mut j: usize = 0;
33    let mut epsilon: usize = 0;
34    let mut loose_mismatch: PosQGramArray = PosQGramArray::new();
35
36    let comparator = |x: &PosQGramArray,
37                      y: &PosQGramArray,
38                      i: &mut usize,
39                      j: usize,
40                      epsilon: &mut usize,
41                      loose_mismatch: &mut PosQGramArray| {
42        if ((*i >= 1) && (x[*i].token != x[*i - 1].token))
43            || ((j >= 1) && (x[*i].token != y[j - 1].token))
44            || ((j >= 1) && ((x[*i].loc as isize - y[j - 1].loc as isize).abs() > tau as isize))
45        {
46            loose_mismatch.push(x[*i].clone());
47        }
48        *i += 1;
49        *epsilon += 1;
50    };
51
52    let get_len = |token_x: &Token| invert.get(token_x).unwrap().1;
53
54    while i < x.len() && j < y.len() {
55        if x[i].token == y[j].token {
56            if (x[i].loc as isize - y[j].loc as isize).abs() <= tau as isize {
57                i += 1;
58                j += 1;
59            } else if x[i].loc < y[j].loc {
60                comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
61            } else {
62                j += 1;
63            }
64        } else if (get_len(&x[i].token) < get_len(&y[j].token))
65            || ((get_len(&x[i].token) == get_len(&y[j].token))
66                & ((x[i].token.as_bytes()) < (y[j].token.as_bytes())))
67        {
68            comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
69        } else {
70            j += 1;
71        }
72    }
73    while i < x.len() {
74        comparator(x, y, &mut i, j, &mut epsilon, &mut loose_mismatch);
75    }
76
77    loose_mismatch.sort_by_location();
78
79    (loose_mismatch, epsilon)
80}
81
82// Based on Algorithm 2
83/// Given a set of q-grams, find the minimum number of edit operations in the suffix that destroys all q-grams.
84///
85/// # Parameters
86///
87///  * `qgram_array`: A PosQGramArray, i.e. a set of positional q-grams.
88///  * `q`: A positive integer as the tuning parameter for length of q-grams.
89///
90/// # Return
91///
92/// The minimum number of edit operations on the suffix that destroy all q-grams.
93fn sum_right_errors(qgram_array: &mut PosQGramArray, q: usize) -> Option<SuffixSumArray> {
94    if qgram_array.len() == 0 {
95        None
96    } else {
97        qgram_array.reverse();
98        let mut cnt: usize = 0;
99        let mut loc: usize = qgram_array[0].loc + 1;
100
101        let mut suffix_sum: SuffixSumArray = Vec::new();
102
103        qgram_array.iter().for_each(|qgram| {
104            if qgram.loc < loc {
105                cnt += 1;
106                suffix_sum.push((qgram.loc, cnt));
107                if qgram.loc + 1 >= q {
108                    loc = qgram.loc + 1 - q;
109                } else {
110                    loc = 0;
111                }
112            }
113        });
114
115        qgram_array.reverse();
116        Some(suffix_sum)
117    }
118}
119
120fn frequency_histogram(s: &str) -> HashMap<char, usize> {
121    let mut map: HashMap<char, usize> = HashMap::new();
122
123    s.chars().for_each(|c| {
124        map.entry(c).and_modify(|v| *v += 1).or_insert(1);
125    });
126
127    map
128}
129
130// Algorithm 6
131/// Given two strings, calculate their L1 distance.
132///
133/// # Parameters
134///
135///  * `s` and `t`: (Sub-)String that is under probing window.
136///  * `lo` and `hi`: Indicates the start and end point of the probing window.
137///
138/// # Return
139///
140/// L1 distance of the two given strings with given probing window.
141fn l1_distance(s: &str, t: &str, lo: usize, hi: usize) -> usize {
142    let h_s: HashMap<char, usize> = frequency_histogram(&s[lo..hi]);
143    let h_t: HashMap<char, usize> = frequency_histogram(&t[lo..hi]);
144
145    let mut keys: Vec<&char> = h_s.keys().collect::<Vec<&char>>();
146    keys.append(&mut h_t.keys().collect::<Vec<&char>>());
147    keys.par_sort();
148    keys.dedup();
149
150    let mut v_s: Vec<usize> = Vec::with_capacity(keys.len());
151    let mut v_t: Vec<usize> = Vec::with_capacity(keys.len());
152
153    keys.iter().for_each(|k| {
154        v_s.push(*h_s.get(k).unwrap_or(&0));
155        v_t.push(*h_t.get(k).unwrap_or(&0));
156    });
157
158    let distance: usize = v_s
159        .par_iter()
160        .zip_eq(v_t.par_iter())
161        .map(|(a, b)| (*a as isize - *b as isize).abs() as usize)
162        .sum();
163    distance
164}
165
166// Algorithm 5
167/// Content-based mismatch filtering by combining L1-distance and minimum edit errors in the suffix to the probing window.
168///
169/// # Parameters
170///
171///  * `from` and `to`: (Sub-)String that is under probing window.
172///  * `mismatch`: A PosQGramArray with loosely mismatching q-grams from `s` to `t`.
173///  * `suffix_sum`: A condensed suffix sum array.
174///  * `q`: A positive integer as the tuning parameter for length of q-grams.
175///  * `tau`: A positive integer as the tuning parameter for threshold for matching.
176///
177/// # Return
178///
179/// A lower bound of the edit distance from `s` to `t`.
180fn content_filter(
181    from: &str,
182    to: &str,
183    mismatch: PosQGramArray,
184    suffix_sum: SuffixSumArray,
185    q: usize,
186    tau: usize,
187) -> Option<usize> {
188    let mut i: usize = 1;
189    let mut j: usize = 0;
190    let mut epsilon: usize;
191
192    let epsi = |s, t, mismatch: &PosQGramArray, q, ii: usize, jj: usize| {
193        let l1 = l1_distance(s, t, mismatch[jj].loc, mismatch[ii - 1].loc + q - 1);
194        let right_error = suffix_sum
195            .par_iter()
196            .find_first(|e| e.0 >= mismatch[ii - 1].loc + q) // e is a PosQGram, e.0 is location
197            .unwrap_or(&(0, 0)) // returns (Loc, RightError)
198            .1; // returns RightError
199        l1 / 2 + right_error // NOTE: I believe author had a typo here and I fixed it
200    };
201
202    // otherwise index is out-of-bound
203    if mismatch.len() >= 2 {
204        while i < mismatch.len() {
205            if mismatch[i].loc - mismatch[i - 1].loc > 1 {
206                epsilon = epsi(from, to, &mismatch, q, i, j);
207                if epsilon > tau {
208                    return Some(2 * tau + 1);
209                }
210                j = i;
211            }
212            i += 1;
213        }
214
215        let epsilon = epsi(from, to, &mismatch, q, i, j);
216        Some(epsilon)
217    } else {
218        None
219    }
220}
221
222// Algorithm 7
223/// Given a string and a set of possible candidates for matching,
224/// verify whether each of the candidate is valid by various filters,
225/// and eventually output all matched candidates and corresponding edit distance.
226///
227/// # Parameters
228///
229/// * `x` and `y`: The `inner` of a PosQGramArray, based on a line of `doc_x` or `doc_y`, respectively
230/// * `line_id` and `candidate_id`: Line number of `x` and `y`, respectively
231/// * `line_content` and `candidate_content`: String representation of `x` and `y`, respectively
232/// * `inverted`: The inverted index.
233/// * `q`: A positive integer as the tuning parameter for length of q-grams.
234/// * `tau`: A positive integer as the tuning parameter for threshold for matching.
235///
236/// # Return
237///
238/// Verified matched paris from the candidates set.
239pub fn verify(
240    x: Vec<PosQGram>,
241    line_id: usize,
242    line_content: &str,
243    y: &mut PosQGramArray,
244    candidate_id: usize,
245    candidate_content: &str,
246    inverted: &InvertedIndex,
247    q: usize,
248    tau: usize,
249) -> Option<(ID, Vec<(ID, usize)>)> {
250    #[cfg(feature = "cli")]
251    debug!(
252        "Verify `{}: {}` against `{}: {}`",
253        line_id, line_content, candidate_id, candidate_content
254    );
255    let mut out: Vec<(ID, usize)> = Vec::new();
256
257    // PosQGramArray is only sorted in increasing order of location, now sort it in increasing order of frequency
258    let mut x = PosQGramArray { inner: x };
259    x.sort_by_frequency(inverted);
260    y.sort_by_frequency(inverted);
261
262    let (mut loose_mismatch, epsilon_1) = compare_qgrams(&x, &y, &inverted, q);
263    #[cfg(feature = "cli")]
264    trace!(
265        "x: {}\n y: {}\n Loosely-Mismatch: {}\n # of Strongly Mismatch: {}",
266        x,
267        y,
268        loose_mismatch,
269        epsilon_1
270    );
271
272    // count filtering
273    #[cfg(feature = "cli")]
274    trace!(
275        "Count filtering on `{}: {}`: epsilon_1 = {}",
276        candidate_id,
277        candidate_content,
278        epsilon_1
279    );
280    if epsilon_1 <= q * tau {
281        // loose_mismatch is a PosQGramArray, which is generated from &x, &y, who were sorted in increasing order of frequency
282        // now sort it in increasing order of location
283        loose_mismatch.par_sort_by_key(|qgram| qgram.loc);
284        let epsilon_2 = min_edit_errors(&loose_mismatch, q);
285
286        // location-based filtering
287        #[cfg(feature = "cli")]
288        trace!(
289            "Location-based filtering on `{}: {}`: epsilon_2 = {}",
290            candidate_id,
291            candidate_content,
292            epsilon_2
293        );
294        if epsilon_2 <= tau {
295            if let Some(right_error) = sum_right_errors(&mut loose_mismatch, q) {
296                let suffix_sum_array: SuffixSumArray = right_error;
297                #[cfg(feature = "cli")]
298                trace!("Suffix Sum Array: {:?}", suffix_sum_array);
299                let epsilon_3 = content_filter(
300                    line_content,
301                    candidate_content,
302                    loose_mismatch,
303                    suffix_sum_array,
304                    q,
305                    tau,
306                );
307
308                // content-based filtering
309                if let Some(v) = epsilon_3 {
310                    #[cfg(feature = "cli")]
311                    trace!(
312                        "Content-based filtering on `{}: {}`: epsilon_3 = {}",
313                        candidate_id,
314                        candidate_content,
315                        v,
316                    );
317                    // NOTE: I believe author made a mistake here
318                    if v <= tau {
319                        let ed: usize = edit_distance(line_content, candidate_content);
320                        #[cfg(feature = "cli")]
321                        trace!(
322                            "Ed of `{}: {}` against `{}: {}`",
323                            line_id,
324                            line_content,
325                            candidate_id,
326                            candidate_content
327                        );
328                        if ed <= tau {
329                            #[cfg(feature = "cli")]
330                            trace!(
331                                "Add `{}: {}` to matched set of `{}: {}`",
332                                line_id,
333                                line_content,
334                                candidate_id,
335                                candidate_content
336                            );
337                            out.push((candidate_id, ed));
338                        }
339                    }
340                } else {
341                    // when mismatch is empty, cannot apply content filter, go to this branch
342                    let ed: usize = edit_distance(line_content, candidate_content);
343                    #[cfg(feature = "cli")]
344                    trace!(
345                        "Ed of `{}: {}` against `{}: {}`",
346                        line_id,
347                        line_content,
348                        candidate_id,
349                        candidate_content
350                    );
351                    if ed <= tau {
352                        #[cfg(feature = "cli")]
353                        trace!(
354                            "Add `{}: {}` to matched set of `{}: {}`",
355                            line_id,
356                            line_content,
357                            candidate_id,
358                            candidate_content
359                        );
360                        out.push((candidate_id, ed));
361                    }
362                }
363            } else {
364                // when mismatch is empty, sum_right_errors is empty, go to this branch
365                let ed: usize = edit_distance(line_content, candidate_content);
366                #[cfg(feature = "cli")]
367                trace!(
368                    "Ed of `{}: {}` against `{}: {}`",
369                    line_id,
370                    line_content,
371                    candidate_id,
372                    candidate_content
373                );
374                if ed <= tau {
375                    #[cfg(feature = "cli")]
376                    trace!(
377                        "Add `{}: {}` to matched set of `{}: {}`",
378                        line_id,
379                        line_content,
380                        candidate_id,
381                        candidate_content
382                    );
383                    out.push((candidate_id, ed));
384                }
385            }
386        }
387    }
388
389    if !out.is_empty() {
390        Some((line_id, out))
391    } else {
392        None
393    }
394}