bitap_reference/
baseline.rs

1use super::*;
2
3use std::cmp;
4use strsim;
5
6pub enum DistanceFn {
7    Levenshtein,
8    DamerauLevenshtein,
9    OptimalStringAlignment,
10}
11
12/// The baseline functions should be functionally the same as the reference
13/// functions, but internally use a much slower algorithm. It's a reference
14/// for the reference so to speak. Handy because early iterations of the
15/// reference function had pretty serious logic flaws that would not have been
16/// caught without this!
17pub fn baseline(
18    pattern: &str,
19    text: &str,
20    max_distance: usize,
21    distance_fn: DistanceFn,
22) -> super::BitapResult {
23    // The results of bitap (the algorithm) should be the same as taking the edit
24    // distance between the pattern and _every possible_ substring of the text.
25    // This function doesn't need to be fast, it just needs to be right, so
26    // that's exactly what it does! This should prevent a class of issues that
27    // came up in the past where my "reference" impl had the same flaws as my
28    // production version.
29
30    // Enforce bitap-specific pattern size limits.
31    let pattern_len = pattern.chars().count();
32    if !pattern_length_is_valid(pattern_len) {
33        return Err(ERR_INVALID_PATTERN);
34    }
35
36    // Clamp max edit distance to the length of the pattern.
37    let max_distance = cmp::min(max_distance, pattern_len);
38
39    let text_chars = text.chars().collect::<Vec<_>>();
40
41    let mut results = Vec::new();
42
43    for i in 0..text_chars.len() {
44        // Clamp the range of substrings we test; we know that to achieve
45        // max_distance, the difference in length between the pattern and the
46        // substring can be at most max_distance.
47        let max_diff = max_distance + pattern_len;
48        let start = if i > max_diff { i - max_diff } else { 0 };
49
50        let mut best_distance: usize = max_distance + 1;
51        for j in start..=i {
52            let sub_text: String = text_chars[j..=i].iter().collect();
53            let distance = match distance_fn {
54                DistanceFn::Levenshtein => strsim::levenshtein(pattern, &sub_text),
55                DistanceFn::DamerauLevenshtein => strsim::damerau_levenshtein(pattern, &sub_text),
56                DistanceFn::OptimalStringAlignment => strsim::osa_distance(pattern, &sub_text),
57            };
58            if distance < best_distance {
59                best_distance = distance;
60            }
61            if best_distance == 0 {
62                break;
63            }
64        }
65        if best_distance <= max_distance {
66            results.push(Match {
67                distance: best_distance,
68                end: i,
69            })
70        }
71    }
72    Ok(results)
73}
74
75pub fn find(pattern: &str, text: &str) -> FindResult {
76    baseline(pattern, text, 0, DistanceFn::Levenshtein).map(|v| {
77        let offset = pattern.chars().count() - 1;
78        v.iter().map(|m| m.end - offset).collect::<Vec<_>>()
79    })
80}
81
82pub fn lev(pattern: &str, text: &str, k: usize) -> BitapResult {
83    baseline(pattern, text, k, DistanceFn::Levenshtein)
84}
85
86pub fn damerau(pattern: &str, text: &str, k: usize) -> BitapResult {
87    baseline(pattern, text, k, DistanceFn::DamerauLevenshtein)
88}
89
90pub fn osa(pattern: &str, text: &str, k: usize) -> BitapResult {
91    baseline(pattern, text, k, DistanceFn::OptimalStringAlignment)
92}