bitap_reference/
lib.rs

1use colored::*;
2use std::cmp;
3use std::collections::HashMap;
4use std::mem;
5
6#[cfg(test)]
7extern crate quickcheck;
8#[cfg(test)]
9#[macro_use(quickcheck)]
10extern crate quickcheck_macros;
11
12pub mod baseline;
13
14#[cfg(test)]
15mod test;
16
17/// Match represents a single match of a pattern in some text.
18#[derive(Copy, Clone, Debug, Eq, PartialEq)]
19pub struct Match {
20    /// The edit distance for this match.
21    pub distance: usize,
22    /// The index of the character (not necessarily byte) that _ends_ this
23    /// match. Determining start position isn't possible (unless
24    /// max_distance is zero), so this is all I'm returning.
25    pub end: usize,
26}
27
28pub type BitapResult = Result<Vec<Match>, &'static str>;
29pub type FindResult = Result<Vec<usize>, &'static str>;
30
31static ERR_INVALID_PATTERN: &'static str = "invalid pattern";
32
33fn pattern_length_is_valid(pattern_length: usize) -> bool {
34    pattern_length > 0 && pattern_length < mem::size_of::<usize>() * 8
35}
36
37/// The reference implementation of the bitap algorithm.
38pub fn reference(
39    pattern: &str,
40    text: &str,
41    max_distance: usize,
42    allow_transpositions: bool,
43    debug: bool,
44) -> BitapResult {
45    let pattern_len = pattern.chars().count();
46    if !pattern_length_is_valid(pattern_len) {
47        return Err(ERR_INVALID_PATTERN);
48    }
49
50    // Clamp max edit distance to the length of the pattern.
51    let max_distance = cmp::min(max_distance, pattern_len);
52
53    // Create a mapping from characters to character masks. A "character's
54    // mask" in this case is a bitmask where, for every index that character
55    // is used in the pattern string, the value is zero.
56    //
57    // Roughly if the pattern were "abcab" the character masks would be as
58    // follows (albeit reversed, so the first character corresponds to the
59    // least significant bit). The remaining bits are all set to 1.
60    //
61    //        abcab abcab
62    //   "a": X..X. 01101
63    //   "b": .X..X 10110
64    //   "c": ..X.. 11011
65    //
66    let mut masks: HashMap<char, usize> = HashMap::new();
67    for (i, c) in pattern.chars().enumerate() {
68        masks
69            .entry(c)
70            .and_modify(|mask| *mask &= !(1usize << i))
71            .or_insert(!(1usize << i));
72    }
73
74    // We need to initialize the state with each error level already having
75    // that many characters "correct". Without this, partial matches at the
76    // beginning of the text wouldn't work. Basically we want r to look like this:
77    //
78    //   r[0] ...11111110
79    //   r[1] ...11111100
80    //   r[2] ...11111000
81    //   ...(etc)
82    //
83    // That's what the kinda opaque one-liner below does.
84    let mut r = (0..=max_distance).map(|i| !1usize << i).collect::<Vec<_>>();
85    let mut trans = vec![!1usize; max_distance];
86
87    let results = text
88        .chars()
89        .enumerate()
90        .filter_map(move |(i, c)| {
91            let letter_mask = match masks.get(&c) {
92                Some(mask) => *mask,
93                None => !0usize,
94            };
95            let mut prev_parent = r[0];
96            r[0] |= letter_mask;
97            r[0] <<= 1;
98
99            for j in 1..=max_distance {
100                let prev = r[j];
101                let current = (prev | letter_mask) << 1;
102                let replace = prev_parent << 1;
103                let delete = r[j - 1] << 1;
104                let insert = prev_parent;
105                r[j] = current & insert & delete & replace;
106
107                if allow_transpositions {
108                    let transpose = (trans[j - 1] | (letter_mask << 1)) << 1;
109                    r[j] &= transpose;
110
111                    // roughly: the current letter matches the _next_ position in
112                    // the parent. I couldn't find any reference implementations
113                    // of bitap that includes transposition, so this may not be
114                    // correct. But I thought about it for a long time?
115                    trans[j - 1] = (prev_parent << 1) | letter_mask;
116                }
117
118                prev_parent = prev;
119            }
120
121            if debug {
122                debug_bitap(pattern, text, i, &r);
123            }
124
125            for (k, rv) in r.iter().enumerate() {
126                if 0 == (rv & (1usize << pattern_len)) {
127                    return Some(Match {
128                        distance: k,
129                        end: i,
130                    });
131                }
132            }
133            None
134        })
135        .collect::<Vec<_>>();
136    Ok(results)
137}
138
139// Visualizing bitap is really difficult, so this little helper makes it a bit
140// easier by printing the internal state after each iteration. I used this a
141// lot during development, but not much anymore.
142fn debug_bitap(pattern: &str, text: &str, index: usize, state: &[usize]) {
143    let pattern_len = pattern.chars().count();
144    let text_colored = text
145        .chars()
146        .enumerate()
147        .map(|(i, c)| {
148            if i == index {
149                format!("{}", c.to_string().green())
150            } else {
151                c.to_string()
152            }
153        })
154        .collect::<Vec<String>>()
155        .concat();
156    println!("{: <4}| {}", index, text_colored);
157    println!("-------------------");
158    println!("    | {}", pattern);
159    println!("-------------------");
160    for (i, r) in state.iter().enumerate() {
161        let bitmask: String = format!("{:b}", r)
162            .chars()
163            .rev()
164            .skip(1)
165            .take(pattern_len)
166            .collect();
167        let is_match = bitmask.ends_with('0');
168        if is_match {
169            println!("{}   | {}", i, bitmask.green());
170        } else {
171            println!("{}   | {}", i, bitmask);
172        }
173    }
174    println!("-------------------");
175}
176
177pub fn find(pattern: &str, text: &str) -> FindResult {
178    reference(pattern, text, 0, false, false).map(|v| {
179        let offset = pattern.chars().count() - 1;
180        v.iter().map(|m| m.end - offset).collect::<Vec<_>>()
181    })
182}
183
184pub fn lev(pattern: &str, text: &str, max_distance: usize) -> BitapResult {
185    reference(pattern, text, max_distance, false, false)
186}
187
188pub fn osa(pattern: &str, text: &str, max_distance: usize) -> BitapResult {
189    reference(pattern, text, max_distance, true, false)
190}