asearch/
lib.rs

1#![warn(missing_docs)]
2//! Approximate pattern matching crate
3
4const INITPAT: u32 = 0x80000000; // 0100,0000,0000,0000,0000,0000,0000,0000
5const MAXCHAR: usize = 0x10000;
6const INITSTATE: [u32; 4] = [INITPAT, 0, 0, 0];
7
8/// Approximate pattern matching engine
9pub struct Asearch {
10    shiftpat: [u32; MAXCHAR],
11    acceptpat: u32,
12    epsilon: u32,
13}
14
15impl Asearch {
16    /// Create a new approximate pattern matching engine
17    ///
18    /// # Arguments
19    ///
20    /// - `source` - text which is searched for.
21    pub fn new(source: impl Into<String>) -> Asearch {
22        let mut shiftpat: [u32; MAXCHAR] = [0; MAXCHAR];
23        let mut mask = INITPAT;
24        let mut epsilon: u32 = 0;
25        for item in &unpack(source.into()) {
26            // 0x20 is a space
27            if *item == 0x20 {
28                epsilon |= mask;
29            } else {
30                shiftpat[*item] |= mask;
31                shiftpat[to_upper(*item)] |= mask;
32                shiftpat[to_lower(*item)] |= mask;
33                mask >>= 1;
34            }
35        }
36        Asearch {
37            acceptpat: mask,
38            shiftpat,
39            epsilon,
40        }
41    }
42
43    fn state(&self, text: impl Into<String>) -> [u32; 4] {
44        let mut i0 = INITSTATE[0];
45        let mut i1 = INITSTATE[1];
46        let mut i2 = INITSTATE[2];
47        let mut i3 = INITSTATE[3];
48        for item in &unpack(text.into()) {
49            let mask = self.shiftpat[*item];
50            i3 = (i3 & self.epsilon) | ((i3 & mask) >> 1) | (i2 >> 1) | i2;
51            i2 = (i2 & self.epsilon) | ((i2 & mask) >> 1) | (i1 >> 1) | i1;
52            i1 = (i1 & self.epsilon) | ((i1 & mask) >> 1) | (i0 >> 1) | i0;
53            i0 = (i0 & self.epsilon) | ((i0 & mask) >> 1);
54            i1 |= i0 >> 1;
55            i2 |= i1 >> 1;
56            i3 |= i2 >> 1;
57        }
58        [i0, i1, i2, i3]
59    }
60
61    /// Do approximate pattern matching
62    ///
63    /// # Arguments
64    ///
65    /// - `text` - text which is searched.
66    /// - `ambig` - Levenshtein distance.
67    ///
68    /// # Examples
69    ///
70    /// ```
71    /// // pattern: abcde
72    /// {
73    ///     let asearch = asearch::Asearch::new("abcde");
74    ///
75    ///     assert!(asearch.find("abcde", 0));
76    ///     assert!(asearch.find("aBCDe", 0));
77    ///     assert!(asearch.find("abXcde", 1));
78    ///     assert!(asearch.find("ab?de", 1));
79    ///     assert!(asearch.find("abXXde", 2));
80    ///     assert!(!asearch.find("abXcde", 0));
81    ///     assert!(!asearch.find("ab?de", 0));
82    ///     assert!(!asearch.find("abde", 0));
83    ///     assert!(!asearch.find("abXXde", 1));
84    ///     assert!(asearch.find("abcde", 1));
85    ///     assert!(!asearch.find("abcd", 0));
86    ///     assert!(asearch.find("abcd", 1));
87    ///     assert!(asearch.find("bcde", 2));
88    /// }
89    ///
90    /// // pattern: ab de
91    /// {
92    ///     let asearch = asearch::Asearch::new("ab de");
93    ///     assert!(asearch.find("abcde", 0));
94    ///     assert!(asearch.find("abccde", 0));
95    ///     assert!(asearch.find("abXXXXXXXde", 0));
96    ///     assert!(asearch.find("ababcccccxede", 1));
97    ///     assert!(!asearch.find("abcccccxe", 0));
98    /// }
99    ///
100    /// // pattern: partial match
101    /// {
102    ///     let asearch = asearch::Asearch::new(" java ");
103    ///
104    ///     assert!(asearch.find("javascript", 0));
105    ///     assert!(asearch.find("abcjavascript", 0));
106    ///     assert!(asearch.find("jabascript", 1));
107    ///     assert!(asearch.find("awesome jabascript", 1));
108    /// }
109    ///
110    /// // pattern: unicode
111    /// {
112    ///     let asearch = asearch::Asearch::new("漢字文字列");
113    ///
114    ///     assert!(asearch.find("漢字文字列", 0));
115    ///     assert!(!asearch.find("漢字の文字列", 0));
116    ///     assert!(asearch.find("漢字の文字列", 1));
117    ///     assert!(!asearch.find("漢字文字", 0));
118    ///     assert!(asearch.find("漢字文字", 1));
119    ///     assert!(!asearch.find("漢字文字烈", 0));
120    ///     assert!(asearch.find("漢字文字烈", 1));
121    ///     assert!(!asearch.find("漢和辞典", 2));
122    /// }
123    /// ```
124    pub fn find(&self, text: impl Into<String>, ambig: u8) -> bool {
125        let ambig_ = if (ambig as usize) < INITSTATE.len() {
126            ambig as usize
127        } else {
128            INITSTATE.len() - 1
129        };
130
131        let s = self.state(text.into());
132        (s[ambig_] & self.acceptpat) != 0
133    }
134}
135
136/// convert each char to a code point
137/// They are used for indice of the finite automaton
138fn unpack(text: impl Into<String>) -> Vec<usize> {
139    text.into()
140        .chars()
141        .into_iter()
142        .map(|c| c as usize)
143        .collect()
144}
145
146fn is_upper(c: usize) -> bool {
147    (0x41..=0x5a).contains(&c)
148}
149fn is_lower(c: usize) -> bool {
150    (0x61..=0x7a).contains(&c)
151}
152fn to_lower(c: usize) -> usize {
153    if is_upper(c) {
154        c + 0x20
155    } else {
156        c
157    }
158}
159fn to_upper(c: usize) -> usize {
160    if is_lower(c) {
161        c - 0x20
162    } else {
163        c
164    }
165}