asearch/lib.rs
1#![warn(missing_docs)]
2//! Approximate pattern matching crate
3
4const INITPAT: u32 = 0x80000000; // 0100,0000,0000,0000,0000,0000,0000,0000
5const MAXCHAR: usize = 0x10000;
6const INITSTATE: [u32; 4] = [INITPAT, 0, 0, 0];
7
8/// Approximate pattern matching engine
9pub struct Asearch {
10 shiftpat: [u32; MAXCHAR],
11 acceptpat: u32,
12 epsilon: u32,
13}
14
15impl Asearch {
16 /// Create a new approximate pattern matching engine
17 ///
18 /// # Arguments
19 ///
20 /// - `source` - text which is searched for.
21 pub fn new(source: impl Into<String>) -> Asearch {
22 let mut shiftpat: [u32; MAXCHAR] = [0; MAXCHAR];
23 let mut mask = INITPAT;
24 let mut epsilon: u32 = 0;
25 for item in &unpack(source.into()) {
26 // 0x20 is a space
27 if *item == 0x20 {
28 epsilon |= mask;
29 } else {
30 shiftpat[*item] |= mask;
31 shiftpat[to_upper(*item)] |= mask;
32 shiftpat[to_lower(*item)] |= mask;
33 mask >>= 1;
34 }
35 }
36 Asearch {
37 acceptpat: mask,
38 shiftpat,
39 epsilon,
40 }
41 }
42
43 fn state(&self, text: impl Into<String>) -> [u32; 4] {
44 let mut i0 = INITSTATE[0];
45 let mut i1 = INITSTATE[1];
46 let mut i2 = INITSTATE[2];
47 let mut i3 = INITSTATE[3];
48 for item in &unpack(text.into()) {
49 let mask = self.shiftpat[*item];
50 i3 = (i3 & self.epsilon) | ((i3 & mask) >> 1) | (i2 >> 1) | i2;
51 i2 = (i2 & self.epsilon) | ((i2 & mask) >> 1) | (i1 >> 1) | i1;
52 i1 = (i1 & self.epsilon) | ((i1 & mask) >> 1) | (i0 >> 1) | i0;
53 i0 = (i0 & self.epsilon) | ((i0 & mask) >> 1);
54 i1 |= i0 >> 1;
55 i2 |= i1 >> 1;
56 i3 |= i2 >> 1;
57 }
58 [i0, i1, i2, i3]
59 }
60
61 /// Do approximate pattern matching
62 ///
63 /// # Arguments
64 ///
65 /// - `text` - text which is searched.
66 /// - `ambig` - Levenshtein distance.
67 ///
68 /// # Examples
69 ///
70 /// ```
71 /// // pattern: abcde
72 /// {
73 /// let asearch = asearch::Asearch::new("abcde");
74 ///
75 /// assert!(asearch.find("abcde", 0));
76 /// assert!(asearch.find("aBCDe", 0));
77 /// assert!(asearch.find("abXcde", 1));
78 /// assert!(asearch.find("ab?de", 1));
79 /// assert!(asearch.find("abXXde", 2));
80 /// assert!(!asearch.find("abXcde", 0));
81 /// assert!(!asearch.find("ab?de", 0));
82 /// assert!(!asearch.find("abde", 0));
83 /// assert!(!asearch.find("abXXde", 1));
84 /// assert!(asearch.find("abcde", 1));
85 /// assert!(!asearch.find("abcd", 0));
86 /// assert!(asearch.find("abcd", 1));
87 /// assert!(asearch.find("bcde", 2));
88 /// }
89 ///
90 /// // pattern: ab de
91 /// {
92 /// let asearch = asearch::Asearch::new("ab de");
93 /// assert!(asearch.find("abcde", 0));
94 /// assert!(asearch.find("abccde", 0));
95 /// assert!(asearch.find("abXXXXXXXde", 0));
96 /// assert!(asearch.find("ababcccccxede", 1));
97 /// assert!(!asearch.find("abcccccxe", 0));
98 /// }
99 ///
100 /// // pattern: partial match
101 /// {
102 /// let asearch = asearch::Asearch::new(" java ");
103 ///
104 /// assert!(asearch.find("javascript", 0));
105 /// assert!(asearch.find("abcjavascript", 0));
106 /// assert!(asearch.find("jabascript", 1));
107 /// assert!(asearch.find("awesome jabascript", 1));
108 /// }
109 ///
110 /// // pattern: unicode
111 /// {
112 /// let asearch = asearch::Asearch::new("漢字文字列");
113 ///
114 /// assert!(asearch.find("漢字文字列", 0));
115 /// assert!(!asearch.find("漢字の文字列", 0));
116 /// assert!(asearch.find("漢字の文字列", 1));
117 /// assert!(!asearch.find("漢字文字", 0));
118 /// assert!(asearch.find("漢字文字", 1));
119 /// assert!(!asearch.find("漢字文字烈", 0));
120 /// assert!(asearch.find("漢字文字烈", 1));
121 /// assert!(!asearch.find("漢和辞典", 2));
122 /// }
123 /// ```
124 pub fn find(&self, text: impl Into<String>, ambig: u8) -> bool {
125 let ambig_ = if (ambig as usize) < INITSTATE.len() {
126 ambig as usize
127 } else {
128 INITSTATE.len() - 1
129 };
130
131 let s = self.state(text.into());
132 (s[ambig_] & self.acceptpat) != 0
133 }
134}
135
136/// convert each char to a code point
137/// They are used for indice of the finite automaton
138fn unpack(text: impl Into<String>) -> Vec<usize> {
139 text.into()
140 .chars()
141 .into_iter()
142 .map(|c| c as usize)
143 .collect()
144}
145
146fn is_upper(c: usize) -> bool {
147 (0x41..=0x5a).contains(&c)
148}
149fn is_lower(c: usize) -> bool {
150 (0x61..=0x7a).contains(&c)
151}
152fn to_lower(c: usize) -> usize {
153 if is_upper(c) {
154 c + 0x20
155 } else {
156 c
157 }
158}
159fn to_upper(c: usize) -> usize {
160 if is_lower(c) {
161 c - 0x20
162 } else {
163 c
164 }
165}