1use anyhow::Result;
2use derivre::{ExprRef, RegexBuilder};
3use std::collections::{hash_map::Entry, HashMap};
4
5#[derive(Debug)]
6struct State<'a> {
7 len: usize,
8 link: Option<usize>,
9 next: HashMap<&'a str, usize>,
10 regex: Option<ExprRef>,
11}
12
13struct SuffixAutomaton<'a> {
16 states: Vec<State<'a>>,
17 last: usize,
18}
19
20impl<'a> SuffixAutomaton<'a> {
21 fn new() -> Self {
22 let init_state = State {
23 len: 0,
24 link: None,
25 next: HashMap::default(),
26 regex: None,
27 };
28 SuffixAutomaton {
29 states: vec![init_state],
30 last: 0,
31 }
32 }
33
34 fn from_string(chunks: Vec<&'a str>) -> Self {
35 let mut sa = SuffixAutomaton::new();
36 for s in chunks.into_iter() {
37 sa.extend(s);
38 }
39 sa
40 }
41
42 fn extend(&mut self, s: &'a str) {
43 let cur_index = self.states.len();
44 self.states.push(State {
45 len: self.states[self.last].len + 1,
46 link: None,
47 next: HashMap::default(),
48 regex: None,
49 });
50
51 let mut p = Some(self.last);
52 while let Some(pp) = p {
53 match self.states[pp].next.entry(s) {
54 Entry::Occupied(_) => break,
55 Entry::Vacant(entry) => {
56 entry.insert(cur_index);
57 p = self.states[pp].link;
58 }
59 }
60 }
61
62 if let Some(pp) = p {
63 let q = self.states[pp].next[&s];
64 if self.states[pp].len + 1 == self.states[q].len {
65 self.states[cur_index].link = Some(q);
66 } else {
67 let clone_index = self.states.len();
68 self.states.push(State {
69 len: self.states[pp].len + 1,
70 link: self.states[q].link,
71 next: self.states[q].next.clone(),
72 regex: None,
73 });
74 while let Some(ppp) = p {
75 if self.states[ppp].next[&s] == q {
76 self.states[ppp].next.insert(s, clone_index);
77 } else {
78 break;
79 }
80 p = self.states[ppp].link;
81 }
82 self.states[q].link = Some(clone_index);
83 self.states[cur_index].link = Some(clone_index);
84 }
85 } else {
86 self.states[cur_index].link = Some(0);
87 }
88 self.last = cur_index;
89 }
90}
91
92pub fn substring(builder: &mut RegexBuilder, chunks: Vec<&str>) -> Result<ExprRef> {
93 let mut sa = SuffixAutomaton::from_string(chunks);
94 let mut state_stack = vec![0];
95
96 let empty = ExprRef::EMPTY_STRING;
97
98 while let Some(state_index) = state_stack.last() {
99 let state_index = *state_index;
100 let state = &sa.states[state_index];
101 if state.regex.is_some() {
102 state_stack.pop();
103 continue;
104 }
105
106 if state.next.is_empty() {
107 sa.states[state_index].regex = Some(empty);
108 state_stack.pop();
109 continue;
110 }
111
112 let prev_stack = state_stack.len();
113 for child_index in state.next.values() {
114 if sa.states[*child_index].regex.is_none() {
115 state_stack.push(*child_index);
116 }
117 }
118
119 if prev_stack != state_stack.len() {
120 continue;
121 }
122
123 let mut options = state
124 .next
125 .iter()
126 .map(|(k, v)| (k.to_string().into_bytes(), sa.states[*v].regex.unwrap()))
127 .collect::<Vec<_>>();
128 options.push((Vec::new(), empty));
129 let expr = builder.mk_prefix_tree(options)?;
130 sa.states[state_index].regex = Some(expr);
131 state_stack.pop();
132 }
133 Ok(sa.states[0].regex.unwrap())
134}
135
136pub fn chunk_into_chars(input: &str) -> Vec<&str> {
137 let mut chunks = vec![];
138 let mut char_indices = input.char_indices().peekable();
139
140 while let Some((start, _)) = char_indices.next() {
141 let end = match char_indices.peek() {
142 Some(&(next_index, _)) => next_index,
143 None => input.len(),
144 };
145 chunks.push(&input[start..end]);
146 }
147
148 chunks
149}
150
151#[derive(PartialEq)]
152enum TokenType {
153 Whitespace,
154 Word,
155 Other,
156}
157
158fn classify(ch: char) -> TokenType {
159 if ch.is_whitespace() {
160 TokenType::Whitespace
161 } else if ch.is_alphanumeric() || ch == '_' {
162 TokenType::Word
163 } else {
164 TokenType::Other
165 }
166}
167
168pub fn chunk_into_words(input: &str) -> Vec<&str> {
169 if input.is_empty() {
170 return Vec::new();
171 }
172 let mut chunks = Vec::new();
173 let mut start = 0;
174 let mut current_type = classify(input.chars().next().unwrap());
175 for (i, ch) in input.char_indices() {
176 let token_type = classify(ch);
177 if token_type != current_type {
178 chunks.push(&input[start..i]);
179 start = i;
180 current_type = token_type;
181 }
182 }
183 chunks.push(&input[start..]);
184 chunks
185}
186
187#[cfg(test)]
188mod test {
189 use super::{chunk_into_chars, chunk_into_words, substring};
190 use derivre::{ExprRef, Regex, RegexBuilder};
191
192 fn to_regex(builder: RegexBuilder, expr: ExprRef) -> Regex {
193 builder.to_regex(expr)
194 }
195
196 #[test]
197 fn test_tokenize_chars() {
198 let input = "The quick brown fox jumps over the lazy dog.";
199 let tokens = chunk_into_chars(input);
200 assert_eq!(input, tokens.join(""));
201 assert_eq!(
202 tokens,
203 vec![
204 "T", "h", "e", " ", "q", "u", "i", "c", "k", " ", "b", "r", "o", "w", "n", " ",
205 "f", "o", "x", " ", "j", "u", "m", "p", "s", " ", "o", "v", "e", "r", " ", "t",
206 "h", "e", " ", "l", "a", "z", "y", " ", "d", "o", "g", "."
207 ]
208 );
209 }
210
211 #[test]
212 fn test_tokenize_chars_unicode() {
213 let input = "빠른 갈색 여우가 게으른 개를 뛰어넘었다.";
214 let tokens = chunk_into_chars(input);
215 assert_eq!(input, tokens.join(""));
216 assert_eq!(
217 tokens,
218 vec![
219 "빠", "른", " ", "갈", "색", " ", "여", "우", "가", " ", "게", "으", "른", " ",
220 "개", "를", " ", "뛰", "어", "넘", "었", "다", "."
221 ]
222 );
223 }
224
225 #[test]
226 fn test_tokenize_words() {
227 let input = "The quick brown fox jumps over the lazy dog.";
228 let tokens = chunk_into_words(input);
229 assert_eq!(input, tokens.join(""));
230 assert_eq!(
231 tokens,
232 vec![
233 "The", " ", "quick", " ", "brown", " ", "fox", " ", "jumps", " ", "over", " ",
234 "the", " ", "lazy", " ", "dog", "."
235 ]
236 );
237 }
238
239 #[test]
240 fn test_tokenize_words_unicode() {
241 let input = "빠른 갈색 여우가 게으른 개를 뛰어넘었다.";
242 let tokens = chunk_into_words(input);
243 assert_eq!(input, tokens.join(""));
244 assert_eq!(
245 tokens,
246 vec![
247 "빠른",
248 " ",
249 "갈색",
250 " ",
251 "여우가",
252 " ",
253 "게으른",
254 " ",
255 "개를",
256 " ",
257 "뛰어넘었다",
258 "."
259 ]
260 );
261 }
262
263 #[test]
264 fn test_substring_chars() {
265 let mut builder = RegexBuilder::new();
266 let expr = substring(
267 &mut builder,
268 chunk_into_chars("The quick brown fox jumps over the lazy dog."),
269 )
270 .unwrap();
271 let mut regex = to_regex(builder, expr);
272 assert!(regex.is_match("The quick brown fox jumps over the lazy dog."));
273 assert!(regex.is_match("The quick brown fox"));
274 assert!(regex.is_match("he quick brow"));
275 assert!(regex.is_match("fox jump"));
276 assert!(regex.is_match("dog."));
277 assert!(!regex.is_match("brown fx"));
278 }
279
280 #[test]
281 fn test_substring_chars_unicode() {
282 let mut builder = RegexBuilder::new();
283 let expr = substring(
284 &mut builder,
285 chunk_into_chars("빠른 갈색 여우가 게으른 개를 뛰어넘었다."),
286 )
287 .unwrap();
288 let mut regex = to_regex(builder, expr);
289 assert!(regex.is_match("빠른 갈색 여우가 게으른 개를 뛰어넘었다."));
290 assert!(regex.is_match("빠른 갈색 여우가 게으른"));
291 assert!(regex.is_match("른 갈색 여우"));
292 assert!(regex.is_match("여우가 게으"));
293 assert!(regex.is_match("뛰어넘었다."));
294 assert!(!regex.is_match("갈색 여가"));
295 }
296
297 #[test]
298 fn test_substring_words() {
299 let mut builder = RegexBuilder::new();
300 let expr = substring(
301 &mut builder,
302 chunk_into_words("The quick brown fox jumps over the lazy dog."),
303 )
304 .unwrap();
305 let mut regex = to_regex(builder, expr);
306 assert!(regex.is_match("The quick brown fox jumps over the lazy dog."));
307 assert!(regex.is_match("The quick brown fox"));
308 assert!(!regex.is_match("he quick brow"));
309 assert!(!regex.is_match("fox jump"));
310 assert!(regex.is_match("dog."));
311 assert!(!regex.is_match("brown fx"));
312 }
313
314 #[test]
315 fn test_substring_words_unicode() {
316 let mut builder = RegexBuilder::new();
317 let expr = substring(
318 &mut builder,
319 chunk_into_words("빠른 갈색 여우가 게으른 개를 뛰어넘었다."),
320 )
321 .unwrap();
322 let mut regex = to_regex(builder, expr);
323 assert!(regex.is_match("빠른 갈색 여우가 게으른 개를 뛰어넘었다."));
324 assert!(regex.is_match("빠른 갈색 여우가 게으른"));
325 assert!(!regex.is_match("른 갈색 여우"));
326 assert!(!regex.is_match("여우가 게으"));
327 assert!(regex.is_match("뛰어넘었다."));
328 assert!(!regex.is_match("갈색 여가"));
329 }
330}