llguidance/
substring.rs

1use anyhow::Result;
2use derivre::{ExprRef, RegexBuilder};
3use std::collections::{hash_map::Entry, HashMap};
4
/// One state of the suffix automaton. Each state represents an equivalence
/// class of substrings of the chunk sequence (see the references on
/// `SuffixAutomaton`).
#[derive(Debug)]
struct State<'a> {
    /// Length of the longest substring (in chunks) in this state's class.
    len: usize,
    /// Suffix link: the state of the longest proper suffix that falls in a
    /// different equivalence class; `None` only for the initial state (0).
    link: Option<usize>,
    /// Outgoing transitions; the "alphabet" is whole chunks, not bytes.
    next: HashMap<&'a str, usize>,
    /// Memoized regex for everything reachable from this state; filled in
    /// lazily by `substring`, `None` until then.
    regex: Option<ExprRef>,
}
12
/// For details see https://en.wikipedia.org/wiki/Suffix_automaton.
/// Implementation is based on https://cp-algorithms.com/string/suffix-automaton.html
struct SuffixAutomaton<'a> {
    /// All states; index 0 is the initial (empty-string) state.
    states: Vec<State<'a>>,
    /// State corresponding to the entire chunk sequence processed so far.
    last: usize,
}
19
impl<'a> SuffixAutomaton<'a> {
    /// Creates an automaton containing only the initial state.
    fn new() -> Self {
        let init_state = State {
            len: 0,
            link: None,
            next: HashMap::default(),
            regex: None,
        };
        SuffixAutomaton {
            states: vec![init_state],
            last: 0,
        }
    }

    /// Builds the automaton for the given chunk sequence. Note that the
    /// automaton's alphabet is the chunks themselves (words, chars, ...),
    /// so substrings are contiguous *chunk* sub-sequences.
    fn from_string(chunks: Vec<&'a str>) -> Self {
        let mut sa = SuffixAutomaton::new();
        for s in chunks.into_iter() {
            sa.extend(s);
        }
        sa
    }

    /// Online construction step (cp-algorithms `sa_extend`), appending one
    /// chunk `s` to the automaton.
    fn extend(&mut self, s: &'a str) {
        // New state for the whole string including `s`.
        let cur_index = self.states.len();
        self.states.push(State {
            len: self.states[self.last].len + 1,
            link: None,
            next: HashMap::default(),
            regex: None,
        });

        // Walk suffix links from `last`, adding an `s`-transition to the new
        // state everywhere one is missing; stop at the first state that
        // already has one.
        let mut p = Some(self.last);
        while let Some(pp) = p {
            match self.states[pp].next.entry(s) {
                Entry::Occupied(_) => break,
                Entry::Vacant(entry) => {
                    entry.insert(cur_index);
                    p = self.states[pp].link;
                }
            }
        }

        if let Some(pp) = p {
            // `pp` already had an `s`-transition, to `q`.
            let q = self.states[pp].next[&s];
            if self.states[pp].len + 1 == self.states[q].len {
                // `q` is exactly one chunk longer: it can serve directly as
                // the suffix link of the new state.
                self.states[cur_index].link = Some(q);
            } else {
                // Otherwise split `q`: create a clone with the shorter
                // length, sharing q's transitions and suffix link.
                let clone_index = self.states.len();
                self.states.push(State {
                    len: self.states[pp].len + 1,
                    link: self.states[q].link,
                    next: self.states[q].next.clone(),
                    regex: None,
                });
                // Redirect `s`-transitions that pointed at `q` (along the
                // suffix-link chain) to the clone.
                while let Some(ppp) = p {
                    if self.states[ppp].next[&s] == q {
                        self.states[ppp].next.insert(s, clone_index);
                    } else {
                        break;
                    }
                    p = self.states[ppp].link;
                }
                // Both `q` and the new state now hang off the clone.
                self.states[q].link = Some(clone_index);
                self.states[cur_index].link = Some(clone_index);
            }
        } else {
            // Ran off the root: no suffix of the new string (other than the
            // empty one) occurred before.
            self.states[cur_index].link = Some(0);
        }
        self.last = cur_index;
    }
}
91
92pub fn substring(builder: &mut RegexBuilder, chunks: Vec<&str>) -> Result<ExprRef> {
93    let mut sa = SuffixAutomaton::from_string(chunks);
94    let mut state_stack = vec![0];
95
96    let empty = ExprRef::EMPTY_STRING;
97
98    while let Some(state_index) = state_stack.last() {
99        let state_index = *state_index;
100        let state = &sa.states[state_index];
101        if state.regex.is_some() {
102            state_stack.pop();
103            continue;
104        }
105
106        if state.next.is_empty() {
107            sa.states[state_index].regex = Some(empty);
108            state_stack.pop();
109            continue;
110        }
111
112        let prev_stack = state_stack.len();
113        for child_index in state.next.values() {
114            if sa.states[*child_index].regex.is_none() {
115                state_stack.push(*child_index);
116            }
117        }
118
119        if prev_stack != state_stack.len() {
120            continue;
121        }
122
123        let mut options = state
124            .next
125            .iter()
126            .map(|(k, v)| (k.to_string().into_bytes(), sa.states[*v].regex.unwrap()))
127            .collect::<Vec<_>>();
128        options.push((Vec::new(), empty));
129        let expr = builder.mk_prefix_tree(options)?;
130        sa.states[state_index].regex = Some(expr);
131        state_stack.pop();
132    }
133    Ok(sa.states[0].regex.unwrap())
134}
135
/// Splits `input` into one chunk per Unicode scalar value (`char`); each
/// chunk is a sub-slice borrowing directly from `input`, so joining the
/// chunks reproduces the input exactly.
pub fn chunk_into_chars(input: &str) -> Vec<&str> {
    input
        .char_indices()
        .map(|(start, ch)| &input[start..start + ch.len_utf8()])
        .collect()
}
150
/// Coarse character class used by `chunk_into_words` to split text into
/// maximal same-class runs.
#[derive(PartialEq)]
enum TokenType {
    /// Any `char::is_whitespace` character.
    Whitespace,
    /// Alphanumeric characters plus `_` (identifier-like runs).
    Word,
    /// Everything else (punctuation, symbols, ...).
    Other,
}
157
158fn classify(ch: char) -> TokenType {
159    if ch.is_whitespace() {
160        TokenType::Whitespace
161    } else if ch.is_alphanumeric() || ch == '_' {
162        TokenType::Word
163    } else {
164        TokenType::Other
165    }
166}
167
168pub fn chunk_into_words(input: &str) -> Vec<&str> {
169    if input.is_empty() {
170        return Vec::new();
171    }
172    let mut chunks = Vec::new();
173    let mut start = 0;
174    let mut current_type = classify(input.chars().next().unwrap());
175    for (i, ch) in input.char_indices() {
176        let token_type = classify(ch);
177        if token_type != current_type {
178            chunks.push(&input[start..i]);
179            start = i;
180            current_type = token_type;
181        }
182    }
183    chunks.push(&input[start..]);
184    chunks
185}
186
#[cfg(test)]
mod test {
    use super::{chunk_into_chars, chunk_into_words, substring};
    use derivre::{ExprRef, Regex, RegexBuilder};

    // Helper: finalize the builder into a runnable `Regex` for `expr`.
    fn to_regex(builder: RegexBuilder, expr: ExprRef) -> Regex {
        builder.to_regex(expr)
    }

    // Per-char chunking on ASCII: chunks re-join to the input and each
    // chunk is a single character.
    #[test]
    fn test_tokenize_chars() {
        let input = "The quick brown fox jumps over the lazy dog.";
        let tokens = chunk_into_chars(input);
        assert_eq!(input, tokens.join(""));
        assert_eq!(
            tokens,
            vec![
                "T", "h", "e", " ", "q", "u", "i", "c", "k", " ", "b", "r", "o", "w", "n", " ",
                "f", "o", "x", " ", "j", "u", "m", "p", "s", " ", "o", "v", "e", "r", " ", "t",
                "h", "e", " ", "l", "a", "z", "y", " ", "d", "o", "g", "."
            ]
        );
    }

    // Per-char chunking on multi-byte (Korean) text: boundaries must fall
    // on char boundaries, not byte boundaries.
    #[test]
    fn test_tokenize_chars_unicode() {
        let input = "빠른 갈색 여우가 게으른 개를 뛰어넘었다.";
        let tokens = chunk_into_chars(input);
        assert_eq!(input, tokens.join(""));
        assert_eq!(
            tokens,
            vec![
                "빠", "른", " ", "갈", "색", " ", "여", "우", "가", " ", "게", "으", "른", " ",
                "개", "를", " ", "뛰", "어", "넘", "었", "다", "."
            ]
        );
    }

    // Word chunking on ASCII: alternating word/whitespace runs, with the
    // trailing "." split off as a separate (Other) chunk.
    #[test]
    fn test_tokenize_words() {
        let input = "The quick brown fox jumps over the lazy dog.";
        let tokens = chunk_into_words(input);
        assert_eq!(input, tokens.join(""));
        assert_eq!(
            tokens,
            vec![
                "The", " ", "quick", " ", "brown", " ", "fox", " ", "jumps", " ", "over", " ",
                "the", " ", "lazy", " ", "dog", "."
            ]
        );
    }

    // Word chunking on Korean text (alphanumeric per `char::is_alphanumeric`).
    #[test]
    fn test_tokenize_words_unicode() {
        let input = "빠른 갈색 여우가 게으른 개를 뛰어넘었다.";
        let tokens = chunk_into_words(input);
        assert_eq!(input, tokens.join(""));
        assert_eq!(
            tokens,
            vec![
                "빠른",
                " ",
                "갈색",
                " ",
                "여우가",
                " ",
                "게으른",
                " ",
                "개를",
                " ",
                "뛰어넘었다",
                "."
            ]
        );
    }

    // Char-granular substring regex: any contiguous character span matches,
    // non-substrings do not.
    #[test]
    fn test_substring_chars() {
        let mut builder = RegexBuilder::new();
        let expr = substring(
            &mut builder,
            chunk_into_chars("The quick brown fox jumps over the lazy dog."),
        )
        .unwrap();
        let mut regex = to_regex(builder, expr);
        assert!(regex.is_match("The quick brown fox jumps over the lazy dog."));
        assert!(regex.is_match("The quick brown fox"));
        assert!(regex.is_match("he quick brow"));
        assert!(regex.is_match("fox jump"));
        assert!(regex.is_match("dog."));
        assert!(!regex.is_match("brown fx"));
    }

    // Same as above with multi-byte characters.
    #[test]
    fn test_substring_chars_unicode() {
        let mut builder = RegexBuilder::new();
        let expr = substring(
            &mut builder,
            chunk_into_chars("빠른 갈색 여우가 게으른 개를 뛰어넘었다."),
        )
        .unwrap();
        let mut regex = to_regex(builder, expr);
        assert!(regex.is_match("빠른 갈색 여우가 게으른 개를 뛰어넘었다."));
        assert!(regex.is_match("빠른 갈색 여우가 게으른"));
        assert!(regex.is_match("른 갈색 여우"));
        assert!(regex.is_match("여우가 게으"));
        assert!(regex.is_match("뛰어넘었다."));
        assert!(!regex.is_match("갈색 여가"));
    }

    // Word-granular substring regex: only spans aligned to word-chunk
    // boundaries match ("he quick brow" starts mid-word, so it must not).
    #[test]
    fn test_substring_words() {
        let mut builder = RegexBuilder::new();
        let expr = substring(
            &mut builder,
            chunk_into_words("The quick brown fox jumps over the lazy dog."),
        )
        .unwrap();
        let mut regex = to_regex(builder, expr);
        assert!(regex.is_match("The quick brown fox jumps over the lazy dog."));
        assert!(regex.is_match("The quick brown fox"));
        assert!(!regex.is_match("he quick brow"));
        assert!(!regex.is_match("fox jump"));
        assert!(regex.is_match("dog."));
        assert!(!regex.is_match("brown fx"));
    }

    // Same word-boundary behavior for Korean text.
    #[test]
    fn test_substring_words_unicode() {
        let mut builder = RegexBuilder::new();
        let expr = substring(
            &mut builder,
            chunk_into_words("빠른 갈색 여우가 게으른 개를 뛰어넘었다."),
        )
        .unwrap();
        let mut regex = to_regex(builder, expr);
        assert!(regex.is_match("빠른 갈색 여우가 게으른 개를 뛰어넘었다."));
        assert!(regex.is_match("빠른 갈색 여우가 게으른"));
        assert!(!regex.is_match("른 갈색 여우"));
        assert!(!regex.is_match("여우가 게으"));
        assert!(regex.is_match("뛰어넘었다."));
        assert!(!regex.is_match("갈색 여가"));
    }
}