Skip to main content

ainu_utils/tokenizer/
tokenizer.rs

1use crate::tokenizer::unfix::unfix;
2
3pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
4    let mut words = Vec::new();
5    let mut word = String::new();
6
7    for c in text.chars() {
8        if c.is_alphabetic() || c.is_numeric() || c == '=' {
9            word.push(c);
10        } else if c == '\'' && !word.is_empty() {
11            word.push(c);
12        } else if c == '-' && !word.is_empty() {
13            word.push(c);
14        } else {
15            if !word.is_empty() {
16                words.extend(unfix(word));
17                word = String::new();
18            }
19
20            if !c.is_whitespace() {
21                words.push(c.to_string());
22            }
23
24            if c.is_whitespace() && keep_whitespace {
25                words.push(c.to_string());
26            }
27        }
28    }
29
30    if !word.is_empty() {
31        words.extend(unfix(word));
32    }
33
34    words
35}