ainu_utils/tokenizer/
tokenizer.rs1use crate::tokenizer::unfix::unfix;
2
3pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
4 let mut words = Vec::new();
5 let mut word = String::new();
6
7 for c in text.chars() {
8 if c.is_alphabetic() || c.is_numeric() || c == '=' {
9 word.push(c);
10 } else if c == '\'' && !word.is_empty() {
11 word.push(c);
12 } else if c == '-' && !word.is_empty() {
13 word.push(c);
14 } else {
15 if !word.is_empty() {
16 words.extend(unfix(word));
17 word = String::new();
18 }
19
20 if !c.is_whitespace() {
21 words.push(c.to_string());
22 }
23
24 if c.is_whitespace() && keep_whitespace {
25 words.push(c.to_string());
26 }
27 }
28 }
29
30 if !word.is_empty() {
31 words.extend(unfix(word));
32 }
33
34 words
35}