hat_splitter/
split.rs

1use icu_segmenter::WordSegmenter;
2use once_cell::sync::Lazy;
3use regex::Regex;
4
/// Lexer output categories. Each variant carries the exact text of the
/// piece, so concatenating tokens in order reproduces the input.
#[derive(Clone)]
enum Token {
    /// Any piece that is not pure whitespace and not a single punctuation char.
    Word(String),
    /// A single punctuation character (matches `^\p{P}$` in the lexer).
    Punctuation(String),
    /// Whitespace other than a lone space: tabs, newlines, or a run of
    /// several spaces merged by `_concatenate_spaces` (matches `^\s+$`).
    Whitespace(String),
    /// Exactly one space character `" "`.
    Space(String),
}
12
/// A strategy for breaking a string into a sequence of chunks.
pub trait Splitter {
    // At some point it would be great to do this without allocations...
    //fn split<'a>(&self, input: &'a str) -> Vec<&'a str>;
    /// Splits `input` into owned chunks; implementations in this file
    /// return chunks whose in-order concatenation equals `input`.
    fn split(&self, input: &str) -> Vec<String>;
}
18
/// Splitter that segments on ICU word boundaries, then on camelCase
/// transitions, and finally regroups by whitespace/punctuation structure.
pub struct HATSplitter;
20
21impl Default for HATSplitter {
22    fn default() -> Self {
23        Self::new()
24    }
25}
26
27impl HATSplitter {
28    pub fn new() -> Self {
29        HATSplitter
30    }
31
32    fn _unicode_word_split(input: &str) -> Vec<&str> {
33        // TODO make this a member of the struct;
34        // this is not currently trivial as it is not `Sync`
35        // and Py03 requires `Send` and `Sync` due to the python GIL
36        // (see https://pyo3.rs/v0.24.0/class/thread-safety)
37        // (of course I would take care of this in the python bindings. not here)
38        let segmenter = WordSegmenter::new_auto();
39
40        let breakpoints: Vec<usize> = segmenter.segment_str(input).collect();
41
42        breakpoints.windows(2).map(|w| &input[w[0]..w[1]]).collect()
43    }
44
45    fn _split_camel_case(s: &str) -> Vec<&str> {
46        static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\p{Ll})(\p{Lu})").unwrap());
47        let mut indices = RE.find_iter(s).map(|m| m.start() + 1).collect::<Vec<_>>();
48
49        indices.insert(0, 0);
50        indices.push(s.len());
51
52        indices.windows(2).map(|w| &s[w[0]..w[1]]).collect()
53    }
54
55    fn _concatenate_spaces(strings: Vec<&str>) -> Vec<String> {
56        strings.into_iter().fold(Vec::new(), |mut acc, s| {
57            if s == " " {
58                // If we have a space and the last element is also spaces, append to it
59                if let Some(last) = acc.last_mut() {
60                    if last.chars().all(|c| c == ' ') {
61                        last.push(' ');
62                        return acc;
63                    }
64                }
65            }
66            // Otherwise add as a new element
67            acc.push(s.to_string());
68            acc
69        })
70    }
71
72    fn _lexer(s: &str) -> Vec<Token> {
73        let words = HATSplitter::_unicode_word_split(s);
74
75        let words = words
76            .iter()
77            .flat_map(|s| HATSplitter::_split_camel_case(s))
78            .collect::<Vec<&str>>();
79
80        let words = HATSplitter::_concatenate_spaces(words.clone());
81
82        static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\s+$").unwrap());
83        static PUNCTUATION_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\p{P}$").unwrap());
84
85        words
86            .into_iter()
87            .map(|s| {
88                if s == " " {
89                    Token::Space(s)
90                } else if WHITESPACE_RE.is_match(s.as_str()) {
91                    Token::Whitespace(s)
92                } else if PUNCTUATION_RE.is_match(s.as_str()) {
93                    Token::Punctuation(s)
94                } else {
95                    Token::Word(s)
96                }
97            })
98            .collect()
99    }
100
101    fn _parser(tokens: Vec<Token>) -> Vec<String> {
102        let groups = tokens
103            .iter()
104            .fold(Vec::<Vec<Token>>::new(), |mut groups, token| {
105                match token {
106                    Token::Whitespace(_) => {
107                        // Create a separate group for whitespace
108                        groups.push(vec![token.clone()]);
109                    }
110                    Token::Space(_) => {
111                        // Start new group with space
112                        groups.push(vec![token.clone()]);
113                    }
114                    Token::Word(_) => {
115                        // Append to current group if last token is a space, otherwise start new group
116                        if let Some(last_group) = groups.last_mut() {
117                            if let Some(Token::Space(_)) = last_group.last() {
118                                last_group.push(token.clone());
119                                return groups;
120                            }
121                        }
122                        groups.push(vec![token.clone()]);
123                    }
124                    Token::Punctuation(_) => {
125                        // Append to current group if last token is a word, punctuation or space, otherwise start new group
126                        if let Some(last_group) = groups.last_mut() {
127                            if let Some(last_token) = last_group.last() {
128                                if matches!(
129                                    last_token,
130                                    Token::Space(_) | Token::Word(_) | Token::Punctuation(_)
131                                ) {
132                                    last_group.push(token.clone());
133                                    return groups;
134                                }
135                            }
136                        }
137                        groups.push(vec![token.clone()]);
138                    }
139                }
140                groups
141            });
142
143        // Concatenate groups
144        groups
145            .into_iter()
146            .map(|group| {
147                group.into_iter().fold(String::new(), |mut acc, token| {
148                    match token {
149                        Token::Word(s) => acc.push_str(&s),
150                        Token::Punctuation(s) => acc.push_str(&s),
151                        Token::Whitespace(s) => acc.push_str(&s),
152                        Token::Space(s) => acc.push_str(&s),
153                    }
154                    acc
155                })
156            })
157            .collect()
158    }
159}
160
161impl Splitter for HATSplitter {
162    fn split(&self, input: &str) -> Vec<String> {
163        let tokens = HATSplitter::_lexer(input);
164        HATSplitter::_parser(tokens)
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_works() {
        let splitter = HATSplitter::new();
        let result = splitter.split("Hello, world!");
        // Punctuation sticks to its word; "world!" keeps its leading space.
        assert_eq!(result, vec!["Hello,", " world!"]);
    }
}
179}