1use icu_segmenter::WordSegmenter;
2use once_cell::sync::Lazy;
3use regex::Regex;
4
/// A classified text fragment produced by the lexer.
///
/// `Space` is exactly the single-space string `" "`; any other run of
/// whitespace (tabs, newlines, multiple merged spaces, …) is `Whitespace`.
//
// Improvement: derive `Debug`, `PartialEq`, `Eq` in addition to `Clone` so the
// variants can be printed in diagnostics and compared in tests.
#[derive(Clone, Debug, PartialEq, Eq)]
enum Token {
    /// A word segment (not whitespace, not a single punctuation character).
    Word(String),
    /// A single punctuation character (`\p{P}`).
    Punctuation(String),
    /// A whitespace run other than a single plain space.
    Whitespace(String),
    /// Exactly one plain space, `" "`.
    Space(String),
}
12
/// Splits an input string into a sequence of chunks.
///
/// Implementations define the chunk boundaries; the concatenation of the
/// returned chunks is expected to reproduce the input text.
pub trait Splitter {
    /// Splits `input` into chunks according to the implementation's rules.
    fn split(&self, input: &str) -> Vec<String>;
}
18
/// Stateless text splitter based on ICU word segmentation plus camelCase
/// splitting; the [`Splitter`] implementation is the public entry point.
pub struct HATSplitter;
20
21impl Default for HATSplitter {
22 fn default() -> Self {
23 Self::new()
24 }
25}
26
27impl HATSplitter {
28 pub fn new() -> Self {
29 HATSplitter
30 }
31
32 fn _unicode_word_split(input: &str) -> Vec<&str> {
33 let segmenter = WordSegmenter::new_auto();
39
40 let breakpoints: Vec<usize> = segmenter.segment_str(input).collect();
41
42 breakpoints.windows(2).map(|w| &input[w[0]..w[1]]).collect()
43 }
44
45 fn _split_camel_case(s: &str) -> Vec<&str> {
46 static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(\p{Ll})(\p{Lu})").unwrap());
47 let mut indices = RE.find_iter(s).map(|m| m.start() + 1).collect::<Vec<_>>();
48
49 indices.insert(0, 0);
50 indices.push(s.len());
51
52 indices.windows(2).map(|w| &s[w[0]..w[1]]).collect()
53 }
54
55 fn _concatenate_spaces(strings: Vec<&str>) -> Vec<String> {
56 strings.into_iter().fold(Vec::new(), |mut acc, s| {
57 if s == " " {
58 if let Some(last) = acc.last_mut() {
60 if last.chars().all(|c| c == ' ') {
61 last.push(' ');
62 return acc;
63 }
64 }
65 }
66 acc.push(s.to_string());
68 acc
69 })
70 }
71
72 fn _lexer(s: &str) -> Vec<Token> {
73 let words = HATSplitter::_unicode_word_split(s);
74
75 let words = words
76 .iter()
77 .flat_map(|s| HATSplitter::_split_camel_case(s))
78 .collect::<Vec<&str>>();
79
80 let words = HATSplitter::_concatenate_spaces(words.clone());
81
82 static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\s+$").unwrap());
83 static PUNCTUATION_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\p{P}$").unwrap());
84
85 words
86 .into_iter()
87 .map(|s| {
88 if s == " " {
89 Token::Space(s)
90 } else if WHITESPACE_RE.is_match(s.as_str()) {
91 Token::Whitespace(s)
92 } else if PUNCTUATION_RE.is_match(s.as_str()) {
93 Token::Punctuation(s)
94 } else {
95 Token::Word(s)
96 }
97 })
98 .collect()
99 }
100
101 fn _parser(tokens: Vec<Token>) -> Vec<String> {
102 let groups = tokens
103 .iter()
104 .fold(Vec::<Vec<Token>>::new(), |mut groups, token| {
105 match token {
106 Token::Whitespace(_) => {
107 groups.push(vec![token.clone()]);
109 }
110 Token::Space(_) => {
111 groups.push(vec![token.clone()]);
113 }
114 Token::Word(_) => {
115 if let Some(last_group) = groups.last_mut() {
117 if let Some(Token::Space(_)) = last_group.last() {
118 last_group.push(token.clone());
119 return groups;
120 }
121 }
122 groups.push(vec![token.clone()]);
123 }
124 Token::Punctuation(_) => {
125 if let Some(last_group) = groups.last_mut() {
127 if let Some(last_token) = last_group.last() {
128 if matches!(
129 last_token,
130 Token::Space(_) | Token::Word(_) | Token::Punctuation(_)
131 ) {
132 last_group.push(token.clone());
133 return groups;
134 }
135 }
136 }
137 groups.push(vec![token.clone()]);
138 }
139 }
140 groups
141 });
142
143 groups
145 .into_iter()
146 .map(|group| {
147 group.into_iter().fold(String::new(), |mut acc, token| {
148 match token {
149 Token::Word(s) => acc.push_str(&s),
150 Token::Punctuation(s) => acc.push_str(&s),
151 Token::Whitespace(s) => acc.push_str(&s),
152 Token::Space(s) => acc.push_str(&s),
153 }
154 acc
155 })
156 })
157 .collect()
158 }
159}
160
161impl Splitter for HATSplitter {
162 fn split(&self, input: &str) -> Vec<String> {
163 let tokens = HATSplitter::_lexer(input);
164 HATSplitter::_parser(tokens)
165 }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    // Punctuation attaches to the preceding word; a space attaches to the
    // word that follows it.
    #[test]
    fn it_works() {
        let splitter = HATSplitter;
        let input = "Hello, world!";
        let result = splitter.split(input);
        assert_eq!(result, vec!["Hello,", " world!"]);
    }

    // An empty input produces no segments, hence no chunks.
    #[test]
    fn empty_input_yields_no_chunks() {
        let splitter = HATSplitter::new();
        assert!(splitter.split("").is_empty());
    }

    // Each single space starts a new chunk and carries the following word.
    #[test]
    fn spaces_attach_to_following_word() {
        let splitter = HATSplitter::new();
        assert_eq!(
            splitter.split("one two three"),
            vec!["one", " two", " three"]
        );
    }

    // camelCase words are split at the lowercase→uppercase boundary, and the
    // halves do not merge (no space precedes the second word).
    #[test]
    fn camel_case_words_are_split() {
        let splitter = HATSplitter::new();
        assert_eq!(splitter.split("helloWorld"), vec!["hello", "World"]);
    }

    // The splitter must be lossless: concatenating the chunks reproduces the
    // original input.
    #[test]
    fn chunks_reassemble_to_input() {
        let splitter = HATSplitter::new();
        let input = "Hello, world! helloWorld";
        assert_eq!(splitter.split(input).concat(), input);
    }
}