1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
use itertools::{Itertools, MultiPeek};
use unicode_segmentation::{GraphemeIndices, UnicodeSegmentation};
/// Iterator that splits a string into words.
///
/// Each item is a `(start, end)` pair of byte offsets into the input, so the
/// word itself is `&input[start..end]`.
#[derive(Debug)]
pub struct WordSplit<'a> {
// Graphemes of the input paired with their byte offsets, wrapped in
// `MultiPeek` so the iterator can look several graphemes ahead before
// consuming one.
graphemes: MultiPeek<GraphemeIndices<'a>>,
}
impl<'heystack_> WordSplit<'heystack_> {
pub fn new(heystack: &'heystack_ str) -> Self {
WordSplit {
graphemes: heystack.grapheme_indices(true).multipeek(),
}
}
}
impl<'a> Iterator for WordSplit<'a> {
    /// Byte range `(start, end)` of one word: `start` inclusive, `end` exclusive.
    type Item = (usize, usize);

    /// Returns the byte range of the next word, or `None` when no word is left.
    ///
    /// Non-alphanumeric graphemes are separators and are never part of a word.
    /// Besides separators and the end of the input, a word also ends:
    ///
    /// * at the end of an acronym, i.e. UPPER, UPPER, lower
    ///   (`HTMLFile` -> `HTML`, `File`);
    /// * at a camel-case hump, i.e. non-upper followed by UPPER
    ///   (`helloWorld` -> `hello`, `World`).
    ///
    /// A word may consist of a single grapheme (`aBCd` -> `a`, `B`, `Cd`).
    fn next(&mut self) -> Option<Self::Item> {
        // c0, c1 and c2 are the three *upcoming* graphemes, not the current
        // one: c0 is what `graphemes.next()` would return.
        let graphemes = self.graphemes.by_ref();
        let mut word_start_index = 0;
        // True until the first grapheme of the word being built is consumed.
        let mut can_start_new_word = true;

        loop {
            // Every `peek` looks one grapheme further ahead; the look-ahead
            // position is rewound by the `next` call made on each path below.

            // c0: end of input or a non-alphanumeric grapheme is a boundary.
            let peek0 = graphemes.peek();
            let is_c0_boundary = peek0.map_or(true, |(_, c)| is_not_alphanumeric(c));
            let is_c0_uppercase = peek0.map_or(false, |(_, c)| is_uppercase(c));
            let c0_len = peek0.map_or(0, |(_, c)| c.len());

            // c1
            let peek1 = graphemes.peek();
            let is_c1_none = peek1.is_none();
            let is_c1_uppercase = peek1.map_or(false, |(_, c)| is_uppercase(c));

            // c2: "lowercase" here means alphanumeric and not uppercase.
            let peek2 = graphemes.peek();
            let is_c2_lowercase =
                peek2.map_or(false, |(_, c)| !is_uppercase(c) && !is_not_alphanumeric(c));

            if is_c0_boundary {
                // `?` handles the end of the input: nothing left to yield.
                let (index, _boundary) = graphemes.next()?;
                if can_start_new_word {
                    // Separator in front of a word: skip it.
                    continue;
                }
                // Separator after a word: the word ends where the separator starts.
                return Some((word_start_index, index));
            }

            // c0 belongs to the current word. Decide whether it is also the
            // word's last grapheme:
            //   * c1 is the end of the input            ("world" at 'd'),
            //   * acronym end, UPPER-UPPER-lower        ("HTMLFile" at 'L'),
            //   * camel-case hump, non-upper then UPPER ("helloWorld" at 'o').
            let word_ends_after_c0 = is_c1_none
                || (is_c0_uppercase && is_c1_uppercase && is_c2_lowercase)
                || (!is_c0_uppercase && is_c1_uppercase);

            // c0 is known to exist here (it is not a boundary), so `?` never
            // actually returns.
            let (index, _) = graphemes.next()?;
            if can_start_new_word {
                // Record the start for every kind of word end, including the
                // acronym case. Previously a word consisting of a single
                // uppercase grapheme in front of an acronym boundary
                // (e.g. the "A" in "x_ABc") was reported as starting at 0.
                word_start_index = index;
                can_start_new_word = false;
            }
            if word_ends_after_c0 {
                return Some((word_start_index, index + c0_len));
            }
        }
    }
}
/// Reports whether `grapheme` starts with an uppercase character.
///
/// Only the first `char` of the string is inspected. An empty string yields
/// `false`.
pub fn is_uppercase(grapheme: &str) -> bool {
    matches!(grapheme.chars().next(), Some(first) if first.is_uppercase())
}
/// Reports whether `grapheme` starts with a character that is neither a letter
/// nor a number, i.e. some kind of symbol, punctuation or whitespace.
///
/// Only the first `char` of the string is inspected. An empty string does not
/// count as a symbol and yields `false`.
pub fn is_not_alphanumeric(grapheme: &str) -> bool {
    match grapheme.chars().next() {
        Some(first) => !first.is_alphanumeric(),
        None => false,
    }
}
#[cfg(test)]
mod utils_tests {
// Tests for the `WordSplit` iterator. Every test maps the yielded
// `(start, end)` byte ranges back onto the input string and compares the
// resulting slices with the expected list of words.
mod test_word_split_iter {
use crate::utils::WordSplit;
// A space and '/' both separate words; a run of digits is a word of its own.
#[test]
fn test_iter_basic() {
let s = "hello world/1234";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["hello", "world", "1234"]);
}
// Separators at the very start and end of the input produce no empty words.
#[test]
fn test_iter_delim_at_beginning_end() {
let s = "_hello ...world-";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["hello", "world"]);
}
// A lowercase -> uppercase transition splits a camelCase identifier.
#[test]
fn test_iter_uppercase() {
let s = "- -helloWorld";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["hello", "World"]);
}
// Ignored: the expectation below wants emoji returned as words, whereas
// `WordSplit` treats every non-alphanumeric grapheme as a separator and
// never includes one in a yielded range.
#[ignore = "Emoji too hard :("]
#[test]
fn test_iter_complex_graphemes() {
// 🦀 is 4 bytes, 👩👩👧👦 is 25 bytes!
let s = "🦀Family👩👩👧👦";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
// Assuming symbols like emojis separate words, or stay attached
assert_eq!(words, vec!["🦀", "Family", "👩👩👧👦"]);
}
// An all-uppercase run at the end of the input stays together as one word.
#[test]
fn test_iter_acronym_at_end() {
let s = "myHTML";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["my", "HTML"]);
}
// UPPER-UPPER-lower marks the end of an acronym: the last uppercase letter
// before a lowercase one starts the next word.
#[test]
fn test_iter_acronym_start() {
let s = "HTMLParser";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["HTML", "Parser"]);
}
// Runs of several separators behave like a single separator.
#[test]
fn test_iter_consecutive_symbols() {
let s = "---hello___world ";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["hello", "world"]);
}
#[test]
fn test_iter_numbers() {
let s = "v1.2.3Release";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
// Digits are alphanumeric, so "v1" stays together; '.' is a separator, and
// the digit -> uppercase transition splits "3" from "Release".
assert_eq!(words, vec!["v1", "2", "3", "Release"]);
}
// Inputs without any alphanumeric grapheme yield no words at all.
#[test]
fn test_iter_empty_and_whitespace() {
assert_eq!(WordSplit::new("").next(), None);
assert_eq!(WordSplit::new(" ").next(), None);
assert_eq!(WordSplit::new("---").next(), None);
}
#[test]
fn test_iter_mixed_suite() {
// The "All-in-One" Benchmark
let s = "JSONParser_v2-beta__HTMLFile/path.to.mixedCase_ID";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(
words,
vec![
"JSON", // Acronym boundary detected
"Parser", // Standard Pascal
"v2", // Alphanumeric kept together (assuming numbers aren't separators)
"beta", // Separator skipped
"HTML", // Acronym boundary
"File", // Pascal
"path", // Slash separator
"to", // Dot separator
"mixed", // camelCase start
"Case", // camelCase split
"ID" // Trailing acronym
]
);
}
#[test]
fn test_iter_messy_separators() {
// Consecutive separators
let s = "double__under..score";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["double", "under", "score"]);
}
// Multi-byte letters: the yielded byte ranges must still be valid slice
// boundaries of the input.
#[test]
fn test_iter_utf8_basic() {
let s = "Noël_München";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["Noël", "München"]);
}
// A one-grapheme input is a one-grapheme word.
#[test]
fn test_iter_single_char() {
let s = "a";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["a"]);
}
// Single-letter words next to camel-case boundaries, separators and the end
// of the input.
#[test]
fn test_iter_multiple_chars() {
let s = "aB/cD_e f";
let words: Vec<_> = WordSplit::new(s).map(|(x, y)| &s[x..y]).collect();
assert_eq!(words, vec!["a", "B", "c", "D", "e", "f"]);
}
}
// Tests for the `is_uppercase` helper.
mod test_uppercase {
use crate::utils::*;
// ASCII letters, punctuation and a digit.
#[test]
fn is_uppercase_one_char_ascii() {
assert!(is_uppercase("S"));
assert!(!is_uppercase("s"));
assert!(!is_uppercase("i"));
assert!(is_uppercase("I"));
assert!(!is_uppercase("."));
assert!(!is_uppercase("?"));
assert!(!is_uppercase("9"));
}
// Non-ASCII letters, including 'ß' and a kana character, neither of which
// is reported as uppercase.
#[test]
fn is_uppercase_one_char_utf8() {
assert!(is_uppercase("Ä"));
assert!(!is_uppercase("ä"));
assert!(!is_uppercase("ö"));
assert!(is_uppercase("Å"));
assert!(!is_uppercase("ß"));
assert!(!is_uppercase("と"));
assert!(!is_uppercase("á"));
}
// Records standard-library behavior: uppercasing "ß" yields the
// two-character string "SS".
#[test]
fn uppercase_std() {
assert_eq!("ß".to_uppercase(), "SS".to_string());
}
}
}