#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]

/// A token produced by one of the tokenizers, carrying its kind, its text,
/// and its byte span in the original input.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Token {
    /// The kind of unit this token represents.
    pub kind: TokenKind,
    /// The token text, copied out of the input.
    pub text: String,
    /// The half-open byte range `[start, end)` of the token in the input.
    pub span: TokenSpan,
}

impl Token {
    fn new(kind: TokenKind, text: String, start: usize, end: usize) -> Self {
        Self {
            kind,
            text,
            span: TokenSpan { start, end },
        }
    }
}

/// The kind of unit a [`Token`] represents.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenKind {
    /// A whitespace-delimited run of non-whitespace characters.
    Text,
    /// A word, as produced by [`tokenize_words`].
    Word,
    /// A sentence, as produced by [`tokenize_sentences`].
    Sentence,
    /// A single Unicode scalar value, as produced by [`tokenize_chars`].
    Char,
}

/// A half-open byte range `[start, end)` into the tokenized input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenSpan {
    /// Byte offset of the first byte of the token.
    pub start: usize,
    /// Byte offset one past the last byte of the token.
    pub end: usize,
}

/// Options for configuring tokenization.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Drop empty tokens from the output.
    pub trim_empty: bool,
    /// Emit whitespace runs as tokens of their own.
    pub include_whitespace: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            trim_empty: true,
            include_whitespace: false,
        }
    }
}

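/// Splits `input` on Unicode whitespace, returning one [`TokenKind::Text`]
/// token per non-whitespace run, with byte spans into `input`.
///
/// A minimal usage sketch. The crate name `tokenizer` in the `use` line is an
/// assumption (this file does not show it), so the doc test is marked
/// `ignore`:
///
/// ```ignore
/// use tokenizer::{tokenize_whitespace, TokenKind};
///
/// let tokens = tokenize_whitespace("  hello world  ");
/// assert_eq!(tokens.len(), 2);
/// assert_eq!(tokens[0].kind, TokenKind::Text);
/// assert_eq!(tokens[0].text, "hello");
/// assert_eq!(tokens[0].span.start, 2); // byte offset of 'h'
/// ```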
pub fn tokenize_whitespace(input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut start = None;

    for (index, character) in input.char_indices() {
        if character.is_whitespace() {
            if let Some(token_start) = start.take() {
                tokens.push(Token::new(
                    TokenKind::Text,
                    input[token_start..index].to_owned(),
                    token_start,
                    index,
                ));
            }
        } else if start.is_none() {
            start = Some(index);
        }
    }

    if let Some(token_start) = start {
        tokens.push(Token::new(
            TokenKind::Text,
            input[token_start..].to_owned(),
            token_start,
            input.len(),
        ));
    }

    tokens
}

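/// Splits `input` into [`TokenKind::Word`] tokens: maximal runs of
/// alphanumeric characters, where an apostrophe (`'` or `’`) between two
/// alphanumerics is kept, so contractions such as `don't` stay whole.
///
/// A minimal usage sketch; the crate name `tokenizer` is an assumption, so
/// the doc test is marked `ignore`:
///
/// ```ignore
/// use tokenizer::tokenize_words;
///
/// let texts: Vec<String> = tokenize_words("Hello, world! don't")
///     .into_iter()
///     .map(|token| token.text)
///     .collect();
/// assert_eq!(texts, ["Hello", "world", "don't"]);
/// ```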
pub fn tokenize_words(input: &str) -> Vec<Token> {
    word_ranges(input)
        .into_iter()
        .map(|(start, end)| Token::new(TokenKind::Word, input[start..end].to_owned(), start, end))
        .collect()
}

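/// Splits `input` into [`TokenKind::Sentence`] tokens. A sentence ends at
/// `.`, `!`, or `?`, extended across any trailing closing quotes or brackets,
/// when followed by whitespace or the end of input; trailing text without a
/// terminator becomes a final sentence with trailing whitespace excluded.
///
/// A minimal usage sketch; the crate name `tokenizer` is an assumption, so
/// the doc test is marked `ignore`:
///
/// ```ignore
/// use tokenizer::tokenize_sentences;
///
/// let tokens = tokenize_sentences("One. \"Two!\" Three");
/// let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
/// assert_eq!(texts, ["One.", "\"Two!\"", "Three"]);
/// ```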
pub fn tokenize_sentences(input: &str) -> Vec<Token> {
    let characters: Vec<(usize, char)> = input.char_indices().collect();
    let mut tokens = Vec::new();
    let mut start = None;
    let mut last_non_whitespace_end = 0;
    let mut index = 0;

    while index < characters.len() {
        let (byte_index, character) = characters[index];
        let character_end = byte_index + character.len_utf8();

        if start.is_none() {
            // Skip whitespace between sentences.
            if character.is_whitespace() {
                index += 1;
                continue;
            }

            start = Some(byte_index);
        }

        if !character.is_whitespace() {
            last_non_whitespace_end = character_end;
        }

        if matches!(character, '.' | '!' | '?') {
            // Extend the sentence across any closing quotes or brackets that
            // follow the terminator, e.g. the quote in `"Stop!"`.
            let mut sentence_end = character_end;
            let mut lookahead = index + 1;

            while let Some((next_byte, next_character)) = characters.get(lookahead).copied() {
                if matches!(
                    next_character,
                    '.' | '!' | '?' | '"' | '\'' | '”' | '’' | ')' | ']'
                ) {
                    sentence_end = next_byte + next_character.len_utf8();
                    lookahead += 1;
                } else {
                    break;
                }
            }

            // Only end the sentence when the terminator is followed by
            // whitespace or the end of input, so `3.14` stays intact.
            let next_character = characters.get(lookahead).map(|(_, value)| *value);
            if next_character.is_none() || next_character.is_some_and(char::is_whitespace) {
                let token_start = start.expect("sentence start should exist");
                tokens.push(Token::new(
                    TokenKind::Sentence,
                    input[token_start..sentence_end].to_owned(),
                    token_start,
                    sentence_end,
                ));
                start = None;
                last_non_whitespace_end = sentence_end;
                index = lookahead;
                continue;
            }
        }

        index += 1;
    }

    // Trailing text without a terminator becomes a final sentence, trimmed of
    // trailing whitespace via `last_non_whitespace_end`.
    if let Some(token_start) = start {
        tokens.push(Token::new(
            TokenKind::Sentence,
            input[token_start..last_non_whitespace_end].to_owned(),
            token_start,
            last_non_whitespace_end,
        ));
    }

    tokens
}

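/// Splits `input` into one [`TokenKind::Char`] token per Unicode scalar
/// value. Spans are byte ranges, so a multi-byte character produces a span
/// wider than one.
///
/// A minimal usage sketch; the crate name `tokenizer` is an assumption, so
/// the doc test is marked `ignore`:
///
/// ```ignore
/// use tokenizer::tokenize_chars;
///
/// let tokens = tokenize_chars("A🙂");
/// assert_eq!(tokens[1].text, "🙂");
/// assert_eq!(tokens[1].span.start, 1); // '🙂' is four bytes: 1..5
/// assert_eq!(tokens[1].span.end, 5);
/// ```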
pub fn tokenize_chars(input: &str) -> Vec<Token> {
    input
        .char_indices()
        .map(|(start, character)| {
            let end = start + character.len_utf8();
            Token::new(TokenKind::Char, character.to_string(), start, end)
        })
        .collect()
}

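/// Returns the number of word tokens in `input`, equivalent to
/// `tokenize_words(input).len()`.
///
/// A minimal usage sketch; the crate name `tokenizer` is an assumption, so
/// the doc test is marked `ignore`:
///
/// ```ignore
/// use tokenizer::token_count;
///
/// assert_eq!(token_count("Hello, world!"), 2);
/// assert_eq!(token_count("\t "), 0);
/// ```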
pub fn token_count(input: &str) -> usize {
    tokenize_words(input).len()
}

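// Byte ranges of word tokens: maximal runs of alphanumeric characters, with
// an apostrophe counted as a word character when it sits between two
// alphanumerics (see `is_apostrophe`).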
fn word_ranges(input: &str) -> Vec<(usize, usize)> {
    let characters: Vec<(usize, char)> = input.char_indices().collect();
    let mut ranges = Vec::new();
    let mut start = None;

    for (index, (byte_index, character)) in characters.iter().copied().enumerate() {
        let previous = index.checked_sub(1).map(|value| characters[value].1);
        let next = characters.get(index + 1).map(|(_, value)| *value);
        let is_word_character =
            character.is_alphanumeric() || is_apostrophe(previous, character, next);

        if is_word_character {
            if start.is_none() {
                start = Some(byte_index);
            }
        } else if let Some(token_start) = start.take() {
            ranges.push((token_start, byte_index));
        }
    }

    if let Some(token_start) = start {
        ranges.push((token_start, input.len()));
    }

    ranges
}

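// An apostrophe is part of a word only between two alphanumeric characters,
// so `don't` stays whole while leading and trailing quotes are excluded.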
fn is_apostrophe(previous: Option<char>, current: char, next: Option<char>) -> bool {
    matches!(current, '\'' | '’')
        && previous.is_some_and(char::is_alphanumeric)
        && next.is_some_and(char::is_alphanumeric)
}

#[cfg(test)]
mod tests {
    use super::{
        TokenKind, TokenizerOptions, token_count, tokenize_chars, tokenize_sentences,
        tokenize_whitespace, tokenize_words,
    };

    #[test]
    fn handles_empty_and_whitespace_only_input() {
        assert!(tokenize_whitespace("").is_empty());
        assert!(tokenize_words(" \n").is_empty());
        assert_eq!(token_count("\t "), 0);
    }

    #[test]
    fn tokenizes_whitespace_and_tracks_spans() {
        let tokens = tokenize_whitespace(" hello world ");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].kind, TokenKind::Text);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].span.start, 1);
        assert_eq!(tokens[1].span.end, 13);
    }

    #[test]
    fn tokenizes_words_with_punctuation_and_apostrophes() {
        let tokens = tokenize_words("Hello, world! don't-stop");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["Hello", "world", "don't", "stop"]);
        assert!(tokens.iter().all(|token| token.kind == TokenKind::Word));
    }

    #[test]
    fn tokenizes_sentences_and_multiline_text() {
        let tokens = tokenize_sentences("One. Two!\nThree");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["One.", "Two!", "Three"]);
    }

    #[test]
    fn tokenizes_unicode_characters() {
        let tokens = tokenize_chars("A🙂");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].text, "🙂");
        assert_eq!(tokens[1].span.start, 1);
        assert_eq!(tokens[1].span.end, 5);
    }

    #[test]
    fn tokenizes_unicode_words_conservatively() {
        let tokens = tokenize_words("naïve façade");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["naïve", "façade"]);
        assert!(TokenizerOptions::default().trim_empty);
    }
}