pub mod bpe;

use crate::error::{Result, TextError};
use lazy_static::lazy_static;
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub use bpe::{BpeConfig, BpeTokenizer, BpeVocabulary};

lazy_static! {
    static ref WORD_PATTERN: Regex = Regex::new(r"\b\w+\b").unwrap();
    static ref SENTENCE_PATTERN: Regex = Regex::new(r"[^.!?]+[.!?]").unwrap();
}

/// Common interface implemented by all tokenizers in this module.
pub trait Tokenizer {
    /// Split `text` into tokens.
    fn tokenize(&self, text: &str) -> Result<Vec<String>>;

    /// Tokenize a batch of texts, returning one token list per input.
    fn tokenize_batch(&self, texts: &[&str]) -> Result<Vec<Vec<String>>> {
        texts.iter().map(|text| self.tokenize(text)).collect()
    }

    /// Clone this tokenizer into a boxed trait object.
    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync>;
}

/// Tokenizer that splits text into words.
#[derive(Debug, Clone)]
pub struct WordTokenizer {
    lowercase: bool,
    pattern: Option<Regex>,
}

impl WordTokenizer {
    /// Create a word tokenizer, optionally lowercasing the input first.
    pub fn new(lowercase: bool) -> Self {
        Self {
            lowercase,
            pattern: None,
        }
    }

    /// Create a word tokenizer that matches tokens with a custom regex pattern.
    pub fn with_pattern(lowercase: bool, pattern: &str) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                lowercase,
                pattern: Some(regex),
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Default for WordTokenizer {
    fn default() -> Self {
        Self::new(true)
    }
}

impl Tokenizer for WordTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let text = if self.lowercase {
            text.to_lowercase()
        } else {
            text.to_string()
        };

        let tokens = match &self.pattern {
            Some(pattern) => pattern
                .find_iter(&text)
                .map(|m| m.as_str().to_string())
                .collect(),
            None => WORD_PATTERN
                .find_iter(&text)
                .map(|m| m.as_str().to_string())
                .collect(),
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer that splits text into sentences.
#[derive(Debug, Clone)]
pub struct SentenceTokenizer {
    pattern: Option<Regex>,
}

impl SentenceTokenizer {
    /// Create a sentence tokenizer using the default sentence-boundary pattern.
    pub fn new() -> Self {
        Self { pattern: None }
    }

    /// Create a sentence tokenizer that matches sentences with a custom regex pattern.
    pub fn with_pattern(pattern: &str) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                pattern: Some(regex),
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Default for SentenceTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer for SentenceTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = match &self.pattern {
            Some(pattern) => pattern
                .find_iter(text)
                .map(|m| m.as_str().trim().to_string())
                .collect(),
            None => SENTENCE_PATTERN
                .find_iter(text)
                .map(|m| m.as_str().trim().to_string())
                .collect(),
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer that splits text into individual characters or grapheme clusters.
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
    use_grapheme_clusters: bool,
}

impl CharacterTokenizer {
    /// Create a character tokenizer; when `use_grapheme_clusters` is true,
    /// extended grapheme clusters are emitted instead of single `char`s.
    pub fn new(use_grapheme_clusters: bool) -> Self {
        Self {
            use_grapheme_clusters,
        }
    }
}

impl Default for CharacterTokenizer {
    fn default() -> Self {
        Self::new(true)
    }
}

impl Tokenizer for CharacterTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = if self.use_grapheme_clusters {
            text.graphemes(true).map(|g| g.to_string()).collect()
        } else {
            text.chars().map(|c| c.to_string()).collect()
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer that produces word n-grams.
#[derive(Debug, Clone)]
pub struct NgramTokenizer {
    /// Maximum n-gram size.
    n: usize,
    /// Minimum n-gram size.
    min_n: usize,
    only_alphanumeric: bool,
    separator: String,
}

impl NgramTokenizer {
    /// Create a tokenizer that produces n-grams of exactly size `n`.
    pub fn new(n: usize) -> Result<Self> {
        if n == 0 {
            return Err(TextError::TokenizationError(
                "N-gram size must be greater than 0".to_string(),
            ));
        }

        Ok(Self {
            n,
            min_n: n,
            only_alphanumeric: false,
            separator: " ".to_string(),
        })
    }

    /// Create a tokenizer that produces n-grams of every size in `min_n..=max_n`.
    pub fn with_range(min_n: usize, max_n: usize) -> Result<Self> {
        if min_n == 0 || max_n < min_n {
            return Err(TextError::TokenizationError(
                "Invalid n-gram range".to_string(),
            ));
        }

        Ok(Self {
            n: max_n,
            min_n,
            only_alphanumeric: false,
            separator: " ".to_string(),
        })
    }

    /// Keep only words made entirely of alphanumeric characters.
    pub fn only_alphanumeric(mut self, value: bool) -> Self {
        self.only_alphanumeric = value;
        self
    }

    /// Set the string used to join the words of each n-gram.
    pub fn with_separator(mut self, separator: String) -> Self {
        self.separator = separator;
        self
    }

    /// Join every window of `n` consecutive tokens with the separator.
    fn extract_ngrams(&self, tokens: &[String], n: usize) -> Vec<String> {
        if tokens.len() < n {
            return Vec::new();
        }

        tokens
            .windows(n)
            .map(|window| window.join(&self.separator))
            .collect()
    }
}

impl Tokenizer for NgramTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let word_tokenizer = WordTokenizer::new(true);
        let words = word_tokenizer.tokenize(text)?;

        let filtered_words = if self.only_alphanumeric {
            words
                .into_iter()
                .filter(|w| w.chars().all(|c| c.is_alphanumeric()))
                .collect()
        } else {
            words
        };

        let mut ngrams = Vec::new();

        for n in self.min_n..=self.n {
            ngrams.extend(self.extract_ngrams(&filtered_words, n));
        }

        Ok(ngrams)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer driven by an arbitrary regular expression.
#[derive(Debug, Clone)]
pub struct RegexTokenizer {
    pattern: Regex,
    gaps: bool,
}

impl RegexTokenizer {
    /// Create a regex tokenizer. When `gaps` is true the pattern describes the
    /// separators between tokens; otherwise it describes the tokens themselves.
    pub fn new(pattern: &str, gaps: bool) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                pattern: regex,
                gaps,
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Tokenizer for RegexTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = if self.gaps {
            // Pattern matches separators: split on it and drop empty pieces.
            self.pattern
                .split(text)
                .filter(|s| !s.is_empty())
                .map(|s| s.to_string())
                .collect()
        } else {
            // Pattern matches tokens: collect every match.
            self.pattern
                .find_iter(text)
                .map(|m| m.as_str().to_string())
                .collect()
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer that splits text on Unicode whitespace.
#[derive(Debug, Clone)]
pub struct WhitespaceTokenizer;

impl WhitespaceTokenizer {
    /// Create a whitespace tokenizer.
    pub fn new() -> Self {
        Self
    }
}

impl Default for WhitespaceTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer for WhitespaceTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        Ok(text.split_whitespace().map(|s| s.to_string()).collect())
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_word_tokenizer() {
        let tokenizer = WordTokenizer::default();
        let text = "Hello, world! This is a test.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "this", "is", "a", "test"]);
    }
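
    // Illustrative addition (not part of the original test suite): the default
    // `tokenize_batch` implementation applies `tokenize` to each input text.
    #[test]
    fn test_word_tokenizer_batch() {
        let tokenizer = WordTokenizer::default();
        let texts = ["Hello world", "Rust is fun"];
        let batches = tokenizer.tokenize_batch(&texts).unwrap();
        assert_eq!(batches.len(), 2);
        assert_eq!(batches[0], vec!["hello", "world"]);
        assert_eq!(batches[1], vec!["rust", "is", "fun"]);
    }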

    #[test]
    fn test_word_tokenizer_custom_pattern() {
        let tokenizer = WordTokenizer::with_pattern(false, r"\w+").unwrap();
        let text = "Hello, world! This is a test.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["Hello", "world", "This", "is", "a", "test"]);
    }
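
    // Illustrative addition: an invalid pattern surfaces as a tokenization
    // error instead of panicking.
    #[test]
    fn test_word_tokenizer_invalid_pattern() {
        assert!(WordTokenizer::with_pattern(true, "(unclosed").is_err());
    }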

    #[test]
    fn test_sentence_tokenizer() {
        let tokenizer = SentenceTokenizer::default();
        let text = "Hello, world! This is a test. How are you today?";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(
            tokens,
            vec!["Hello, world!", "This is a test.", "How are you today?"]
        );
    }
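
    // Illustrative addition: `with_pattern` lets callers swap in their own
    // sentence-boundary regex; here each non-empty line is treated as a sentence.
    #[test]
    fn test_sentence_tokenizer_custom_pattern() {
        let tokenizer = SentenceTokenizer::with_pattern(r"[^\n]+").unwrap();
        let tokens = tokenizer.tokenize("first line\nsecond line").unwrap();
        assert_eq!(tokens, vec!["first line", "second line"]);
    }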

    #[test]
    fn test_character_tokenizer() {
        let tokenizer = CharacterTokenizer::new(false);
        let text = "Hello";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["H", "e", "l", "l", "o"]);
    }

    #[test]
    fn test_grapheme_tokenizer() {
        let tokenizer = CharacterTokenizer::default();
        let text = "café";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["c", "a", "f", "é"]);
    }
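
    // Illustrative addition: with grapheme clusters (the default), a combining
    // accent stays attached to its base character, while the char-level
    // tokenizer splits it into two code points.
    #[test]
    fn test_grapheme_vs_char_combining_mark() {
        let text = "a\u{0301}"; // 'a' followed by U+0301 COMBINING ACUTE ACCENT
        let graphemes = CharacterTokenizer::new(true).tokenize(text).unwrap();
        let chars = CharacterTokenizer::new(false).tokenize(text).unwrap();
        assert_eq!(graphemes.len(), 1);
        assert_eq!(chars.len(), 2);
    }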

    #[test]
    fn test_ngram_tokenizer() {
        let tokenizer = NgramTokenizer::new(2).unwrap();
        let text = "hello world test";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello world", "world test"]);
    }

    #[test]
    fn test_ngram_tokenizer_range() {
        let tokenizer = NgramTokenizer::with_range(1, 2).unwrap();
        let text = "hello world";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "hello world"]);
    }

    #[test]
    fn test_ngram_tokenizer_alphanumeric() {
        let tokenizer = NgramTokenizer::new(2).unwrap().only_alphanumeric(true);
        let text = "hello, world! test123";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello world", "world test123"]);
    }
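
    // Illustrative addition: the builder-style separator is used to join each
    // window of words, and degenerate n-gram sizes are rejected up front.
    #[test]
    fn test_ngram_tokenizer_separator_and_errors() {
        let tokenizer = NgramTokenizer::new(2)
            .unwrap()
            .with_separator("_".to_string());
        let tokens = tokenizer.tokenize("hello world test").unwrap();
        assert_eq!(tokens, vec!["hello_world", "world_test"]);

        assert!(NgramTokenizer::new(0).is_err());
        assert!(NgramTokenizer::with_range(2, 1).is_err());
    }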

    #[test]
    fn test_regex_tokenizer_matches() {
        let tokenizer = RegexTokenizer::new(r"\b\w+\b", false).unwrap();
        let text = "Hello, world! Test 123.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["Hello", "world", "Test", "123"]);
    }

    #[test]
    fn test_regex_tokenizer_gaps() {
        let tokenizer = RegexTokenizer::new(r"\s*,\s*", true).unwrap();
        let text = "apple, banana, cherry";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["apple", "banana", "cherry"]);
    }
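
    // Illustrative addition: `clone_box` duplicates a tokenizer behind a
    // `Box<dyn Tokenizer + Send + Sync>` trait object.
    #[test]
    fn test_clone_box_trait_object() {
        let boxed: Box<dyn Tokenizer + Send + Sync> = Box::new(WhitespaceTokenizer::new());
        let cloned = boxed.clone_box();
        assert_eq!(
            boxed.tokenize("one two").unwrap(),
            cloned.tokenize("one two").unwrap()
        );
    }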

    #[test]
    fn test_whitespace_tokenizer() {
        let tokenizer = WhitespaceTokenizer::new();
        let text = "hello world\ttest\nline";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "test", "line"]);
    }
}