scirs2_text/tokenize.rs

//! Text tokenization utilities
//!
//! This module provides functionality for tokenizing text into
//! words, sentences, or characters.
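//!
//! # Examples
//!
//! A minimal sketch of word tokenization, assuming this module is exposed as
//! `scirs2_text::tokenize`:
//!
//! ```
//! use scirs2_text::tokenize::{Tokenizer, WordTokenizer};
//!
//! let tokenizer = WordTokenizer::default();
//! let tokens = tokenizer.tokenize("Hello, world!").unwrap();
//! assert_eq!(tokens, vec!["hello", "world"]);
//! ```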

pub mod bpe;

use crate::error::{Result, TextError};
use lazy_static::lazy_static;
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub use bpe::{BpeConfig, BpeTokenizer, BpeVocabulary};

lazy_static! {
    static ref WORD_PATTERN: Regex = Regex::new(r"\b\w+\b").unwrap();
    static ref SENTENCE_PATTERN: Regex = Regex::new(r"[^.!?]+[.!?]").unwrap();
}

/// Trait for tokenizing text
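///
/// # Examples
///
/// A minimal sketch using [`WordTokenizer`] (the `scirs2_text::tokenize` import
/// path is assumed); `tokenize_batch` simply maps `tokenize` over each input.
///
/// ```
/// use scirs2_text::tokenize::{Tokenizer, WordTokenizer};
///
/// let tokenizer = WordTokenizer::default();
/// assert_eq!(tokenizer.tokenize("one two").unwrap(), vec!["one", "two"]);
///
/// let batches = tokenizer.tokenize_batch(&["a b", "c"]).unwrap();
/// assert_eq!(batches, vec![vec!["a", "b"], vec!["c"]]);
/// ```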
pub trait Tokenizer {
    /// Tokenize the input text into tokens
    fn tokenize(&self, text: &str) -> Result<Vec<String>>;

    /// Tokenize a batch of texts
    fn tokenize_batch(&self, texts: &[&str]) -> Result<Vec<Vec<String>>> {
        texts.iter().map(|text| self.tokenize(text)).collect()
    }

    /// Clone the tokenizer (for use in parallel processing)
    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync>;
}

/// Tokenizer for splitting text into words
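///
/// # Examples
///
/// A minimal sketch (import path assumed); `new(false)` preserves case, while the
/// default tokenizer lowercases its input.
///
/// ```
/// use scirs2_text::tokenize::{Tokenizer, WordTokenizer};
///
/// let tokenizer = WordTokenizer::new(false);
/// assert_eq!(
///     tokenizer.tokenize("Rust is Fun").unwrap(),
///     vec!["Rust", "is", "Fun"]
/// );
/// ```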
#[derive(Debug, Clone)]
pub struct WordTokenizer {
    lowercase: bool,
    pattern: Option<Regex>,
}

impl WordTokenizer {
    /// Create a new word tokenizer
    pub fn new(lowercase: bool) -> Self {
        Self {
            lowercase,
            pattern: None,
        }
    }

    /// Create a new word tokenizer with a custom pattern
    pub fn withpattern(lowercase: bool, pattern: &str) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                lowercase,
                pattern: Some(regex),
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Default for WordTokenizer {
    fn default() -> Self {
        Self::new(true)
    }
}

impl Tokenizer for WordTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let text = if self.lowercase {
            text.to_lowercase()
        } else {
            text.to_string()
        };

        let tokens = match &self.pattern {
            Some(pattern) => pattern
                .find_iter(&text)
                .map(|m| m.as_str().to_string())
                .collect(),
            None => WORD_PATTERN
                .find_iter(&text)
                .map(|m| m.as_str().to_string())
                .collect(),
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer for splitting text into sentences
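///
/// # Examples
///
/// A minimal sketch (import path assumed); the default pattern splits on `.`, `!`, and `?`.
///
/// ```
/// use scirs2_text::tokenize::{SentenceTokenizer, Tokenizer};
///
/// let tokenizer = SentenceTokenizer::default();
/// let sentences = tokenizer.tokenize("Hello there. How are you?").unwrap();
/// assert_eq!(sentences, vec!["Hello there.", "How are you?"]);
/// ```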
#[derive(Debug, Clone)]
pub struct SentenceTokenizer {
    pattern: Option<Regex>,
}

impl SentenceTokenizer {
    /// Create a new sentence tokenizer
    pub fn new() -> Self {
        Self { pattern: None }
    }

    /// Create a new sentence tokenizer with a custom pattern
    pub fn withpattern(pattern: &str) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                pattern: Some(regex),
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Default for SentenceTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer for SentenceTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = match &self.pattern {
            Some(pattern) => pattern
                .find_iter(text)
                .map(|m| m.as_str().trim().to_string())
                .collect(),
            None => SENTENCE_PATTERN
                .find_iter(text)
                .map(|m| m.as_str().trim().to_string())
                .collect(),
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer for splitting text into characters or grapheme clusters
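///
/// # Examples
///
/// A minimal sketch (import path assumed) showing why grapheme clusters matter:
/// a combining character sequence is one grapheme but two `char`s.
///
/// ```
/// use scirs2_text::tokenize::{CharacterTokenizer, Tokenizer};
///
/// let text = "e\u{0301}"; // 'e' followed by a combining acute accent
/// assert_eq!(CharacterTokenizer::new(true).tokenize(text).unwrap().len(), 1);
/// assert_eq!(CharacterTokenizer::new(false).tokenize(text).unwrap().len(), 2);
/// ```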
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
    use_grapheme_clusters: bool,
}

impl CharacterTokenizer {
    /// Create a new character tokenizer
    pub fn new(use_grapheme_clusters: bool) -> Self {
        Self {
            use_grapheme_clusters,
        }
    }
}

impl Default for CharacterTokenizer {
    fn default() -> Self {
        Self::new(true)
    }
}

impl Tokenizer for CharacterTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = if self.use_grapheme_clusters {
            text.graphemes(true).map(|g| g.to_string()).collect()
        } else {
            text.chars().map(|c| c.to_string()).collect()
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Tokenizer for extracting n-grams from text
#[derive(Debug, Clone)]
pub struct NgramTokenizer {
    n: usize,
    min_n: usize,
    only_alphanumeric: bool,
    separator: String,
}

impl NgramTokenizer {
    /// Create a new n-gram tokenizer
    pub fn new(n: usize) -> Result<Self> {
        if n == 0 {
            return Err(TextError::TokenizationError(
                "N-gram size must be greater than 0".to_string(),
            ));
        }

        Ok(Self {
            n,
            min_n: n,
            only_alphanumeric: false,
            separator: " ".to_string(),
        })
    }

    /// Create an n-gram tokenizer with a range of n values
    pub fn with_range(min_n: usize, max_n: usize) -> Result<Self> {
        if min_n == 0 || max_n < min_n {
            return Err(TextError::TokenizationError(
                "Invalid n-gram range".to_string(),
            ));
        }

        Ok(Self {
            n: max_n,
            min_n,
            only_alphanumeric: false,
            separator: " ".to_string(),
        })
    }

    /// Set whether to only include alphanumeric tokens
    pub fn only_alphanumeric(mut self, value: bool) -> Self {
        self.only_alphanumeric = value;
        self
    }

    /// Set the separator for n-grams
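    ///
    /// # Examples
    ///
    /// A minimal sketch (import path assumed); n-grams are joined with the given separator.
    ///
    /// ```
    /// use scirs2_text::tokenize::{NgramTokenizer, Tokenizer};
    ///
    /// let tokenizer = NgramTokenizer::new(2).unwrap().with_separator("_".to_string());
    /// assert_eq!(tokenizer.tokenize("a b c").unwrap(), vec!["a_b", "b_c"]);
    /// ```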
    pub fn with_separator(mut self, separator: String) -> Self {
        self.separator = separator;
        self
    }

    /// Extract n-grams from a sequence of tokens
    fn extract_ngrams(&self, tokens: &[String], n: usize) -> Vec<String> {
        if tokens.len() < n {
            return Vec::new();
        }

        tokens
            .windows(n)
            .map(|window| window.join(&self.separator))
            .collect()
    }
}

impl Tokenizer for NgramTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        // First tokenize into words
        let word_tokenizer = WordTokenizer::new(true);
        let words = word_tokenizer.tokenize(text)?;

        let filtered_words = if self.only_alphanumeric {
            words
                .into_iter()
                .filter(|w| w.chars().all(|c| c.is_alphanumeric()))
                .collect()
        } else {
            words
        };

        let mut ngrams = Vec::new();

        // Extract n-grams for each n in the range
        for n in self.min_n..=self.n {
            ngrams.extend(self.extract_ngrams(&filtered_words, n));
        }

        Ok(ngrams)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Regular expression based tokenizer
#[derive(Debug, Clone)]
pub struct RegexTokenizer {
    pattern: Regex,
    gaps: bool,
}

impl RegexTokenizer {
    /// Create a new regex tokenizer
    ///
    /// # Arguments
    /// * `pattern` - The regex pattern to use
    /// * `gaps` - If true, the pattern matches token separators. If false, it matches tokens.
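    ///
    /// # Examples
    ///
    /// A minimal sketch of both modes (import path assumed):
    ///
    /// ```
    /// use scirs2_text::tokenize::{RegexTokenizer, Tokenizer};
    ///
    /// // `gaps = false`: the pattern matches the tokens themselves.
    /// let words = RegexTokenizer::new(r"\w+", false).unwrap();
    /// assert_eq!(words.tokenize("a-b c").unwrap(), vec!["a", "b", "c"]);
    ///
    /// // `gaps = true`: the pattern matches the separators between tokens.
    /// let csv = RegexTokenizer::new(r",\s*", true).unwrap();
    /// assert_eq!(csv.tokenize("a, b, c").unwrap(), vec!["a", "b", "c"]);
    /// ```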
    pub fn new(pattern: &str, gaps: bool) -> Result<Self> {
        match Regex::new(pattern) {
            Ok(regex) => Ok(Self {
                pattern: regex,
                gaps,
            }),
            Err(e) => Err(TextError::TokenizationError(format!(
                "Invalid regex pattern: {e}"
            ))),
        }
    }
}

impl Tokenizer for RegexTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let tokens = if self.gaps {
            // Pattern matches separators
            self.pattern
                .split(text)
                .filter(|s| !s.is_empty())
                .map(|s| s.to_string())
                .collect()
        } else {
            // Pattern matches tokens
            self.pattern
                .find_iter(text)
                .map(|m| m.as_str().to_string())
                .collect()
        };

        Ok(tokens)
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

/// Whitespace tokenizer that splits on any whitespace character
#[derive(Debug, Clone)]
pub struct WhitespaceTokenizer;

impl WhitespaceTokenizer {
    /// Create a new whitespace tokenizer
    pub fn new() -> Self {
        Self
    }
}

impl Default for WhitespaceTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer for WhitespaceTokenizer {
    fn tokenize(&self, text: &str) -> Result<Vec<String>> {
        if text.trim().is_empty() {
            return Ok(Vec::new());
        }

        Ok(text.split_whitespace().map(|s| s.to_string()).collect())
    }

    fn clone_box(&self) -> Box<dyn Tokenizer + Send + Sync> {
        Box::new(self.clone())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_word_tokenizer() {
        let tokenizer = WordTokenizer::default();
        let text = "Hello, world! This is a test.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "this", "is", "a", "test"]);
    }

    #[test]
    fn test_word_tokenizer_custompattern() {
        let tokenizer = WordTokenizer::withpattern(false, r"\w+").unwrap();
        let text = "Hello, world! This is a test.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["Hello", "world", "This", "is", "a", "test"]);
    }

    #[test]
    fn test_sentence_tokenizer() {
        let tokenizer = SentenceTokenizer::default();
        let text = "Hello, world! This is a test. How are you today?";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(
            tokens,
            vec!["Hello, world!", "This is a test.", "How are you today?"]
        );
    }

    #[test]
    fn test_character_tokenizer() {
        let tokenizer = CharacterTokenizer::new(false);
        let text = "Hello";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["H", "e", "l", "l", "o"]);
    }

    #[test]
    fn test_grapheme_tokenizer() {
        let tokenizer = CharacterTokenizer::default();
        let text = "café";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["c", "a", "f", "é"]);
    }

    #[test]
    fn test_ngram_tokenizer() {
        let tokenizer = NgramTokenizer::new(2).unwrap();
        let text = "hello world test";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello world", "world test"]);
    }

    #[test]
    fn test_ngram_tokenizer_range() {
        let tokenizer = NgramTokenizer::with_range(1, 2).unwrap();
        let text = "hello world";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "hello world"]);
    }

    #[test]
    fn test_ngram_tokenizer_alphanumeric() {
        let tokenizer = NgramTokenizer::new(2).unwrap().only_alphanumeric(true);
        let text = "hello, world! test123";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello world", "world test123"]);
    }

    #[test]
    fn test_regex_tokenizer_matches() {
        let tokenizer = RegexTokenizer::new(r"\b\w+\b", false).unwrap();
        let text = "Hello, world! Test 123.";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["Hello", "world", "Test", "123"]);
    }

    #[test]
    fn test_regex_tokenizer_gaps() {
        let tokenizer = RegexTokenizer::new(r"\s*,\s*", true).unwrap();
        let text = "apple, banana, cherry";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["apple", "banana", "cherry"]);
    }

    #[test]
    fn test_whitespace_tokenizer() {
        let tokenizer = WhitespaceTokenizer::new();
        let text = "hello   world\ttest\nline";
        let tokens = tokenizer.tokenize(text).unwrap();
        assert_eq!(tokens, vec!["hello", "world", "test", "line"]);
    }
479}