// anno/tokenizer.rs
1//! Language-specific tokenization for multilingual NLP.
2//!
3//! This module provides a trait-based tokenization system that supports
4//! different languages and scripts. Unlike ML backends which use transformer
5//! tokenizers, this is for **statistical methods** (keywords, summarization)
6//! that need language-aware word segmentation.
7//!
8//! # Research Context
9//!
10//! Tokenization varies dramatically by language:
11//!
12//! | Language Family | Tokenization Method | Example |
13//! |----------------|---------------------|---------|
14//! | English, Spanish, French | Whitespace + punctuation | "Hello world" → ["Hello", "world"] |
15//! | Chinese, Japanese | Word segmentation (jieba, MeCab) | "中华人民共和国" → ["中华人民共和国"] or ["中华", "人民", "共和国"] |
16//! | Thai | No spaces, needs segmentation | "ประเทศไทย" → ["ประเทศไทย"] |
17//! | Arabic | Morphological analysis (clitics) | "وأبوه" → ["و", "أب", "ه"] |
18//! | Korean | Morphological analysis | "서울시" → ["서울", "시"] |
19//!
20//! # Usage
21//!
22//! ```rust
23//! use anno::lang::Language;
24//! use anno::tokenizer::{Tokenizer, WhitespaceTokenizer};
25//!
26//! let tokenizer = WhitespaceTokenizer::new();
27//! let tokens = tokenizer.tokenize("Hello world", Some(&Language::English));
28//! assert_eq!(tokens.len(), 2);
29//! ```
30//!
31//! # Future: Language-Specific Implementations
32//!
33//! - `JiebaTokenizer` for Chinese
34//! - `MecabTokenizer` for Japanese
35//! - `KonlpyTokenizer` for Korean
36//! - `UnicodeSegmenter` using UAX#29 for fallback
37
38use crate::lang::Language;
39
/// A single token produced by a [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Surface form exactly as it appears in the input text.
    pub surface: String,
    /// Normalized form (lemma), when the tokenizer provides one.
    pub lemma: Option<String>,
    /// Part-of-speech tag, when the tokenizer provides one.
    pub pos: Option<String>,
    /// Start offset of the token in the input.
    /// NOTE(review): documented as a character offset, but the tokenizers
    /// in this file store *byte* offsets (`char_indices` yields byte
    /// positions and the slices `text[start..end]` require them) — the two
    /// only agree for ASCII input. Confirm which is intended.
    pub start: usize,
    /// End offset (exclusive) of the token in the input; see `start`.
    pub end: usize,
}

impl Token {
    /// Build a token from its surface form and its span in the input.
    pub fn new(surface: impl Into<String>, start: usize, end: usize) -> Self {
        Token {
            surface: surface.into(),
            lemma: None,
            pos: None,
            start,
            end,
        }
    }

    /// Builder-style setter: attach a lemma (normalized form).
    pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
        self.lemma = Some(lemma.into());
        self
    }

    /// Builder-style setter: attach a part-of-speech tag.
    pub fn with_pos(mut self, pos: impl Into<String>) -> Self {
        self.pos = Some(pos.into());
        self
    }

    /// Best available normalized form: the lemma when present,
    /// otherwise the raw surface form.
    pub fn normalized(&self) -> &str {
        match self.lemma {
            Some(ref lemma) => lemma,
            None => &self.surface,
        }
    }
}
84
85/// Trait for language-specific tokenization.
86///
87/// Implementations should handle:
88/// - Word segmentation (whitespace, morphological, statistical)
89/// - Stopword detection
90/// - Case normalization (if applicable)
91/// - Script-specific rules (CJK, Arabic, etc.)
92pub trait Tokenizer: Send + Sync {
93    /// Tokenize text into a sequence of tokens.
94    ///
95    /// # Arguments
96    /// - `text`: Input text to tokenize
97    /// - `language`: Optional language hint (ISO 639-1 code or Language enum)
98    ///
99    /// # Returns
100    /// Vector of tokens with positions and optional linguistic annotations.
101    fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>;
102
103    /// Check if a token is a stopword (common function words to ignore).
104    ///
105    /// Default implementation returns `false` (no stopwords).
106    /// Language-specific implementations should override this.
107    fn is_stopword(&self, token: &Token, language: Option<&Language>) -> bool {
108        let _ = (token, language);
109        false
110    }
111
112    /// Get the tokenizer name/identifier.
113    fn name(&self) -> &'static str;
114}
115
/// Simple whitespace-based tokenizer (English, Spanish, French, etc.).
///
/// Splits on whitespace and punctuation; works for languages with
/// clear word boundaries.
pub struct WhitespaceTokenizer {
    /// When true, each non-whitespace separator character is emitted
    /// as its own token.
    include_punctuation: bool,
}

impl WhitespaceTokenizer {
    /// Create a tokenizer that drops punctuation (the default).
    pub fn new() -> Self {
        Self::default()
    }

    /// Builder-style toggle for emitting punctuation tokens.
    pub fn with_punctuation(mut self, include: bool) -> Self {
        self.include_punctuation = include;
        self
    }
}

impl Default for WhitespaceTokenizer {
    fn default() -> Self {
        WhitespaceTokenizer {
            include_punctuation: false,
        }
    }
}
145
146impl Tokenizer for WhitespaceTokenizer {
147    fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
148        let mut tokens = Vec::new();
149        let mut in_word = false;
150        let mut word_start = 0;
151
152        for (i, c) in text.char_indices() {
153            let is_word_char = c.is_alphanumeric() || c == '_';
154
155            if is_word_char {
156                if !in_word {
157                    word_start = i;
158                    in_word = true;
159                }
160            } else {
161                if in_word {
162                    // End of word
163                    let word: String = text[word_start..i].chars().collect();
164                    if !word.is_empty() {
165                        tokens.push(Token::new(word, word_start, i));
166                    }
167                    in_word = false;
168                }
169
170                if self.include_punctuation && !c.is_whitespace() {
171                    // Add punctuation as separate token
172                    let punct: String = c.to_string();
173                    tokens.push(Token::new(punct, i, i + c.len_utf8()));
174                }
175            }
176            // Note: current_start was unused, removed assignment
177        }
178
179        // Handle word at end of text
180        if in_word {
181            let word: String = text[word_start..].chars().collect();
182            if !word.is_empty() {
183                tokens.push(Token::new(word, word_start, text.len()));
184            }
185        }
186
187        tokens
188    }
189
190    fn name(&self) -> &'static str {
191        "whitespace"
192    }
193}
194
/// Unicode segmentation-based tokenizer (fallback for CJK and other languages).
///
/// Intended to follow Unicode Standard Annex #29 (UAX#29) word boundaries.
/// A reasonable fallback, but language-specific tokenizers are preferred.
pub struct UnicodeSegmenter;

impl UnicodeSegmenter {
    /// Create a new Unicode segmenter (stateless unit struct).
    pub fn new() -> Self {
        UnicodeSegmenter
    }
}

impl Default for UnicodeSegmenter {
    fn default() -> Self {
        UnicodeSegmenter
    }
}
213
214impl Tokenizer for UnicodeSegmenter {
215    fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
216        // For now, use character-based segmentation for CJK
217        // In production, use unicode-segmentation crate or language-specific tools
218        let mut tokens = Vec::new();
219        let mut start = 0;
220
221        for (i, c) in text.char_indices() {
222            // Simple heuristic: split on whitespace and punctuation
223            if c.is_whitespace() || c.is_ascii_punctuation() {
224                if start < i {
225                    let word: String = text[start..i].chars().collect();
226                    if !word.trim().is_empty() {
227                        tokens.push(Token::new(word, start, i));
228                    }
229                }
230                start = i + c.len_utf8();
231            }
232        }
233
234        // Handle remaining text
235        if start < text.len() {
236            let word: String = text[start..].chars().collect();
237            if !word.trim().is_empty() {
238                tokens.push(Token::new(word, start, text.len()));
239            }
240        }
241
242        tokens
243    }
244
245    fn name(&self) -> &'static str {
246        "unicode_segmenter"
247    }
248}
249
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_whitespace_tokenizer() {
        let tokens =
            WhitespaceTokenizer::new().tokenize("Hello world", Some(&Language::English));

        assert_eq!(tokens.len(), 2);
        let first = &tokens[0];
        assert_eq!(first.surface, "Hello");
        assert_eq!((first.start, first.end), (0, 5));
        let second = &tokens[1];
        assert_eq!(second.surface, "world");
        assert_eq!((second.start, second.end), (6, 11));
    }

    #[test]
    fn test_whitespace_tokenizer_punctuation() {
        let tokenizer = WhitespaceTokenizer::new().with_punctuation(true);
        let tokens = tokenizer.tokenize("Hello, world!", Some(&Language::English));

        // With punctuation enabled we expect at least the two words;
        // ',' and '!' should also appear as their own tokens.
        assert!(tokens.len() >= 2);
        assert_eq!(tokens[0].surface, "Hello");
    }

    #[test]
    fn test_unicode_segmenter_cjk() {
        let tokens =
            UnicodeSegmenter::new().tokenize("北京是中国的首都", Some(&Language::Chinese));

        // Fallback segmentation must produce at least one token.
        assert!(!tokens.is_empty());
    }
}