// anno/tokenizer.rs
//! Language-specific tokenization for multilingual NLP.
//!
//! This module provides a trait-based tokenization system that supports
//! different languages and scripts. Unlike ML backends which use transformer
//! tokenizers, this is for **statistical methods** (keywords, summarization)
//! that need language-aware word segmentation.
//!
//! # Research Context
//!
//! Tokenization varies dramatically by language:
//!
//! | Language Family | Tokenization Method | Example |
//! |----------------|---------------------|---------|
//! | English, Spanish, French | Whitespace + punctuation | "Hello world" → ["Hello", "world"] |
//! | Chinese, Japanese | Word segmentation (jieba, MeCab) | "中华人民共和国" → ["中华人民共和国"] or ["中华", "人民", "共和国"] |
//! | Thai | No spaces, needs segmentation | "ประเทศไทย" → ["ประเทศไทย"] |
//! | Arabic | Morphological analysis (clitics) | "وأبوه" → ["و", "أب", "ه"] |
//! | Korean | Morphological analysis | "서울시" → ["서울", "시"] |
//!
//! # Usage
//!
//! ```rust
//! use anno::lang::Language;
//! use anno::tokenizer::{Tokenizer, WhitespaceTokenizer};
//!
//! let tokenizer = WhitespaceTokenizer::new();
//! let tokens = tokenizer.tokenize("Hello world", Some(&Language::English));
//! assert_eq!(tokens.len(), 2);
//! ```
//!
//! # Future: Language-Specific Implementations
//!
//! - `JiebaTokenizer` for Chinese
//! - `MecabTokenizer` for Japanese
//! - `KonlpyTokenizer` for Korean
//! - `UnicodeSegmenter` using UAX#29 for fallback

use crate::lang::Language;

/// A single token produced by a [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Surface form exactly as it appears in the input text.
    pub surface: String,
    /// Normalized form (lemma), when the tokenizer provides one.
    pub lemma: Option<String>,
    /// Offset of the token's first byte in the input. The tokenizers in
    /// this module emit `char_indices` positions, i.e. byte offsets.
    pub pos: Option<String>,
    /// Part-of-speech tag, when the tokenizer provides one.
    pub start: usize,
    /// Offset one past the token's last byte in the input.
    pub end: usize,
}

impl Token {
    /// Build a token carrying only a surface form and its span;
    /// `lemma` and `pos` start out as `None`.
    pub fn new(surface: impl Into<String>, start: usize, end: usize) -> Self {
        Token {
            surface: surface.into(),
            lemma: None,
            pos: None,
            start,
            end,
        }
    }

    /// Builder-style setter attaching a lemma.
    pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
        self.lemma = Some(lemma.into());
        self
    }

    /// Builder-style setter attaching a part-of-speech tag.
    pub fn with_pos(mut self, pos: impl Into<String>) -> Self {
        self.pos = Some(pos.into());
        self
    }

    /// The form callers should match on: the lemma when present,
    /// the raw surface otherwise.
    pub fn normalized(&self) -> &str {
        match self.lemma {
            Some(ref lemma) => lemma,
            None => &self.surface,
        }
    }
}

85/// Trait for language-specific tokenization.
86///
87/// Implementations should handle:
88/// - Word segmentation (whitespace, morphological, statistical)
89/// - Stopword detection
90/// - Case normalization (if applicable)
91/// - Script-specific rules (CJK, Arabic, etc.)
92pub trait Tokenizer: Send + Sync {
93 /// Tokenize text into a sequence of tokens.
94 ///
95 /// # Arguments
96 /// - `text`: Input text to tokenize
97 /// - `language`: Optional language hint (ISO 639-1 code or Language enum)
98 ///
99 /// # Returns
100 /// Vector of tokens with positions and optional linguistic annotations.
101 fn tokenize(&self, text: &str, language: Option<&Language>) -> Vec<Token>;
102
103 /// Check if a token is a stopword (common function words to ignore).
104 ///
105 /// Default implementation returns `false` (no stopwords).
106 /// Language-specific implementations should override this.
107 fn is_stopword(&self, token: &Token, language: Option<&Language>) -> bool {
108 let _ = (token, language);
109 false
110 }
111
112 /// Get the tokenizer name/identifier.
113 fn name(&self) -> &'static str;
114}
115
/// Simple whitespace-based tokenizer (English, Spanish, French, etc.).
///
/// Splits on whitespace and punctuation; suitable for languages whose
/// orthography marks word boundaries explicitly.
pub struct WhitespaceTokenizer {
    /// When `true`, each punctuation character becomes its own token.
    include_punctuation: bool,
}

impl WhitespaceTokenizer {
    /// Tokenizer that discards punctuation (the default behavior).
    pub fn new() -> Self {
        WhitespaceTokenizer {
            include_punctuation: false,
        }
    }

    /// Builder-style toggle for emitting punctuation as separate tokens.
    pub fn with_punctuation(mut self, include: bool) -> Self {
        self.include_punctuation = include;
        self
    }
}

impl Default for WhitespaceTokenizer {
    fn default() -> Self {
        WhitespaceTokenizer::new()
    }
}

146impl Tokenizer for WhitespaceTokenizer {
147 fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
148 let mut tokens = Vec::new();
149 let mut in_word = false;
150 let mut word_start = 0;
151
152 for (i, c) in text.char_indices() {
153 let is_word_char = c.is_alphanumeric() || c == '_';
154
155 if is_word_char {
156 if !in_word {
157 word_start = i;
158 in_word = true;
159 }
160 } else {
161 if in_word {
162 // End of word
163 let word: String = text[word_start..i].chars().collect();
164 if !word.is_empty() {
165 tokens.push(Token::new(word, word_start, i));
166 }
167 in_word = false;
168 }
169
170 if self.include_punctuation && !c.is_whitespace() {
171 // Add punctuation as separate token
172 let punct: String = c.to_string();
173 tokens.push(Token::new(punct, i, i + c.len_utf8()));
174 }
175 }
176 // Note: current_start was unused, removed assignment
177 }
178
179 // Handle word at end of text
180 if in_word {
181 let word: String = text[word_start..].chars().collect();
182 if !word.is_empty() {
183 tokens.push(Token::new(word, word_start, text.len()));
184 }
185 }
186
187 tokens
188 }
189
190 fn name(&self) -> &'static str {
191 "whitespace"
192 }
193}
194
/// Unicode segmentation-based tokenizer (fallback for CJK and other scripts).
///
/// Intended to follow Unicode Standard Annex #29 (UAX#29) word boundaries.
/// A reasonable fallback, but language-specific tokenizers are preferred.
pub struct UnicodeSegmenter;

impl UnicodeSegmenter {
    /// Construct the (stateless) segmenter.
    pub fn new() -> Self {
        UnicodeSegmenter
    }
}

impl Default for UnicodeSegmenter {
    fn default() -> Self {
        UnicodeSegmenter::new()
    }
}

214impl Tokenizer for UnicodeSegmenter {
215 fn tokenize(&self, text: &str, _language: Option<&Language>) -> Vec<Token> {
216 // For now, use character-based segmentation for CJK
217 // In production, use unicode-segmentation crate or language-specific tools
218 let mut tokens = Vec::new();
219 let mut start = 0;
220
221 for (i, c) in text.char_indices() {
222 // Simple heuristic: split on whitespace and punctuation
223 if c.is_whitespace() || c.is_ascii_punctuation() {
224 if start < i {
225 let word: String = text[start..i].chars().collect();
226 if !word.trim().is_empty() {
227 tokens.push(Token::new(word, start, i));
228 }
229 }
230 start = i + c.len_utf8();
231 }
232 }
233
234 // Handle remaining text
235 if start < text.len() {
236 let word: String = text[start..].chars().collect();
237 if !word.trim().is_empty() {
238 tokens.push(Token::new(word, start, text.len()));
239 }
240 }
241
242 tokens
243 }
244
245 fn name(&self) -> &'static str {
246 "unicode_segmenter"
247 }
248}
249
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_whitespace_tokenizer() {
        let tokenizer = WhitespaceTokenizer::new();
        let tokens = tokenizer.tokenize("Hello world", Some(&Language::English));

        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].surface, "Hello");
        assert_eq!(tokens[0].start, 0);
        assert_eq!(tokens[0].end, 5);
        assert_eq!(tokens[1].surface, "world");
        assert_eq!(tokens[1].start, 6);
        assert_eq!(tokens[1].end, 11);
    }

    #[test]
    fn test_whitespace_tokenizer_punctuation() {
        let tokenizer = WhitespaceTokenizer::new().with_punctuation(true);
        let tokens = tokenizer.tokenize("Hello, world!", Some(&Language::English));

        // Words and punctuation interleave: "Hello", ",", "world", "!".
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].surface, "Hello");
        assert_eq!(tokens[1].surface, ",");
        assert_eq!(tokens[2].surface, "world");
        assert_eq!(tokens[3].surface, "!");
        // Punctuation spans are single characters.
        assert_eq!((tokens[1].start, tokens[1].end), (5, 6));
        assert_eq!((tokens[3].start, tokens[3].end), (12, 13));
    }

    #[test]
    fn test_unicode_segmenter_cjk() {
        let tokenizer = UnicodeSegmenter::new();
        let tokens = tokenizer.tokenize("北京是中国的首都", Some(&Language::Chinese));

        // The fallback keeps an unbroken CJK run as a single token.
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].surface, "北京是中国的首都");
    }
}