scirs2_text/tokenization/
unicode_normalizer.rs

1//! Unicode normalization and language-agnostic tokenization utilities.
2//!
3//! Provides:
4//! - [`Script`]: Unicode script detection for individual characters.
5//! - [`UnicodeNormalizer`]: NFC/NFD normalization, accent stripping, case folding.
6//! - Language-agnostic tokenization that handles CJK character segmentation.
7
8use unicode_normalization::UnicodeNormalization;
9
10// ─── Script detection ─────────────────────────────────────────────────────────
11
/// Coarse Unicode script category of a single character.
///
/// The category is meant to help pick a tokenization strategy: CJK text is
/// usually segmented by putting whitespace around each character, while the
/// remaining scripts are split into whitespace-delimited words.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Script {
    /// Latin letters, including the Latin Extended ranges.
    Latin,
    /// CJK Unified Ideographs and the related CJK blocks.
    Cjk,
    /// Cyrillic letters.
    Cyrillic,
    /// Arabic letters.
    Arabic,
    /// Devanagari letters (Hindi, Sanskrit, and others).
    Devanagari,
    /// Hebrew letters.
    Hebrew,
    /// Fallback for any character outside the scripts listed above.
    Other,
}
35
36/// Detect the [`Script`] for a single Unicode character.
37///
38/// Uses Unicode block ranges.  Characters that straddle multiple blocks
39/// (e.g. punctuation) fall into [`Script::Other`].
40pub fn detect_script(c: char) -> Script {
41    let cp = c as u32;
42
43    // CJK ranges
44    if (0x4E00..=0x9FFF).contains(&cp)   // CJK Unified Ideographs
45        || (0x3400..=0x4DBF).contains(&cp)  // CJK Extension A
46        || (0x20000..=0x2A6DF).contains(&cp) // CJK Extension B
47        || (0x2A700..=0x2B73F).contains(&cp) // CJK Extension C
48        || (0x2B740..=0x2B81F).contains(&cp) // CJK Extension D
49        || (0x2B820..=0x2CEAF).contains(&cp) // CJK Extension E
50        || (0xF900..=0xFAFF).contains(&cp)  // CJK Compatibility Ideographs
51        || (0x2F800..=0x2FA1F).contains(&cp) // CJK Compatibility Supplement
52        || (0x3000..=0x303F).contains(&cp)  // CJK Symbols and Punctuation
53        || (0x3040..=0x309F).contains(&cp)  // Hiragana
54        || (0x30A0..=0x30FF).contains(&cp)
55    // Katakana
56    {
57        return Script::Cjk;
58    }
59
60    // Cyrillic U+0400–U+04FF
61    if (0x0400..=0x04FF).contains(&cp) {
62        return Script::Cyrillic;
63    }
64
65    // Arabic U+0600–U+06FF
66    if (0x0600..=0x06FF).contains(&cp) {
67        return Script::Arabic;
68    }
69
70    // Devanagari U+0900–U+097F
71    if (0x0900..=0x097F).contains(&cp) {
72        return Script::Devanagari;
73    }
74
75    // Hebrew U+0590–U+05FF
76    if (0x0590..=0x05FF).contains(&cp) {
77        return Script::Hebrew;
78    }
79
80    // Latin: Basic Latin letters + Latin-1 Supplement + Latin Extended-A/B
81    if (0x0041..=0x005A).contains(&cp)   // A-Z
82        || (0x0061..=0x007A).contains(&cp) // a-z
83        || (0x00C0..=0x00D6).contains(&cp)
84        || (0x00D8..=0x00F6).contains(&cp)
85        || (0x00F8..=0x024F).contains(&cp)
86    // Latin Extended-A and B
87    {
88        return Script::Latin;
89    }
90
91    Script::Other
92}
93
94// ─── NormForm ─────────────────────────────────────────────────────────────────
95
/// Unicode normalization form selectable through [`UnicodeNormalizerConfig`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum NormForm {
    /// NFC: canonical decomposition followed by canonical composition.
    Nfc,
    /// NFD: canonical decomposition only.
    Nfd,
}
105
106// ─── UnicodeNormalizerConfig ───────────────────────────────────────────────────
107
108/// Configuration for [`UnicodeNormalizer`].
109#[derive(Debug, Clone)]
110pub struct UnicodeNormalizerConfig {
111    /// Normalization form to apply.
112    pub form: NormForm,
113    /// Strip combining diacritical marks (accent removal).
114    pub strip_accents: bool,
115    /// Fold all characters to lowercase.
116    pub lowercase: bool,
117    /// Insert whitespace around CJK characters to facilitate word splitting.
118    pub tokenize_cjk: bool,
119}
120
121impl Default for UnicodeNormalizerConfig {
122    fn default() -> Self {
123        UnicodeNormalizerConfig {
124            form: NormForm::Nfc,
125            strip_accents: false,
126            lowercase: false,
127            tokenize_cjk: true,
128        }
129    }
130}
131
132// ─── UnicodeNormalizer ────────────────────────────────────────────────────────
133
134/// Unicode-aware text normalizer.
135///
136/// Supports NFC/NFD normalization, accent stripping, case folding, and
137/// language-agnostic CJK tokenization.
138///
139/// # Example
140///
141/// ```rust
142/// use scirs2_text::tokenization::unicode_normalizer::{UnicodeNormalizer, UnicodeNormalizerConfig, NormForm};
143///
144/// let config = UnicodeNormalizerConfig {
145///     form: NormForm::Nfc,
146///     strip_accents: true,
147///     lowercase: true,
148///     tokenize_cjk: true,
149/// };
150/// let normalizer = UnicodeNormalizer::new(config);
151/// let tokens = normalizer.tokenize_language_agnostic("Héllo 世界");
152/// assert!(tokens.len() >= 3); // "hello", "世", "界"
153/// ```
154#[derive(Debug, Clone)]
155pub struct UnicodeNormalizer {
156    config: UnicodeNormalizerConfig,
157}
158
159impl UnicodeNormalizer {
160    /// Create a new [`UnicodeNormalizer`] with the given configuration.
161    pub fn new(config: UnicodeNormalizerConfig) -> Self {
162        UnicodeNormalizer { config }
163    }
164
165    /// Create a normalizer with default settings.
166    pub fn default_normalizer() -> Self {
167        UnicodeNormalizer::new(UnicodeNormalizerConfig::default())
168    }
169
170    /// Normalize `text` according to the configuration.
171    ///
172    /// Steps applied in order:
173    /// 1. Lowercase (if configured)
174    /// 2. NFD decomposition + accent stripping (if configured)
175    /// 3. NFC composition (if configured, after potential NFD strip)
176    pub fn normalize(&self, text: &str) -> String {
177        // Step 1: Lowercase
178        let s = if self.config.lowercase {
179            text.to_lowercase()
180        } else {
181            text.to_owned()
182        };
183
184        // Step 2 & 3: Normalize form + optional accent strip
185        match self.config.form {
186            NormForm::Nfd => {
187                if self.config.strip_accents {
188                    // NFD then remove combining marks
189                    s.nfd().filter(|&c| !is_combining_diacritic(c)).collect()
190                } else {
191                    s.nfd().collect()
192                }
193            }
194            NormForm::Nfc => {
195                if self.config.strip_accents {
196                    // NFD decompose → strip accents → NFC recompose
197                    let stripped: String =
198                        s.nfd().filter(|&c| !is_combining_diacritic(c)).collect();
199                    stripped.nfc().collect()
200                } else {
201                    s.nfc().collect()
202                }
203            }
204        }
205    }
206
207    /// Tokenize `text` in a language-agnostic manner.
208    ///
209    /// Algorithm:
210    /// 1. Normalize the text.
211    /// 2. Insert whitespace around CJK characters (when `tokenize_cjk` is set).
212    /// 3. Split on Unicode whitespace.
213    /// 4. Filter empty tokens.
214    ///
215    /// This approach works across scripts without any language-specific logic.
216    pub fn tokenize_language_agnostic(&self, text: &str) -> Vec<String> {
217        let normalized = self.normalize(text);
218
219        let mut spaced = String::with_capacity(normalized.len() * 2);
220        for ch in normalized.chars() {
221            if self.config.tokenize_cjk && is_cjk_character(ch) {
222                // Surround each CJK character with spaces so it becomes its own token
223                spaced.push(' ');
224                spaced.push(ch);
225                spaced.push(' ');
226            } else {
227                spaced.push(ch);
228            }
229        }
230
231        spaced
232            .split(|c: char| c.is_whitespace())
233            .filter(|s| !s.is_empty())
234            .map(|s| s.to_owned())
235            .collect()
236    }
237
238    /// Return the configuration.
239    pub fn config(&self) -> &UnicodeNormalizerConfig {
240        &self.config
241    }
242}
243
244impl Default for UnicodeNormalizer {
245    fn default() -> Self {
246        UnicodeNormalizer::new(UnicodeNormalizerConfig::default())
247    }
248}
249
250// ─── Helpers ─────────────────────────────────────────────────────────────────
251
/// Report whether `ch` lies in one of the combining-diacritic blocks that
/// accent stripping removes.
///
/// Blocks covered:
/// - Combining Diacritical Marks (U+0300–U+036F)
/// - Combining Diacritical Marks Extended (U+1AB0–U+1AFF)
/// - Combining Diacritical Marks Supplement (U+1DC0–U+1DFF)
/// - Combining Half Marks (U+FE20–U+FE2F)
fn is_combining_diacritic(ch: char) -> bool {
    matches!(
        ch,
        '\u{0300}'..='\u{036F}'
            | '\u{1AB0}'..='\u{1AFF}'
            | '\u{1DC0}'..='\u{1DFF}'
            | '\u{FE20}'..='\u{FE2F}'
    )
}
265
/// Report whether `ch` is a CJK character that tokenization isolates as a
/// token of its own.
///
/// Covers the ideograph blocks plus Hiragana and Katakana.  The CJK Symbols
/// and Punctuation block (U+3000–U+303F) is not included.
fn is_cjk_character(ch: char) -> bool {
    matches!(
        ch,
        '\u{3040}'..='\u{309F}' // Hiragana
            | '\u{30A0}'..='\u{30FF}' // Katakana
            | '\u{3400}'..='\u{4DBF}' // CJK Extension A
            | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
            | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
            | '\u{20000}'..='\u{2A6DF}' // CJK Extension B
            | '\u{2A700}'..='\u{2B73F}' // CJK Extension C
            | '\u{2B740}'..='\u{2B81F}' // CJK Extension D
            | '\u{2B820}'..='\u{2CEAF}' // CJK Extension E
            | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Supplement
    )
}
280
281// ─── Tests ───────────────────────────────────────────────────────────────────
282
#[cfg(test)]
mod tests {
    use super::*;

    // ── Shared helpers ──────────────────────────────────────────────────

    /// Build a normalizer with every option spelled out.
    fn normalizer(
        form: NormForm,
        strip_accents: bool,
        lowercase: bool,
        tokenize_cjk: bool,
    ) -> UnicodeNormalizer {
        UnicodeNormalizer::new(UnicodeNormalizerConfig {
            form,
            strip_accents,
            lowercase,
            tokenize_cjk,
        })
    }

    /// Build a normalizer that differs from the defaults only in `lowercase`.
    fn normalizer_with_lowercase(lowercase: bool) -> UnicodeNormalizer {
        UnicodeNormalizer::new(UnicodeNormalizerConfig {
            lowercase,
            ..UnicodeNormalizerConfig::default()
        })
    }

    /// Assert that every character in `chars` is classified as `expected`.
    fn assert_script(expected: Script, chars: &[char]) {
        for &ch in chars {
            assert_eq!(detect_script(ch), expected);
        }
    }

    /// Report whether `tokens` contains exactly the token `wanted`.
    fn has_token(tokens: &[String], wanted: &str) -> bool {
        tokens.iter().any(|t| t == wanted)
    }

    // ── detect_script ───────────────────────────────────────────────────

    #[test]
    fn test_detect_script_latin() {
        // U+00E9 is LATIN SMALL LETTER E WITH ACUTE (precomposed).
        assert_script(Script::Latin, &['a', 'Z', '\u{00E9}']);
    }

    #[test]
    fn test_detect_script_cjk() {
        // 中, 日, 語
        assert_script(Script::Cjk, &['\u{4E2D}', '\u{65E5}', '\u{8A9E}']);
    }

    #[test]
    fn test_detect_script_cyrillic() {
        // А, я
        assert_script(Script::Cyrillic, &['\u{0410}', '\u{044F}']);
    }

    #[test]
    fn test_detect_script_arabic() {
        // ع, م
        assert_script(Script::Arabic, &['\u{0639}', '\u{0645}']);
    }

    #[test]
    fn test_detect_script_devanagari() {
        // क, ा
        assert_script(Script::Devanagari, &['\u{0915}', '\u{093E}']);
    }

    #[test]
    fn test_detect_script_hebrew() {
        // א, ש
        assert_script(Script::Hebrew, &['\u{05D0}', '\u{05E9}']);
    }

    #[test]
    fn test_detect_script_other() {
        assert_script(Script::Other, &['!', ' ', '1']);
    }

    // ── UnicodeNormalizer::normalize ────────────────────────────────────

    #[test]
    fn test_normalize_lowercase() {
        let out = normalizer_with_lowercase(true).normalize("Hello WORLD");
        assert_eq!(out, "hello world");
    }

    #[test]
    fn test_normalize_no_lowercase() {
        let out = normalizer_with_lowercase(false).normalize("Hello WORLD");
        assert_eq!(out, "Hello WORLD");
    }

    #[test]
    fn test_normalize_strip_accents_nfc() {
        // "café" loses its accent and comes back as plain ASCII.
        let n = normalizer(NormForm::Nfc, true, false, false);
        assert_eq!(n.normalize("café"), "cafe");
    }

    #[test]
    fn test_normalize_strip_accents_nfd() {
        let n = normalizer(NormForm::Nfd, true, false, false);
        assert_eq!(n.normalize("résumé"), "resume");
    }

    #[test]
    fn test_normalize_nfc_idempotent_on_ascii() {
        let n = normalizer(NormForm::Nfc, false, false, false);
        let text = "hello world 123";
        assert_eq!(n.normalize(text), text);
    }

    // ── tokenize_language_agnostic ───────────────────────────────────────

    #[test]
    fn test_cjk_chars_split() {
        let n = normalizer(NormForm::Nfc, false, false, true);
        let tokens = n.tokenize_language_agnostic("Hello世界");
        // The Latin run stays whole; each ideograph becomes its own token.
        for wanted in ["Hello", "世", "界"] {
            assert!(has_token(&tokens, wanted), "got: {:?}", tokens);
        }
    }

    #[test]
    fn test_cjk_split_mixed_text() {
        let tokens = UnicodeNormalizer::default().tokenize_language_agnostic("我 love Rust");
        // "我" is CJK and must show up as a token of its own.
        for wanted in ["我", "love", "Rust"] {
            assert!(has_token(&tokens, wanted), "got: {:?}", tokens);
        }
    }

    #[test]
    fn test_tokenize_latin_only() {
        let tokens = UnicodeNormalizer::default().tokenize_language_agnostic("the quick brown fox");
        let expected = ["the", "quick", "brown", "fox"];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_tokenize_empty() {
        let tokens = UnicodeNormalizer::default().tokenize_language_agnostic("   ");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_lowercase_and_accent_strip() {
        let n = normalizer(NormForm::Nfc, true, true, true);
        let tokens = n.tokenize_language_agnostic("Héllo Wörld");
        for wanted in ["hello", "world"] {
            assert!(has_token(&tokens, wanted), "got: {:?}", tokens);
        }
    }

    // ── Helpers ──────────────────────────────────────────────────────────

    #[test]
    fn test_combining_mark_detection() {
        // U+0301 COMBINING ACUTE ACCENT plus both ends of the main block.
        for mark in ['\u{0301}', '\u{0300}', '\u{036F}'] {
            assert!(is_combining_diacritic(mark));
        }
        // Plain ASCII and a precomposed letter (one code point) are not marks.
        for not_mark in ['a', '\u{00E9}'] {
            assert!(!is_combining_diacritic(not_mark));
        }
    }

    #[test]
    fn test_cjk_character_detection() {
        // 中, 日, あ (Hiragana), ア (Katakana)
        for cjk in ['\u{4E2D}', '\u{65E5}', '\u{3042}', '\u{30A2}'] {
            assert!(is_cjk_character(cjk));
        }
        for not_cjk in ['a', '1', ' '] {
            assert!(!is_cjk_character(not_cjk));
        }
    }
}
scirs2_text/tokenization/unicode_normalizer.rs

scirs2_text/tokenization/
unicode_normalizer.rs