Skip to main content

anno_core/coalesce/
script.rs

1//! Unicode script detection for routing similarity algorithms.
2//!
3//! Extracted from similarity.rs to isolate potential compiler issues.
4
5use serde::{Deserialize, Serialize};
6
7/// Unicode script categories for routing similarity algorithms.
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
9pub enum Script {
10    /// Latin script (English, French, German, etc.)
11    Latin,
12    /// CJK (Chinese, Japanese Kanji, Korean Hanja)
13    Cjk,
14    /// Japanese Hiragana/Katakana
15    Kana,
16    /// Korean Hangul
17    Hangul,
18    /// Arabic script
19    Arabic,
20    /// Cyrillic script (Russian, etc.)
21    Cyrillic,
22    /// Devanagari (Hindi, Sanskrit, etc.)
23    Devanagari,
24    /// Greek script
25    Greek,
26    /// Hebrew script
27    Hebrew,
28    /// Thai script
29    Thai,
30    /// Mixed or unknown
31    Mixed,
32}
33
34impl Script {
35    /// Detect the dominant script in a string.
36    ///
37    /// Returns the script that appears most frequently.
38    /// For mixed scripts (e.g., "東京 (Tokyo)"), returns Mixed if multiple scripts
39    /// have significant presence (>= 20% of characters).
40    pub fn detect(s: &str) -> Self {
41        // Helper function to check if codepoint is in range
42        #[inline(always)]
43        fn in_range(cp: u32, start: u32, end: u32) -> bool {
44            start <= cp && cp <= end
45        }
46
47        let mut counts = [0u32; 11]; // One per Script variant
48        let mut total_chars = 0u32;
49
50        for c in s.chars() {
51            // Skip whitespace and punctuation for script detection
52            if c.is_whitespace() || c.is_ascii_punctuation() {
53                continue;
54            }
55            total_chars += 1;
56
57            let cp = c as u32;
58            // Use explicit range checks with helper to avoid compiler issues
59            if cp <= 0x007F || in_range(cp, 0x0080, 0x024F) {
60                counts[0] += 1; // Latin
61            } else if in_range(cp, 0x4E00, 0x9FFF) || in_range(cp, 0x3400, 0x4DBF) {
62                counts[1] += 1; // CJK
63            } else if in_range(cp, 0x3040, 0x309F) || in_range(cp, 0x30A0, 0x30FF) {
64                counts[2] += 1; // Kana
65            } else if in_range(cp, 0xAC00, 0xD7AF) || in_range(cp, 0x1100, 0x11FF) {
66                counts[3] += 1; // Hangul
67            } else if in_range(cp, 0x0600, 0x06FF) || in_range(cp, 0x0750, 0x077F) {
68                counts[4] += 1; // Arabic
69            } else if in_range(cp, 0x0400, 0x04FF) || in_range(cp, 0x0500, 0x052F) {
70                counts[5] += 1; // Cyrillic
71            } else if in_range(cp, 0x0900, 0x097F) {
72                counts[6] += 1; // Devanagari
73            } else if in_range(cp, 0x0370, 0x03FF) || in_range(cp, 0x1F00, 0x1FFF) {
74                counts[7] += 1; // Greek
75            } else if in_range(cp, 0x0590, 0x05FF) {
76                counts[8] += 1; // Hebrew
77            } else if in_range(cp, 0x0E00, 0x0E7F) {
78                counts[9] += 1; // Thai
79            } else {
80                counts[10] += 1; // Other
81            }
82        }
83
84        if total_chars == 0 {
85            return Script::Mixed;
86        }
87
88        // Check if multiple scripts have significant presence (>= 20%)
89        // Use at least 1 as threshold to avoid counting zero-count scripts
90        let threshold = ((total_chars as f32 * 0.2) as u32).max(1);
91        let significant_scripts = counts.iter().filter(|&&c| c >= threshold).count();
92
93        // If 2+ scripts are significant, return Mixed
94        if significant_scripts >= 2 {
95            return Script::Mixed;
96        }
97
98        // Find dominant script
99        let scripts = [
100            Script::Latin,
101            Script::Cjk,
102            Script::Kana,
103            Script::Hangul,
104            Script::Arabic,
105            Script::Cyrillic,
106            Script::Devanagari,
107            Script::Greek,
108            Script::Hebrew,
109            Script::Thai,
110            Script::Mixed,
111        ];
112
113        let max_idx = counts
114            .iter()
115            .enumerate()
116            .max_by_key(|(_, &count)| count)
117            .map(|(i, _)| i)
118            .unwrap_or(10);
119
120        scripts[max_idx]
121    }
122
123    /// Whether this script uses word boundaries (spaces).
124    pub fn has_word_boundaries(&self) -> bool {
125        matches!(
126            self,
127            Script::Latin
128                | Script::Cyrillic
129                | Script::Greek
130                | Script::Arabic
131                | Script::Hebrew
132                | Script::Devanagari
133        )
134    }
135}
136
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `Script::detect` yields `expected` for every input.
    fn assert_detects(expected: Script, inputs: &[&str]) {
        for input in inputs {
            assert_eq!(Script::detect(input), expected);
        }
    }

    #[test]
    fn test_script_detection_latin() {
        assert_detects(Script::Latin, &["Hello World", "Marie Curie"]);
    }

    #[test]
    fn test_script_detection_cjk() {
        // Inputs of increasing length made of ideographs only (no spaces or
        // punctuation), so nothing but the CJK category is tallied.
        assert_detects(
            Script::Cjk,
            &["北京", "中华人民共和国", "中华人民共和国是伟大的国家"],
        );
    }

    #[test]
    fn test_script_detection_mixed() {
        // Ideographs and Latin letters each make up a large share of the text.
        assert_detects(Script::Mixed, &["東京 (Tokyo)"]);
    }
}