anno_core/coalesce/
script.rs1use serde::{Deserialize, Serialize};
6
/// Writing system assigned to a piece of text by [`Script::detect`].
///
/// Each variant other than [`Script::Mixed`] corresponds to the Unicode code
/// point ranges that `detect` tests for that script.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Script {
    /// Code points up to U+024F (ASCII, Latin-1 Supplement, Latin Extended-A/B).
    Latin,
    /// CJK Unified Ideographs (U+4E00–U+9FFF) and Extension A (U+3400–U+4DBF).
    Cjk,
    /// Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF).
    Kana,
    /// Hangul Syllables (U+AC00–U+D7AF) and Hangul Jamo (U+1100–U+11FF).
    Hangul,
    /// Arabic (U+0600–U+06FF) and Arabic Supplement (U+0750–U+077F).
    Arabic,
    /// Cyrillic (U+0400–U+04FF) and Cyrillic Supplement (U+0500–U+052F).
    Cyrillic,
    /// Devanagari (U+0900–U+097F).
    Devanagari,
    /// Greek and Coptic (U+0370–U+03FF) and Greek Extended (U+1F00–U+1FFF).
    Greek,
    /// Hebrew (U+0590–U+05FF).
    Hebrew,
    /// Thai (U+0E00–U+0E7F).
    Thai,
    /// No single recognized script: returned by `detect` for input without
    /// countable characters, for text in which several scripts are each
    /// significant, and for text dominated by characters outside the ranges
    /// listed on the other variants.
    Mixed,
}
33
34impl Script {
35 pub fn detect(s: &str) -> Self {
41 #[inline(always)]
43 fn in_range(cp: u32, start: u32, end: u32) -> bool {
44 start <= cp && cp <= end
45 }
46
47 let mut counts = [0u32; 11]; let mut total_chars = 0u32;
49
50 for c in s.chars() {
51 if c.is_whitespace() || c.is_ascii_punctuation() {
53 continue;
54 }
55 total_chars += 1;
56
57 let cp = c as u32;
58 if cp <= 0x007F || in_range(cp, 0x0080, 0x024F) {
60 counts[0] += 1; } else if in_range(cp, 0x4E00, 0x9FFF) || in_range(cp, 0x3400, 0x4DBF) {
62 counts[1] += 1; } else if in_range(cp, 0x3040, 0x309F) || in_range(cp, 0x30A0, 0x30FF) {
64 counts[2] += 1; } else if in_range(cp, 0xAC00, 0xD7AF) || in_range(cp, 0x1100, 0x11FF) {
66 counts[3] += 1; } else if in_range(cp, 0x0600, 0x06FF) || in_range(cp, 0x0750, 0x077F) {
68 counts[4] += 1; } else if in_range(cp, 0x0400, 0x04FF) || in_range(cp, 0x0500, 0x052F) {
70 counts[5] += 1; } else if in_range(cp, 0x0900, 0x097F) {
72 counts[6] += 1; } else if in_range(cp, 0x0370, 0x03FF) || in_range(cp, 0x1F00, 0x1FFF) {
74 counts[7] += 1; } else if in_range(cp, 0x0590, 0x05FF) {
76 counts[8] += 1; } else if in_range(cp, 0x0E00, 0x0E7F) {
78 counts[9] += 1; } else {
80 counts[10] += 1; }
82 }
83
84 if total_chars == 0 {
85 return Script::Mixed;
86 }
87
88 let threshold = ((total_chars as f32 * 0.2) as u32).max(1);
91 let significant_scripts = counts.iter().filter(|&&c| c >= threshold).count();
92
93 if significant_scripts >= 2 {
95 return Script::Mixed;
96 }
97
98 let scripts = [
100 Script::Latin,
101 Script::Cjk,
102 Script::Kana,
103 Script::Hangul,
104 Script::Arabic,
105 Script::Cyrillic,
106 Script::Devanagari,
107 Script::Greek,
108 Script::Hebrew,
109 Script::Thai,
110 Script::Mixed,
111 ];
112
113 let max_idx = counts
114 .iter()
115 .enumerate()
116 .max_by_key(|(_, &count)| count)
117 .map(|(i, _)| i)
118 .unwrap_or(10);
119
120 scripts[max_idx]
121 }
122
123 pub fn has_word_boundaries(&self) -> bool {
125 matches!(
126 self,
127 Script::Latin
128 | Script::Cyrillic
129 | Script::Greek
130 | Script::Arabic
131 | Script::Hebrew
132 | Script::Devanagari
133 )
134 }
135}
136
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `Script::detect` yields `expected` for each sample.
    fn expect_script(expected: Script, samples: &[&str]) {
        for sample in samples.iter() {
            assert_eq!(Script::detect(sample), expected);
        }
    }

    /// Plain ASCII names and phrases are Latin.
    #[test]
    fn test_script_detection_latin() {
        expect_script(Script::Latin, &["Hello World", "Marie Curie"]);
    }

    /// Han-only strings of increasing length are CJK.
    #[test]
    fn test_script_detection_cjk() {
        expect_script(
            Script::Cjk,
            &["北京", "中华人民共和国", "中华人民共和国是伟大的国家"],
        );
    }

    /// Han text followed by a Latin gloss is reported as mixed.
    #[test]
    fn test_script_detection_mixed() {
        expect_script(Script::Mixed, &["東京 (Tokyo)"]);
    }
}