scirs2_text/tokenization/
unicode_normalizer.rs1use unicode_normalization::UnicodeNormalization;
9
/// Coarse writing-system category assigned to a single character by [`detect_script`].
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a wildcard arm
/// when matching on it, which leaves room for more scripts to be added later.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// ASCII letters `A-Z` / `a-z` and the accented or extended Latin letters in
    /// U+00C0..=U+024F (excluding U+00D7 `×` and U+00F7 `÷`).
    Latin,
    /// Han ideographs (including extensions and compatibility ideographs), Hiragana,
    /// Katakana, and the CJK Symbols and Punctuation block.
    Cjk,
    /// Code points in U+0400..=U+04FF.
    Cyrillic,
    /// Code points in U+0600..=U+06FF.
    Arabic,
    /// Code points in U+0900..=U+097F.
    Devanagari,
    /// Code points in U+0590..=U+05FF.
    Hebrew,
    /// Anything not covered by the ranges above (digits, ASCII punctuation, whitespace, ...).
    Other,
}
35
36pub fn detect_script(c: char) -> Script {
41 let cp = c as u32;
42
43 if (0x4E00..=0x9FFF).contains(&cp) || (0x3400..=0x4DBF).contains(&cp) || (0x20000..=0x2A6DF).contains(&cp) || (0x2A700..=0x2B73F).contains(&cp) || (0x2B740..=0x2B81F).contains(&cp) || (0x2B820..=0x2CEAF).contains(&cp) || (0xF900..=0xFAFF).contains(&cp) || (0x2F800..=0x2FA1F).contains(&cp) || (0x3000..=0x303F).contains(&cp) || (0x3040..=0x309F).contains(&cp) || (0x30A0..=0x30FF).contains(&cp)
55 {
57 return Script::Cjk;
58 }
59
60 if (0x0400..=0x04FF).contains(&cp) {
62 return Script::Cyrillic;
63 }
64
65 if (0x0600..=0x06FF).contains(&cp) {
67 return Script::Arabic;
68 }
69
70 if (0x0900..=0x097F).contains(&cp) {
72 return Script::Devanagari;
73 }
74
75 if (0x0590..=0x05FF).contains(&cp) {
77 return Script::Hebrew;
78 }
79
80 if (0x0041..=0x005A).contains(&cp) || (0x0061..=0x007A).contains(&cp) || (0x00C0..=0x00D6).contains(&cp)
84 || (0x00D8..=0x00F6).contains(&cp)
85 || (0x00F8..=0x024F).contains(&cp)
86 {
88 return Script::Latin;
89 }
90
91 Script::Other
92}
93
/// Unicode normalization form produced by [`UnicodeNormalizer::normalize`].
///
/// The enum is `#[non_exhaustive]`, so downstream crates must match it with a wildcard arm.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NormForm {
    /// Canonical composition: output is produced with `nfc()`.
    Nfc,
    /// Canonical decomposition: output is produced with `nfd()`.
    Nfd,
}
105
/// Settings that control how a [`UnicodeNormalizer`] normalizes and tokenizes text.
#[derive(Debug, Clone)]
pub struct UnicodeNormalizerConfig {
    /// Normalization form of the text returned by `normalize`.
    pub form: NormForm,
    /// When `true`, the text is canonically decomposed and every combining diacritical
    /// mark is dropped (for `NormForm::Nfc` the remainder is then recomposed).
    pub strip_accents: bool,
    /// When `true`, the text is lowercased with `str::to_lowercase` before normalization.
    pub lowercase: bool,
    /// When `true`, `tokenize_language_agnostic` emits every CJK character as its own token.
    pub tokenize_cjk: bool,
}
120
121impl Default for UnicodeNormalizerConfig {
122 fn default() -> Self {
123 UnicodeNormalizerConfig {
124 form: NormForm::Nfc,
125 strip_accents: false,
126 lowercase: false,
127 tokenize_cjk: true,
128 }
129 }
130}
131
/// Text normalizer and whitespace/CJK tokenizer driven by a [`UnicodeNormalizerConfig`].
///
/// The configuration is fixed at construction time and can be inspected through
/// `config()`.
#[derive(Debug, Clone)]
pub struct UnicodeNormalizer {
    /// Settings applied by `normalize` and `tokenize_language_agnostic`.
    config: UnicodeNormalizerConfig,
}
158
159impl UnicodeNormalizer {
160 pub fn new(config: UnicodeNormalizerConfig) -> Self {
162 UnicodeNormalizer { config }
163 }
164
165 pub fn default_normalizer() -> Self {
167 UnicodeNormalizer::new(UnicodeNormalizerConfig::default())
168 }
169
170 pub fn normalize(&self, text: &str) -> String {
177 let s = if self.config.lowercase {
179 text.to_lowercase()
180 } else {
181 text.to_owned()
182 };
183
184 match self.config.form {
186 NormForm::Nfd => {
187 if self.config.strip_accents {
188 s.nfd().filter(|&c| !is_combining_diacritic(c)).collect()
190 } else {
191 s.nfd().collect()
192 }
193 }
194 NormForm::Nfc => {
195 if self.config.strip_accents {
196 let stripped: String =
198 s.nfd().filter(|&c| !is_combining_diacritic(c)).collect();
199 stripped.nfc().collect()
200 } else {
201 s.nfc().collect()
202 }
203 }
204 }
205 }
206
207 pub fn tokenize_language_agnostic(&self, text: &str) -> Vec<String> {
217 let normalized = self.normalize(text);
218
219 let mut spaced = String::with_capacity(normalized.len() * 2);
220 for ch in normalized.chars() {
221 if self.config.tokenize_cjk && is_cjk_character(ch) {
222 spaced.push(' ');
224 spaced.push(ch);
225 spaced.push(' ');
226 } else {
227 spaced.push(ch);
228 }
229 }
230
231 spaced
232 .split(|c: char| c.is_whitespace())
233 .filter(|s| !s.is_empty())
234 .map(|s| s.to_owned())
235 .collect()
236 }
237
238 pub fn config(&self) -> &UnicodeNormalizerConfig {
240 &self.config
241 }
242}
243
244impl Default for UnicodeNormalizer {
245 fn default() -> Self {
246 UnicodeNormalizer::new(UnicodeNormalizerConfig::default())
247 }
248}
249
/// Returns `true` if `ch` lies in one of the Unicode combining-diacritic blocks
/// that accent stripping removes.
///
/// Base letters, including precomposed accented letters such as `é`, are not
/// combining marks and return `false`.
fn is_combining_diacritic(ch: char) -> bool {
    matches!(
        ch,
        '\u{0300}'..='\u{036F}' // Combining Diacritical Marks
            | '\u{1AB0}'..='\u{1AFF}' // Combining Diacritical Marks Extended
            | '\u{1DC0}'..='\u{1DFF}' // Combining Diacritical Marks Supplement
            | '\u{FE20}'..='\u{FE2F}' // Combining Half Marks
    )
}
265
/// Returns `true` if `ch` is a Han ideograph, Hiragana, or Katakana character.
///
/// Unlike [`detect_script`], the CJK Symbols and Punctuation block
/// (U+3000..=U+303F) is not included here.
fn is_cjk_character(ch: char) -> bool {
    matches!(
        ch,
        '\u{3040}'..='\u{309F}' // Hiragana
            | '\u{30A0}'..='\u{30FF}' // Katakana
            | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Extension A
            | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
            | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
            | '\u{20000}'..='\u{2A6DF}' // Extension B
            | '\u{2A700}'..='\u{2B73F}' // Extension C
            | '\u{2B740}'..='\u{2B81F}' // Extension D
            | '\u{2B820}'..='\u{2CEAF}' // Extension E
            | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
    )
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a normalizer with every configuration field spelled out.
    fn build(
        form: NormForm,
        strip_accents: bool,
        lowercase: bool,
        tokenize_cjk: bool,
    ) -> UnicodeNormalizer {
        let config = UnicodeNormalizerConfig {
            form,
            strip_accents,
            lowercase,
            tokenize_cjk,
        };
        UnicodeNormalizer::new(config)
    }

    /// Fails, printing the full token list, if `expected` is not among `tokens`.
    fn assert_has_token(tokens: &[String], expected: &str) {
        assert!(tokens.iter().any(|tok| tok == expected), "got: {:?}", tokens);
    }

    // ---- detect_script ----

    #[test]
    fn test_detect_script_latin() {
        assert_eq!(Script::Latin, detect_script('a'));
        assert_eq!(Script::Latin, detect_script('Z'));
        // U+00E9, precomposed e-acute.
        assert_eq!(Script::Latin, detect_script('\u{00E9}'));
    }

    #[test]
    fn test_detect_script_cjk() {
        // U+4E2D, U+65E5, U+8A9E.
        assert_eq!(Script::Cjk, detect_script('\u{4E2D}'));
        assert_eq!(Script::Cjk, detect_script('\u{65E5}'));
        assert_eq!(Script::Cjk, detect_script('\u{8A9E}'));
    }

    #[test]
    fn test_detect_script_cyrillic() {
        // U+0410 (capital A) and U+044F (small ya).
        assert_eq!(Script::Cyrillic, detect_script('\u{0410}'));
        assert_eq!(Script::Cyrillic, detect_script('\u{044F}'));
    }

    #[test]
    fn test_detect_script_arabic() {
        // U+0639 (ain) and U+0645 (meem).
        assert_eq!(Script::Arabic, detect_script('\u{0639}'));
        assert_eq!(Script::Arabic, detect_script('\u{0645}'));
    }

    #[test]
    fn test_detect_script_devanagari() {
        // U+0915 (ka) and U+093E (vowel sign aa).
        assert_eq!(Script::Devanagari, detect_script('\u{0915}'));
        assert_eq!(Script::Devanagari, detect_script('\u{093E}'));
    }

    #[test]
    fn test_detect_script_hebrew() {
        // U+05D0 (alef) and U+05E9 (shin).
        assert_eq!(Script::Hebrew, detect_script('\u{05D0}'));
        assert_eq!(Script::Hebrew, detect_script('\u{05E9}'));
    }

    #[test]
    fn test_detect_script_other() {
        assert_eq!(Script::Other, detect_script('!'));
        assert_eq!(Script::Other, detect_script(' '));
        assert_eq!(Script::Other, detect_script('1'));
    }

    // ---- normalize ----

    #[test]
    fn test_normalize_lowercase() {
        let config = UnicodeNormalizerConfig {
            lowercase: true,
            ..UnicodeNormalizerConfig::default()
        };
        let normalizer = UnicodeNormalizer::new(config);
        assert_eq!(normalizer.normalize("Hello WORLD"), "hello world");
    }

    #[test]
    fn test_normalize_no_lowercase() {
        let config = UnicodeNormalizerConfig {
            lowercase: false,
            ..UnicodeNormalizerConfig::default()
        };
        let normalizer = UnicodeNormalizer::new(config);
        assert_eq!(normalizer.normalize("Hello WORLD"), "Hello WORLD");
    }

    #[test]
    fn test_normalize_strip_accents_nfc() {
        let normalizer = build(NormForm::Nfc, true, false, false);
        assert_eq!(normalizer.normalize("café"), "cafe");
    }

    #[test]
    fn test_normalize_strip_accents_nfd() {
        let normalizer = build(NormForm::Nfd, true, false, false);
        assert_eq!(normalizer.normalize("résumé"), "resume");
    }

    #[test]
    fn test_normalize_nfc_idempotent_on_ascii() {
        let normalizer = build(NormForm::Nfc, false, false, false);
        let input = "hello world 123";
        assert_eq!(normalizer.normalize(input), input);
    }

    // ---- tokenize_language_agnostic ----

    #[test]
    fn test_cjk_chars_split() {
        let normalizer = build(NormForm::Nfc, false, false, true);
        let tokens = normalizer.tokenize_language_agnostic("Hello世界");
        assert_has_token(&tokens, "Hello");
        assert_has_token(&tokens, "世");
        assert_has_token(&tokens, "界");
    }

    #[test]
    fn test_cjk_split_mixed_text() {
        let normalizer = UnicodeNormalizer::default();
        let tokens = normalizer.tokenize_language_agnostic("我 love Rust");
        assert_has_token(&tokens, "我");
        assert_has_token(&tokens, "love");
        assert_has_token(&tokens, "Rust");
    }

    #[test]
    fn test_tokenize_latin_only() {
        let normalizer = UnicodeNormalizer::default();
        let tokens = normalizer.tokenize_language_agnostic("the quick brown fox");
        assert_eq!(tokens, ["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn test_tokenize_empty() {
        let normalizer = UnicodeNormalizer::default();
        let tokens = normalizer.tokenize_language_agnostic(" ");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_tokenize_with_lowercase_and_accent_strip() {
        let normalizer = build(NormForm::Nfc, true, true, true);
        let tokens = normalizer.tokenize_language_agnostic("Héllo Wörld");
        assert_has_token(&tokens, "hello");
        assert_has_token(&tokens, "world");
    }

    // ---- private helpers ----

    #[test]
    fn test_combining_mark_detection() {
        for mark in ['\u{0301}', '\u{0300}', '\u{036F}'].iter().copied() {
            assert!(is_combining_diacritic(mark));
        }
        assert!(!is_combining_diacritic('a'));
        // U+00E9 is a precomposed letter, not a combining mark.
        assert!(!is_combining_diacritic('\u{00E9}'));
    }

    #[test]
    fn test_cjk_character_detection() {
        // U+4E2D, U+65E5 (Han), U+3042 (Hiragana a), U+30A2 (Katakana a).
        for cjk in ['\u{4E2D}', '\u{65E5}', '\u{3042}', '\u{30A2}'].iter().copied() {
            assert!(is_cjk_character(cjk));
        }
        for other in ['a', '1', ' '].iter().copied() {
            assert!(!is_cjk_character(other));
        }
    }
}