use unicode_normalization::UnicodeNormalization;
/// Broad writing-system categories recognized by [`detect_script`].
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// Basic Latin letters plus Latin-1 / Extended-A/B letters.
    Latin,
    /// Han ideographs, kana, and CJK symbols/punctuation.
    Cjk,
    Cyrillic,
    Arabic,
    Devanagari,
    Hebrew,
    /// Anything not matched by the ranges above (digits, punctuation, …).
    Other,
}

/// Classifies a single character into a broad script category by checking
/// fixed inclusive Unicode code-point ranges.
///
/// Checks run in order: CJK first (including kana and CJK punctuation),
/// then Cyrillic, Arabic, Devanagari, Hebrew, and Latin; everything else
/// falls through to [`Script::Other`].
pub fn detect_script(c: char) -> Script {
    // Inclusive (start, end) code-point ranges treated as CJK.
    const CJK_RANGES: [(u32, u32); 11] = [
        (0x4E00, 0x9FFF),   // CJK Unified Ideographs
        (0x3400, 0x4DBF),   // Extension A
        (0x20000, 0x2A6DF), // Extension B
        (0x2A700, 0x2B73F), // Extension C
        (0x2B740, 0x2B81F), // Extension D
        (0x2B820, 0x2CEAF), // Extension E
        (0xF900, 0xFAFF),   // Compatibility Ideographs
        (0x2F800, 0x2FA1F), // Compatibility Ideographs Supplement
        (0x3000, 0x303F),   // CJK Symbols and Punctuation
        (0x3040, 0x309F),   // Hiragana
        (0x30A0, 0x30FF),   // Katakana
    ];
    // Inclusive ranges treated as Latin; the two gaps skip the
    // multiplication (U+00D7) and division (U+00F7) signs.
    const LATIN_RANGES: [(u32, u32); 5] = [
        (0x0041, 0x005A), // A-Z
        (0x0061, 0x007A), // a-z
        (0x00C0, 0x00D6), // Latin-1 letters before ×
        (0x00D8, 0x00F6), // Latin-1 letters before ÷
        (0x00F8, 0x024F), // Latin Extended-A/B
    ];

    let cp = c as u32;
    let in_any = |ranges: &[(u32, u32)]| ranges.iter().any(|&(lo, hi)| (lo..=hi).contains(&cp));

    if in_any(&CJK_RANGES) {
        Script::Cjk
    } else if (0x0400..=0x04FF).contains(&cp) {
        Script::Cyrillic
    } else if (0x0600..=0x06FF).contains(&cp) {
        Script::Arabic
    } else if (0x0900..=0x097F).contains(&cp) {
        Script::Devanagari
    } else if (0x0590..=0x05FF).contains(&cp) {
        Script::Hebrew
    } else if in_any(&LATIN_RANGES) {
        Script::Latin
    } else {
        Script::Other
    }
}
/// Unicode normalization form to apply (Unicode Standard Annex #15).
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NormForm {
/// Canonical Decomposition followed by Canonical Composition.
Nfc,
/// Canonical Decomposition only.
Nfd,
}
/// Settings controlling how [`UnicodeNormalizer`] processes text.
#[derive(Debug, Clone)]
pub struct UnicodeNormalizerConfig {
/// Which Unicode normalization form to apply.
pub form: NormForm,
/// If true, combining diacritical marks are removed (accent stripping).
pub strip_accents: bool,
/// If true, input is lowercased before normalization.
pub lowercase: bool,
/// If true, tokenization emits each CJK character as its own token.
pub tokenize_cjk: bool,
}
impl Default for UnicodeNormalizerConfig {
    /// Defaults: NFC form, no accent stripping, no lowercasing,
    /// per-character CJK tokenization enabled.
    fn default() -> Self {
        Self {
            form: NormForm::Nfc,
            strip_accents: false,
            lowercase: false,
            tokenize_cjk: true,
        }
    }
}
/// Applies Unicode normalization, optional accent stripping and
/// lowercasing, and language-agnostic tokenization, as directed by its
/// [`UnicodeNormalizerConfig`].
#[derive(Debug, Clone)]
pub struct UnicodeNormalizer {
// Immutable after construction; exposed read-only via `config()`.
config: UnicodeNormalizerConfig,
}
impl UnicodeNormalizer {
    /// Creates a normalizer with the given configuration.
    pub fn new(config: UnicodeNormalizerConfig) -> Self {
        UnicodeNormalizer { config }
    }

    /// Convenience constructor using [`UnicodeNormalizerConfig::default`].
    pub fn default_normalizer() -> Self {
        UnicodeNormalizer::new(UnicodeNormalizerConfig::default())
    }

    /// Normalizes `text` according to the configuration: optional
    /// lowercasing, then NFC or NFD normalization, optionally dropping
    /// combining diacritics.
    ///
    /// Note: lowercasing runs *before* normalization, so it operates on
    /// the input as given rather than on its decomposed form.
    pub fn normalize(&self, text: &str) -> String {
        let s = if self.config.lowercase {
            text.to_lowercase()
        } else {
            text.to_owned()
        };
        match (self.config.form, self.config.strip_accents) {
            (NormForm::Nfd, false) => s.nfd().collect(),
            (NormForm::Nfd, true) => s.nfd().filter(|&c| !is_combining_diacritic(c)).collect(),
            (NormForm::Nfc, false) => s.nfc().collect(),
            // Decompose so accents become separate combining marks, drop
            // them, then recompose. The `UnicodeNormalization` trait is
            // implemented for char iterators, so this is a single pass
            // with no intermediate String allocation.
            (NormForm::Nfc, true) => s
                .nfd()
                .filter(|&c| !is_combining_diacritic(c))
                .nfc()
                .collect(),
        }
    }

    /// Normalizes `text` and splits it into tokens: whitespace-separated
    /// words, with each CJK character emitted as its own token when
    /// `tokenize_cjk` is enabled.
    pub fn tokenize_language_agnostic(&self, text: &str) -> Vec<String> {
        let normalized = self.normalize(text);
        // Pad every CJK char with spaces so the whitespace split below
        // isolates it; capacity hint covers the worst-case growth.
        let mut spaced = String::with_capacity(normalized.len() * 2);
        for ch in normalized.chars() {
            if self.config.tokenize_cjk && is_cjk_character(ch) {
                spaced.push(' ');
                spaced.push(ch);
                spaced.push(' ');
            } else {
                spaced.push(ch);
            }
        }
        // `split_whitespace` splits on Unicode whitespace and already
        // skips empty segments.
        spaced.split_whitespace().map(str::to_owned).collect()
    }

    /// Read-only access to the active configuration.
    pub fn config(&self) -> &UnicodeNormalizerConfig {
        &self.config
    }
}
impl Default for UnicodeNormalizer {
    /// Builds a normalizer with the default configuration.
    fn default() -> Self {
        Self {
            config: UnicodeNormalizerConfig::default(),
        }
    }
}
/// True for code points in the combining-mark blocks that accent
/// stripping removes: Combining Diacritical Marks (U+0300–U+036F),
/// Supplement (U+1DC0–U+1DFF), Extended (U+1AB0–U+1AFF), and Combining
/// Half Marks (U+FE20–U+FE2F).
///
/// NOTE(review): Combining Diacritical Marks for Symbols
/// (U+20D0–U+20FF) is not covered — presumably intentional; confirm.
fn is_combining_diacritic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0300..=0x036F | 0x1DC0..=0x1DFF | 0x1AB0..=0x1AFF | 0xFE20..=0xFE2F
    )
}
/// True for characters the tokenizer treats as standalone CJK tokens:
/// Han ideograph blocks (Unified + Extensions A–E, Compatibility),
/// Hiragana, and Katakana.
///
/// NOTE(review): unlike `detect_script`, CJK Symbols and Punctuation
/// (U+3000–U+303F) is excluded here, so punctuation such as '。' is not
/// space-padded during tokenization — presumably intentional; confirm.
fn is_cjk_character(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x4E00..=0x9FFF       // CJK Unified Ideographs
        | 0x3400..=0x4DBF     // Extension A
        | 0x20000..=0x2A6DF   // Extension B
        | 0x2A700..=0x2B73F   // Extension C
        | 0x2B740..=0x2B81F   // Extension D
        | 0x2B820..=0x2CEAF   // Extension E
        | 0xF900..=0xFAFF     // Compatibility Ideographs
        | 0x2F800..=0x2FA1F   // Compatibility Ideographs Supplement
        | 0x3040..=0x309F     // Hiragana
        | 0x30A0..=0x30FF     // Katakana
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_script_latin() {
        assert_eq!(detect_script('a'), Script::Latin);
        assert_eq!(detect_script('Z'), Script::Latin);
        assert_eq!(detect_script('é'), Script::Latin);
    }

    #[test]
    fn test_detect_script_cjk() {
        assert_eq!(detect_script('中'), Script::Cjk);
        assert_eq!(detect_script('日'), Script::Cjk);
        assert_eq!(detect_script('語'), Script::Cjk);
    }

    #[test]
    fn test_detect_script_cyrillic() {
        assert_eq!(detect_script('А'), Script::Cyrillic);
        assert_eq!(detect_script('я'), Script::Cyrillic);
    }

    #[test]
    fn test_detect_script_arabic() {
        assert_eq!(detect_script('ع'), Script::Arabic);
        assert_eq!(detect_script('م'), Script::Arabic);
    }

    #[test]
    fn test_detect_script_devanagari() {
        assert_eq!(detect_script('क'), Script::Devanagari);
        assert_eq!(detect_script('ा'), Script::Devanagari);
    }

    #[test]
    fn test_detect_script_hebrew() {
        assert_eq!(detect_script('א'), Script::Hebrew);
        assert_eq!(detect_script('ש'), Script::Hebrew);
    }

    #[test]
    fn test_detect_script_other() {
        assert_eq!(detect_script('!'), Script::Other);
        assert_eq!(detect_script(' '), Script::Other);
        assert_eq!(detect_script('1'), Script::Other);
    }

    #[test]
    fn test_normalize_lowercase() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            lowercase: true,
            ..Default::default()
        });
        assert_eq!(norm.normalize("Hello WORLD"), "hello world");
    }

    #[test]
    fn test_normalize_no_lowercase() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            lowercase: false,
            ..Default::default()
        });
        assert_eq!(norm.normalize("Hello WORLD"), "Hello WORLD");
    }

    #[test]
    fn test_normalize_strip_accents_nfc() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            form: NormForm::Nfc,
            strip_accents: true,
            lowercase: false,
            tokenize_cjk: false,
        });
        assert_eq!(norm.normalize("café"), "cafe");
    }

    #[test]
    fn test_normalize_strip_accents_nfd() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            form: NormForm::Nfd,
            strip_accents: true,
            lowercase: false,
            tokenize_cjk: false,
        });
        assert_eq!(norm.normalize("résumé"), "resume");
    }

    #[test]
    fn test_normalize_nfc_idempotent_on_ascii() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            form: NormForm::Nfc,
            strip_accents: false,
            lowercase: false,
            tokenize_cjk: false,
        });
        let text = "hello world 123";
        assert_eq!(norm.normalize(text), text);
    }

    #[test]
    fn test_cjk_chars_split() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            tokenize_cjk: true,
            lowercase: false,
            strip_accents: false,
            form: NormForm::Nfc,
        });
        let tokens = norm.tokenize_language_agnostic("Hello世界");
        assert!(tokens.contains(&"Hello".to_string()), "got: {:?}", tokens);
        assert!(tokens.contains(&"世".to_string()), "got: {:?}", tokens);
        assert!(tokens.contains(&"界".to_string()), "got: {:?}", tokens);
    }

    #[test]
    fn test_cjk_split_mixed_text() {
        let norm = UnicodeNormalizer::default();
        let tokens = norm.tokenize_language_agnostic("我 love Rust");
        assert!(tokens.iter().any(|t| t == "我"), "got: {:?}", tokens);
        assert!(tokens.iter().any(|t| t == "love"), "got: {:?}", tokens);
        assert!(tokens.iter().any(|t| t == "Rust"), "got: {:?}", tokens);
    }

    #[test]
    fn test_tokenize_latin_only() {
        let norm = UnicodeNormalizer::default();
        let tokens = norm.tokenize_language_agnostic("the quick brown fox");
        assert_eq!(tokens, vec!["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn test_tokenize_empty() {
        let norm = UnicodeNormalizer::default();
        assert!(norm.tokenize_language_agnostic(" ").is_empty());
    }

    #[test]
    fn test_tokenize_with_lowercase_and_accent_strip() {
        let norm = UnicodeNormalizer::new(UnicodeNormalizerConfig {
            form: NormForm::Nfc,
            strip_accents: true,
            lowercase: true,
            tokenize_cjk: true,
        });
        let tokens = norm.tokenize_language_agnostic("Héllo Wörld");
        assert!(tokens.iter().any(|t| t == "hello"), "got: {:?}", tokens);
        assert!(tokens.iter().any(|t| t == "world"), "got: {:?}", tokens);
    }

    #[test]
    fn test_combining_mark_detection() {
        assert!(is_combining_diacritic('\u{0301}'));
        assert!(is_combining_diacritic('\u{0300}'));
        assert!(is_combining_diacritic('\u{036F}'));
        assert!(!is_combining_diacritic('a'));
        assert!(!is_combining_diacritic('é'));
    }

    #[test]
    fn test_cjk_character_detection() {
        assert!(is_cjk_character('中'));
        assert!(is_cjk_character('日'));
        assert!(is_cjk_character('あ'));
        assert!(is_cjk_character('ア'));
        assert!(!is_cjk_character('a'));
        assert!(!is_cjk_character('1'));
        assert!(!is_cjk_character(' '));
    }
}