/// Coarse writing-system bucket assigned to a single character.
///
/// The variants are plain, field-less tags, so the type is cheap to copy and
/// can be used directly as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// Latin alphabet.
    Latin,
    /// Cyrillic alphabet.
    Cyrillic,
    /// Greek alphabet.
    Greek,
    /// Arabic script.
    Arabic,
    /// Hebrew script.
    Hebrew,
    /// CJK ideographs.
    Han,
    /// Japanese hiragana syllabary.
    Hiragana,
    /// Japanese katakana syllabary.
    Katakana,
    /// Korean Hangul.
    Hangul,
    /// Devanagari script.
    Devanagari,
    /// ASCII decimal digits.
    Digit,
    /// Catch-all for every character that fits none of the other variants.
    Other,
}
/// Classifies a single character into a coarse [`Script`] bucket.
///
/// The lookup is a table of Unicode block ranges, not a full Script-property
/// database, so the result is an approximation:
///
/// * Only the ASCII digits `0`-`9` map to [`Script::Digit`]; digits native to
///   another script are reported under the range they fall in (for example,
///   Arabic-Indic digits are [`Script::Arabic`]).
/// * The multiplication sign `U+00D7` and division sign `U+00F7` sit in the
///   middle of the Latin-1 letters but are symbols, so they are reported as
///   [`Script::Other`].
/// * Everything that matches no range (whitespace, punctuation, symbols,
///   scripts without a variant) is [`Script::Other`].
pub fn script_of(c: char) -> Script {
    match u32::from(c) {
        0x0030..=0x0039 => Script::Digit,

        // Basic Latin letters, Latin-1 letters (skipping U+00D7 and U+00F7),
        // Latin Extended-A/B, and Latin Extended Additional (Vietnamese etc.).
        0x0041..=0x005A
        | 0x0061..=0x007A
        | 0x00C0..=0x00D6
        | 0x00D8..=0x00F6
        | 0x00F8..=0x024F
        | 0x1E00..=0x1EFF => Script::Latin,

        // Greek and Coptic, Greek Extended (polytonic).
        0x0370..=0x03FF | 0x1F00..=0x1FFF => Script::Greek,

        // Cyrillic and Cyrillic Supplement.
        0x0400..=0x052F => Script::Cyrillic,

        // Hebrew, plus the Hebrew part of Alphabetic Presentation Forms.
        0x0590..=0x05FF | 0xFB1D..=0xFB4F => Script::Hebrew,

        // Arabic, Arabic Supplement, Arabic Extended-A, Presentation Forms-A,
        // and Presentation Forms-B (stopping before U+FEFF, the byte-order mark).
        0x0600..=0x06FF
        | 0x0750..=0x077F
        | 0x08A0..=0x08FF
        | 0xFB50..=0xFDFF
        | 0xFE70..=0xFEFC => Script::Arabic,

        // Devanagari and Devanagari Extended.
        0x0900..=0x097F | 0xA8E0..=0xA8FF => Script::Devanagari,

        0x3040..=0x309F => Script::Hiragana,

        // Katakana, Katakana Phonetic Extensions, halfwidth Katakana.
        0x30A0..=0x30FF | 0x31F0..=0x31FF | 0xFF66..=0xFF9F => Script::Katakana,

        // CJK Extension A, Unified Ideographs, Compatibility Ideographs, and
        // the supplementary-plane ideograph extensions.
        0x3400..=0x4DBF
        | 0x4E00..=0x9FFF
        | 0xF900..=0xFAFF
        | 0x20000..=0x2FA1F
        | 0x30000..=0x323AF => Script::Han,

        // Hangul Jamo, Compatibility Jamo, Jamo Extended-A, Syllables, and
        // Jamo Extended-B.
        0x1100..=0x11FF | 0x3130..=0x318F | 0xA960..=0xA97F | 0xAC00..=0xD7FF => Script::Hangul,

        _ => Script::Other,
    }
}
pub fn script_histogram(text: &str) -> std::collections::HashMap<Script, u32> {
let mut hist = std::collections::HashMap::new();
for c in text.chars() {
if c.is_whitespace() {
continue;
}
*hist.entry(script_of(c)).or_insert(0) += 1;
}
hist
}
/// Returns the script that accounts for the most characters in `text`.
///
/// Counts come from [`script_histogram`], so whitespace is ignored.
/// [`Script::Digit`] and [`Script::Other`] are never reported as dominant;
/// if the text contains nothing else (or is empty) the result is `None`.
///
/// When several scripts share the highest count, the one declared earliest
/// in [`Script`] wins, so the result is the same on every call.
pub fn dominant_script(text: &str) -> Option<Script> {
    script_histogram(text)
        .into_iter()
        .filter(|(script, _)| !matches!(script, Script::Digit | Script::Other))
        // `HashMap` iteration order is randomised, so comparing on the count
        // alone would let equal counts resolve differently from run to run.
        // The reversed discriminant makes every key unique and prefers the
        // earliest-declared variant.
        .max_by_key(|&(script, count)| (count, std::cmp::Reverse(script as u8)))
        .map(|(script, _)| script)
}