use std::fmt;
use crate::utils::is_stop_char;
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
Arabic,
Bengali,
Cyrillic,
Devanagari,
Ethiopic,
Georgian,
Greek,
Gujarati,
Gurmukhi,
Hangul,
Hebrew,
Hiragana,
Kannada,
Katakana,
Khmer,
Latin,
Malayalam,
Mandarin,
Myanmar,
Oriya,
Sinhala,
Tamil,
Telugu,
Thai,
}
impl Script {
pub fn name(&self) -> &str {
match *self {
Script::Latin => "Latin",
Script::Cyrillic => "Cyrillic",
Script::Arabic => "Arabic",
Script::Devanagari => "Devanagari",
Script::Hiragana => "Hiragana",
Script::Katakana => "Katakana",
Script::Ethiopic => "Ethiopic",
Script::Hebrew => "Hebrew",
Script::Bengali => "Bengali",
Script::Georgian => "Georgian",
Script::Mandarin => "Mandarin",
Script::Hangul => "Hangul",
Script::Greek => "Greek",
Script::Kannada => "Kannada",
Script::Tamil => "Tamil",
Script::Thai => "Thai",
Script::Gujarati => "Gujarati",
Script::Gurmukhi => "Gurmukhi",
Script::Telugu => "Telugu",
Script::Malayalam => "Malayalam",
Script::Oriya => "Oriya",
Script::Myanmar => "Myanmar",
Script::Sinhala => "Sinhala",
Script::Khmer => "Khmer",
}
}
}
impl fmt::Display for Script {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.name())
}
}
type ScriptCounter = (Script, fn(char) -> bool, usize);
pub fn detect_script(text: &str) -> Option<Script> {
let mut script_counters: [ScriptCounter; 24] = [
(Script::Latin, is_latin, 0),
(Script::Cyrillic, is_cyrillic, 0),
(Script::Arabic, is_arabic, 0),
(Script::Mandarin, is_mandarin, 0),
(Script::Devanagari, is_devanagari, 0),
(Script::Hebrew, is_hebrew, 0),
(Script::Ethiopic, is_ethiopic, 0),
(Script::Georgian, is_georgian, 0),
(Script::Bengali, is_bengali, 0),
(Script::Hangul, is_hangul, 0),
(Script::Hiragana, is_hiragana, 0),
(Script::Katakana, is_katakana, 0),
(Script::Greek, is_greek, 0),
(Script::Kannada, is_kannada, 0),
(Script::Tamil, is_tamil, 0),
(Script::Thai, is_thai, 0),
(Script::Gujarati, is_gujarati, 0),
(Script::Gurmukhi, is_gurmukhi, 0),
(Script::Telugu, is_telugu, 0),
(Script::Malayalam, is_malayalam, 0),
(Script::Oriya, is_oriya, 0),
(Script::Myanmar, is_myanmar, 0),
(Script::Sinhala, is_sinhala, 0),
(Script::Khmer, is_khmer, 0),
];
let half = text.chars().count() / 2;
for ch in text.chars() {
if is_stop_char(ch) {
continue;
}
for i in 0..script_counters.len() {
let found = {
let (script, check_fn, ref mut count) = script_counters[i];
if check_fn(ch) {
*count += 1;
if *count > half {
return Some(script);
}
true
} else {
false
}
};
if found {
if i > 0 {
script_counters.swap(i - 1, i);
}
break;
}
}
}
let (script, _, count) = script_counters
.iter()
.cloned()
.max_by_key(|&(_, _, count)| count)
.unwrap();
if count != 0 {
Some(script)
} else {
None
}
}
fn is_cyrillic(ch: char) -> bool {
matches!(ch,
'\u{0400}'..='\u{0484}'
| '\u{0487}'..='\u{052F}'
| '\u{2DE0}'..='\u{2DFF}'
| '\u{A640}'..='\u{A69D}'
| '\u{1D2B}'
| '\u{1D78}'
| '\u{A69F}'
)
}
fn is_latin(ch: char) -> bool {
matches!(ch,
'a'..='z'
| 'A'..='Z'
| '\u{0080}'..='\u{00FF}'
| '\u{0100}'..='\u{017F}'
| '\u{0180}'..='\u{024F}'
| '\u{0250}'..='\u{02AF}'
| '\u{1D00}'..='\u{1D7F}'
| '\u{1D80}'..='\u{1DBF}'
| '\u{1E00}'..='\u{1EFF}'
| '\u{2100}'..='\u{214F}'
| '\u{2C60}'..='\u{2C7F}'
| '\u{A720}'..='\u{A7FF}'
| '\u{AB30}'..='\u{AB6F}'
)
}
fn is_arabic(ch: char) -> bool {
matches!(ch,
'\u{0600}'..='\u{06FF}'
| '\u{0750}'..='\u{07FF}'
| '\u{08A0}'..='\u{08FF}'
| '\u{FB50}'..='\u{FDFF}'
| '\u{FE70}'..='\u{FEFF}'
| '\u{10E60}'..='\u{10E7F}'
| '\u{1EE00}'..='\u{1EEFF}'
)
}
fn is_devanagari(ch: char) -> bool {
matches!(ch, '\u{0900}'..='\u{097F}' | '\u{A8E0}'..='\u{A8FF}' | '\u{1CD0}'..='\u{1CFF}')
}
fn is_ethiopic(ch: char) -> bool {
matches!(ch, '\u{1200}'..='\u{139F}' | '\u{2D80}'..='\u{2DDF}' | '\u{AB00}'..='\u{AB2F}')
}
fn is_hebrew(ch: char) -> bool {
matches!(ch, '\u{0590}'..='\u{05FF}')
}
fn is_georgian(ch: char) -> bool {
matches!(ch, '\u{10A0}'..='\u{10FF}')
}
fn is_mandarin(ch: char) -> bool {
matches!(ch,
'\u{2E80}'..='\u{2E99}'
| '\u{2E9B}'..='\u{2EF3}'
| '\u{2F00}'..='\u{2FD5}'
| '\u{3005}'
| '\u{3007}'
| '\u{3021}'..='\u{3029}'
| '\u{3038}'..='\u{303B}'
| '\u{3400}'..='\u{4DB5}'
| '\u{4E00}'..='\u{9FCC}'
| '\u{F900}'..='\u{FA6D}'
| '\u{FA70}'..='\u{FAD9}'
)
}
fn is_bengali(ch: char) -> bool {
matches!(ch, '\u{0980}'..='\u{09FF}')
}
fn is_hiragana(ch: char) -> bool {
matches!(ch, '\u{3040}'..='\u{309F}')
}
fn is_katakana(ch: char) -> bool {
matches!(ch, '\u{30A0}'..='\u{30FF}')
}
fn is_hangul(ch: char) -> bool {
matches!(ch,
'\u{AC00}'..='\u{D7AF}'
| '\u{1100}'..='\u{11FF}'
| '\u{3130}'..='\u{318F}'
| '\u{3200}'..='\u{32FF}'
| '\u{A960}'..='\u{A97F}'
| '\u{D7B0}'..='\u{D7FF}'
| '\u{FF00}'..='\u{FFEF}'
)
}
fn is_greek(ch: char) -> bool {
matches!(ch, '\u{0370}'..='\u{03FF}')
}
fn is_kannada(ch: char) -> bool {
matches!(ch, '\u{0C80}'..='\u{0CFF}')
}
fn is_tamil(ch: char) -> bool {
matches!(ch, '\u{0B80}'..='\u{0BFF}')
}
fn is_thai(ch: char) -> bool {
matches!(ch, '\u{0E00}'..='\u{0E7F}')
}
fn is_gujarati(ch: char) -> bool {
matches!(ch, '\u{0A80}'..='\u{0AFF}')
}
fn is_gurmukhi(ch: char) -> bool {
matches!(ch, '\u{0A00}'..='\u{0A7F}')
}
fn is_telugu(ch: char) -> bool {
matches!(ch, '\u{0C00}'..='\u{0C7F}')
}
fn is_malayalam(ch: char) -> bool {
matches!(ch, '\u{0D00}'..='\u{0D7F}')
}
fn is_oriya(ch: char) -> bool {
matches!(ch, '\u{0B00}'..='\u{0B7F}')
}
fn is_myanmar(ch: char) -> bool {
matches!(ch, '\u{1000}'..='\u{109F}')
}
fn is_sinhala(ch: char) -> bool {
matches!(ch, '\u{0D80}'..='\u{0DFF}')
}
fn is_khmer(ch: char) -> bool {
matches!(ch, '\u{1780}'..='\u{17FF}' | '\u{19E0}'..='\u{19FF}')
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_script_name() {
assert_eq!(Script::Cyrillic.name(), "Cyrillic");
assert_eq!(Script::Katakana.name(), "Katakana");
}
#[test]
fn test_detect_script() {
assert_eq!(detect_script("1234567890-,;!"), None);
assert_eq!(detect_script("Hello!"), Some(Script::Latin));
assert_eq!(detect_script("Привет всем!"), Some(Script::Cyrillic));
assert_eq!(
detect_script("ქართული ენა მსოფლიო "),
Some(Script::Georgian)
);
assert_eq!(
detect_script("県見夜上温国阪題富販"),
Some(Script::Mandarin)
);
assert_eq!(
detect_script(" ككل حوالي 1.6، ومعظم الناس "),
Some(Script::Arabic)
);
assert_eq!(
detect_script("हिमालयी वन चिड़िया (जूथेरा सालिमअली) चिड़िया की एक प्रजाति है"),
Some(Script::Devanagari)
);
assert_eq!(
detect_script("היסטוריה והתפתחות של האלפבית העברי"),
Some(Script::Hebrew)
);
assert_eq!(
detect_script("የኢትዮጵያ ፌዴራላዊ ዴሞክራሲያዊሪፐብሊክ"),
Some(Script::Ethiopic)
);
assert_eq!(
detect_script("Привет! Текст на русском with some English."),
Some(Script::Cyrillic)
);
assert_eq!(
detect_script("Russian word любовь means love."),
Some(Script::Latin)
);
}
#[test]
fn test_is_latin() {
assert_eq!(is_latin('z'), true);
assert_eq!(is_latin('A'), true);
assert_eq!(is_latin('č'), true);
assert_eq!(is_latin('š'), true);
assert_eq!(is_latin('Ĵ'), true);
assert_eq!(is_latin('ж'), false);
}
#[test]
fn test_is_cyrillic() {
assert_eq!(is_cyrillic('а'), true);
assert_eq!(is_cyrillic('Я'), true);
assert_eq!(is_cyrillic('Ґ'), true);
assert_eq!(is_cyrillic('ї'), true);
assert_eq!(is_cyrillic('Ꙕ'), true);
assert_eq!(is_cyrillic('L'), false);
}
#[test]
fn test_is_ethiopic() {
assert_eq!(is_ethiopic('ፚ'), true);
assert_eq!(is_ethiopic('ᎀ'), true);
assert_eq!(is_ethiopic('а'), false);
assert_eq!(is_ethiopic('L'), false);
}
#[test]
fn test_is_georgian() {
assert_eq!(is_georgian('რ'), true);
assert_eq!(is_georgian('ж'), false);
}
#[test]
fn test_is_bengali() {
assert_eq!(is_bengali('ই'), true);
assert_eq!(is_bengali('z'), false);
}
#[test]
fn test_is_katakana() {
assert_eq!(is_katakana('カ'), true);
assert_eq!(is_katakana('f'), false);
}
#[test]
fn test_is_hiragana() {
assert_eq!(is_hiragana('ひ'), true);
assert_eq!(is_hiragana('a'), false);
}
#[test]
fn test_is_hangul() {
assert_eq!(is_hangul('ᄁ'), true);
assert_eq!(is_hangul('t'), false);
}
#[test]
fn test_is_greek() {
assert_eq!(is_greek('φ'), true);
assert_eq!(is_greek('ф'), false);
}
#[test]
fn test_is_kannada() {
assert_eq!(is_kannada('ಡ'), true);
assert_eq!(is_kannada('S'), false);
}
#[test]
fn test_is_tamil() {
assert_eq!(is_tamil('ஐ'), true);
assert_eq!(is_tamil('Ж'), false);
}
#[test]
fn test_is_thai() {
assert_eq!(is_thai('ก'), true);
assert_eq!(is_thai('๛'), true);
assert_eq!(is_thai('Ж'), false);
}
#[test]
fn test_is_gujarati() {
assert_eq!(is_gujarati('ઁ'), true);
assert_eq!(is_gujarati('૱'), true);
assert_eq!(is_gujarati('Ж'), false);
}
#[test]
fn test_is_gurmukhi() {
assert_eq!(is_gurmukhi('ਁ'), true);
assert_eq!(is_gurmukhi('ੴ'), true);
assert_eq!(is_gurmukhi('Ж'), false);
}
#[test]
fn test_is_telugu() {
assert_eq!(is_telugu('ఁ'), true);
assert_eq!(is_telugu('౿'), true);
assert_eq!(is_telugu('Ж'), false);
}
#[test]
fn test_is_oriya() {
assert_eq!(is_oriya('ଐ'), true);
assert_eq!(is_oriya('୷'), true);
assert_eq!(is_oriya('౿'), false);
}
}