use serde::{Deserialize, Serialize};
/// Coarse script classification of a piece of text.
///
/// Values are produced by `detect_language_class`, and each one maps to a
/// characters-per-token ratio through `LanguageClass::chars_per_token`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LanguageClass {
/// Latin script. Also the fallback that `detect_language_class` returns for
/// text containing no letters at all.
Latin,
/// Chinese, Japanese, or Korean script (Han ideographs, kana, Hangul).
Cjk,
/// Cyrillic script.
Cyrillic,
/// Arabic script.
Arabic,
/// No single recognised script accounts for more than half of the letters.
Mixed,
}
impl LanguageClass {
#[must_use]
pub const fn chars_per_token(self) -> f64 {
match self {
Self::Latin => 4.0,
Self::Cjk => 1.5,
Self::Cyrillic => 2.5,
Self::Arabic => 2.0,
Self::Mixed => 3.0,
}
}
}
/// Classifies `text` by the script that the majority of its letters belong to.
///
/// Only characters for which [`char::is_alphabetic`] holds are counted; digits,
/// punctuation, and whitespace are ignored. A script wins when it accounts for
/// strictly more than half of the counted letters. Letters that belong to none
/// of the four recognised scripts still count toward the total, so they push
/// the result toward [`LanguageClass::Mixed`].
///
/// # Returns
///
/// * [`LanguageClass::Latin`] when `text` contains no letters (including the
///   empty string).
/// * The winning script class when one holds a strict majority.
/// * [`LanguageClass::Mixed`] otherwise.
#[must_use]
pub fn detect_language_class(text: &str) -> LanguageClass {
    // Candidate classes, in the same order as the slots of `tallies`.
    let classes = [
        LanguageClass::Latin,
        LanguageClass::Cjk,
        LanguageClass::Cyrillic,
        LanguageClass::Arabic,
    ];
    let mut tallies = [0_usize; 4];
    let mut letters = 0_usize;

    for ch in text.chars().filter(|c| c.is_alphabetic()) {
        letters += 1;
        let slot = match ch {
            c if is_latin(c) => 0,
            c if is_cjk(c) => 1,
            c if is_cyrillic(c) => 2,
            c if is_arabic(c) => 3,
            // A letter from an unrecognised script: counted, but not tallied.
            _ => continue,
        };
        tallies[slot] += 1;
    }

    // Covers the empty string as well as digit- or punctuation-only input.
    if letters == 0 {
        return LanguageClass::Latin;
    }

    // `count > letters / 2` (integer division) is a strict majority for both
    // odd and even totals, so at most one script can satisfy it.
    let half = letters / 2;
    tallies
        .iter()
        .zip(classes.iter())
        .find(|(count, _)| **count > half)
        .map_or(LanguageClass::Mixed, |(_, class)| *class)
}
/// Reports whether `ch` falls in one of the Latin-script ranges used for
/// classification.
///
/// Covered ranges: ASCII letters, U+00C0–U+024F (Latin-1 Supplement letters,
/// Latin Extended-A and Extended-B), and U+1E00–U+1EFF (Latin Extended
/// Additional).
///
/// The U+00C0–U+024F range also contains the non-letter signs U+00D7 and
/// U+00F7; `detect_language_class` only passes alphabetic characters in, so
/// they never reach this check from that caller.
fn is_latin(ch: char) -> bool {
    matches!(
        ch,
        'A'..='Z'
            | 'a'..='z'
            | '\u{00C0}'..='\u{024F}'
            | '\u{1E00}'..='\u{1EFF}'
    )
}
/// Reports whether `ch` belongs to a Chinese, Japanese, or Korean script range.
///
/// Besides the Basic Multilingual Plane blocks, this also accepts CJK
/// Compatibility Ideographs and the supplementary ideographic planes, so that
/// Han characters outside the BMP (for example U+20BB7, used in Japanese names)
/// are not left unclassified.
///
/// The check is purely range-based: unassigned code points inside the covered
/// ranges also return `true`. `detect_language_class` filters with
/// `char::is_alphabetic` before calling this.
fn is_cjk(ch: char) -> bool {
    matches!(
        ch,
        // Hiragana and Katakana.
        '\u{3040}'..='\u{30FF}'
            // CJK Unified Ideographs Extension A.
            | '\u{3400}'..='\u{4DBF}'
            // CJK Unified Ideographs.
            | '\u{4E00}'..='\u{9FFF}'
            // Hangul Syllables.
            | '\u{AC00}'..='\u{D7AF}'
            // CJK Compatibility Ideographs.
            | '\u{F900}'..='\u{FAFF}'
            // Supplementary and Tertiary Ideographic Planes (Extension B and later).
            | '\u{20000}'..='\u{3FFFF}'
    )
}
/// Reports whether `ch` lies in the Cyrillic (U+0400–U+04FF) or Cyrillic
/// Supplement (U+0500–U+052F) block.
fn is_cyrillic(ch: char) -> bool {
    // The two blocks are adjacent, so a single numeric span covers both.
    const FIRST: u32 = 0x0400;
    const LAST: u32 = 0x052F;

    let code_point = u32::from(ch);
    (FIRST..=LAST).contains(&code_point)
}
/// Reports whether `ch` belongs to an Arabic-script range.
///
/// Covers the base Arabic block, Arabic Supplement, Arabic Extended-A/B, and
/// both presentation-form blocks. Presentation Forms-B is cut off at U+FEFC so
/// that U+FEFF (the byte-order mark, which shares the block) is not treated as
/// Arabic.
///
/// The check is purely range-based; `detect_language_class` filters with
/// `char::is_alphabetic` before calling this.
fn is_arabic(ch: char) -> bool {
    matches!(
        ch,
        // Arabic.
        '\u{0600}'..='\u{06FF}'
            // Arabic Supplement.
            | '\u{0750}'..='\u{077F}'
            // Arabic Extended-B and Extended-A.
            | '\u{0870}'..='\u{08FF}'
            // Arabic Presentation Forms-A.
            | '\u{FB50}'..='\u{FDFF}'
            // Arabic Presentation Forms-B, excluding the BOM at U+FEFF.
            | '\u{FE70}'..='\u{FEFC}'
    )
}
/// Unit tests for script detection and the per-class token ratios.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Script detection: single-script inputs ---

    #[test]
    fn english_text_is_latin() {
        assert_eq!(detect_language_class("Hello, world!"), LanguageClass::Latin);
    }

    #[test]
    fn german_text_is_latin() {
        assert_eq!(
            detect_language_class("Über den Wolken muss die Freiheit wohl grenzenlos sein"),
            LanguageClass::Latin
        );
    }

    #[test]
    fn german_umlauts_are_latin() {
        assert_eq!(detect_language_class("ÄÖÜäöüß"), LanguageClass::Latin);
    }

    #[test]
    fn chinese_text_is_cjk() {
        assert_eq!(detect_language_class("你好世界"), LanguageClass::Cjk);
    }

    #[test]
    fn japanese_hiragana_is_cjk() {
        assert_eq!(detect_language_class("こんにちは"), LanguageClass::Cjk);
    }

    #[test]
    fn korean_text_is_cjk() {
        assert_eq!(detect_language_class("안녕하세요"), LanguageClass::Cjk);
    }

    #[test]
    fn russian_text_is_cyrillic() {
        assert_eq!(detect_language_class("Привет мир"), LanguageClass::Cyrillic);
    }

    #[test]
    fn arabic_text_is_arabic() {
        assert_eq!(
            detect_language_class("مرحبا بالعالم"),
            LanguageClass::Arabic
        );
    }

    // --- Script detection: no strict majority ---

    #[test]
    fn mixed_script_is_mixed() {
        assert_eq!(
            detect_language_class("Hello 你好 Привет"),
            LanguageClass::Mixed
        );
    }

    #[test]
    fn even_split_is_mixed() {
        // Two Latin and two CJK letters: neither exceeds half of the total.
        assert_eq!(detect_language_class("ab你好"), LanguageClass::Mixed);
    }

    #[test]
    fn unrecognised_script_is_mixed() {
        // Greek letters are alphabetic but match none of the four script checks.
        assert_eq!(detect_language_class("Γειά σου"), LanguageClass::Mixed);
    }

    // --- Script detection: inputs without letters ---

    #[test]
    fn empty_string_defaults_to_latin() {
        assert_eq!(detect_language_class(""), LanguageClass::Latin);
    }

    #[test]
    fn numbers_only_defaults_to_latin() {
        assert_eq!(detect_language_class("12345"), LanguageClass::Latin);
    }

    #[test]
    fn punctuation_only_defaults_to_latin() {
        assert_eq!(detect_language_class("?! ... --"), LanguageClass::Latin);
    }

    // --- Token ratios: one test per variant ---

    #[test]
    fn latin_chars_per_token() {
        assert!((LanguageClass::Latin.chars_per_token() - 4.0).abs() < f64::EPSILON);
    }

    #[test]
    fn cjk_chars_per_token() {
        assert!((LanguageClass::Cjk.chars_per_token() - 1.5).abs() < f64::EPSILON);
    }

    #[test]
    fn cyrillic_chars_per_token() {
        assert!((LanguageClass::Cyrillic.chars_per_token() - 2.5).abs() < f64::EPSILON);
    }

    #[test]
    fn arabic_chars_per_token() {
        assert!((LanguageClass::Arabic.chars_per_token() - 2.0).abs() < f64::EPSILON);
    }

    #[test]
    fn mixed_chars_per_token() {
        assert!((LanguageClass::Mixed.chars_per_token() - 3.0).abs() < f64::EPSILON);
    }
}