pub(crate) fn detect_scripts(text: &str) -> Vec<&'static str> {
let mut scripts: Vec<&'static str> = Vec::new();
let mut seen = std::collections::HashSet::new();
for ch in text.chars() {
let script = detect_char_script(ch);
if script != "Common" && script != "Inherited" && seen.insert(script) {
scripts.push(script);
}
}
scripts
}
pub(crate) fn is_mixed_script(text: &str) -> bool {
let mut first_script: Option<&'static str> = None;
for ch in text.chars() {
let script = detect_char_script(ch);
if script == "Common" || script == "Inherited" {
continue;
}
match first_script {
None => first_script = Some(script),
Some(s) if s != script => return true,
_ => {}
}
}
false
}
/// Inclusive code point ranges `(start, end, script)` used to classify characters.
///
/// The entries must stay sorted by `start` and must not overlap: the lookup in this
/// file binary-searches on `start` and then checks `end`.
///
/// Most entries follow Unicode block boundaries rather than exact script assignments,
/// so unassigned or Common code points inside a listed block may still be reported as
/// that block's script.
///
/// NOTE(review): `0xFE70..=0xFEFF` includes U+FEFF (byte order mark), which Unicode
/// classifies as Common. An existing test in this file pins it as "Arabic", so it is
/// left unchanged here — confirm whether that is intended.
static SCRIPT_RANGES: &[(u32, u32, &str)] = &[
    (0x0041, 0x005A, "Latin"),
    (0x0061, 0x007A, "Latin"),
    // Latin-1 letters, split around U+00D7 (multiplication sign) and U+00F7 (division
    // sign): both are Common and must not make text look Latin or mixed-script.
    (0x00C0, 0x00D6, "Latin"),
    (0x00D8, 0x00F6, "Latin"),
    (0x00F8, 0x024F, "Latin"),
    (0x0250, 0x02AF, "Latin"),
    (0x0300, 0x036F, "Inherited"),
    (0x0370, 0x03FF, "Greek"),
    (0x0400, 0x04FF, "Cyrillic"),
    (0x0500, 0x052F, "Cyrillic"),
    (0x0530, 0x058F, "Armenian"),
    (0x0590, 0x05FF, "Hebrew"),
    (0x0600, 0x06FF, "Arabic"),
    (0x0700, 0x074F, "Syriac"),
    (0x0750, 0x077F, "Arabic"),
    (0x0780, 0x07BF, "Thaana"),
    (0x07C0, 0x07FF, "NKo"),
    (0x0860, 0x086F, "Syriac"),
    (0x08A0, 0x08FF, "Arabic"),
    (0x0900, 0x097F, "Devanagari"),
    (0x0980, 0x09FF, "Bengali"),
    (0x0A00, 0x0A7F, "Gurmukhi"),
    (0x0A80, 0x0AFF, "Gujarati"),
    (0x0B00, 0x0B7F, "Oriya"),
    (0x0B80, 0x0BFF, "Tamil"),
    (0x0C00, 0x0C7F, "Telugu"),
    (0x0C80, 0x0CFF, "Kannada"),
    (0x0D00, 0x0D7F, "Malayalam"),
    (0x0D80, 0x0DFF, "Sinhala"),
    (0x0E00, 0x0E7F, "Thai"),
    (0x0E80, 0x0EFF, "Lao"),
    (0x0F00, 0x0FFF, "Tibetan"),
    (0x1000, 0x109F, "Myanmar"),
    (0x10A0, 0x10FF, "Georgian"),
    (0x1100, 0x11FF, "Hangul"),
    (0x1200, 0x137F, "Ethiopic"),
    (0x1380, 0x139F, "Ethiopic"),
    (0x13A0, 0x13FF, "Cherokee"),
    (0x1400, 0x167F, "CanadianAboriginal"),
    (0x1680, 0x169F, "Ogham"),
    (0x16A0, 0x16FF, "Runic"),
    (0x1700, 0x171F, "Tagalog"),
    (0x1720, 0x173F, "Hanunoo"),
    (0x1740, 0x175F, "Buhid"),
    (0x1760, 0x177F, "Tagbanwa"),
    (0x1780, 0x17FF, "Khmer"),
    (0x1800, 0x18AF, "Mongolian"),
    (0x18B0, 0x18FF, "CanadianAboriginal"),
    (0x1950, 0x197F, "TaiLe"),
    (0x1980, 0x19DF, "NewTaiLue"),
    (0x19E0, 0x19FF, "Khmer"),
    (0x1A00, 0x1A1F, "Buginese"),
    (0x1A20, 0x1AAF, "TaiTham"),
    (0x1AB0, 0x1AFF, "Inherited"),
    (0x1B00, 0x1B7F, "Balinese"),
    (0x1B80, 0x1BBF, "Sundanese"),
    (0x1BC0, 0x1BFF, "Batak"),
    (0x1C50, 0x1C7F, "OlChiki"),
    (0x1C90, 0x1CBF, "Georgian"),
    (0x1D00, 0x1D7F, "Latin"),
    (0x1D80, 0x1DBF, "Latin"),
    (0x1DC0, 0x1DFF, "Inherited"),
    (0x1E00, 0x1EFF, "Latin"),
    (0x1F00, 0x1FFF, "Greek"),
    (0x20D0, 0x20FF, "Inherited"),
    (0x2C60, 0x2C7F, "Latin"),
    (0x2C80, 0x2CFF, "Coptic"),
    (0x2D00, 0x2D2F, "Georgian"),
    (0x2D30, 0x2D7F, "Tifinagh"),
    (0x2D80, 0x2DDF, "Ethiopic"),
    (0x2DE0, 0x2DFF, "Cyrillic"),
    (0x2E80, 0x2EFF, "Han"),
    (0x2F00, 0x2FDF, "Han"),
    (0x3040, 0x309F, "Hiragana"),
    (0x30A0, 0x30FF, "Katakana"),
    (0x3130, 0x318F, "Hangul"),
    (0x31F0, 0x31FF, "Katakana"),
    (0x3400, 0x4DBF, "Han"),
    (0x4E00, 0x9FFF, "Han"),
    (0xA4D0, 0xA4FF, "Lisu"),
    (0xA500, 0xA63F, "Vai"),
    (0xA640, 0xA69F, "Cyrillic"),
    (0xA6A0, 0xA6FF, "Bamum"),
    (0xA720, 0xA7FF, "Latin"),
    (0xA8E0, 0xA8FF, "Devanagari"),
    (0xA960, 0xA97F, "Hangul"),
    (0xA980, 0xA9DF, "Javanese"),
    (0xAA00, 0xAA5F, "Cham"),
    (0xAA60, 0xAA7F, "Myanmar"),
    (0xAAE0, 0xAAFF, "MeeteiMayek"),
    (0xAB00, 0xAB2F, "Ethiopic"),
    (0xAB30, 0xAB6F, "Latin"),
    (0xAB70, 0xABBF, "Cherokee"),
    (0xABC0, 0xABFF, "MeeteiMayek"),
    (0xAC00, 0xD7AF, "Hangul"),
    (0xD7B0, 0xD7FF, "Hangul"),
    (0xF900, 0xFAFF, "Han"),
    (0xFB00, 0xFB06, "Latin"),
    (0xFB13, 0xFB17, "Armenian"),
    (0xFB1D, 0xFB4F, "Hebrew"),
    (0xFB50, 0xFDFF, "Arabic"),
    (0xFE20, 0xFE2F, "Inherited"),
    (0xFE70, 0xFEFF, "Arabic"),
    (0xFF65, 0xFF9F, "Katakana"),
    (0x10000, 0x1007F, "LinearB"),
    (0x10080, 0x100FF, "LinearB"),
    (0x10330, 0x1034F, "Gothic"),
    (0x103A0, 0x103DF, "OldPersian"),
    (0x12000, 0x123FF, "Cuneiform"),
    (0x12400, 0x1247F, "Cuneiform"),
    (0x20000, 0x2A6DF, "Han"),
    (0x2A700, 0x2B73F, "Han"),
    (0x2B740, 0x2B81F, "Han"),
    (0x2B820, 0x2CEAF, "Han"),
    (0x2CEB0, 0x2EBEF, "Han"),
    (0x30000, 0x3134F, "Han"),
];
/// Classifies a single character into a script name.
///
/// ASCII letters are `"Latin"` and every other ASCII character is `"Common"`; both are
/// decided without consulting the table. Non-ASCII characters are looked up in
/// `SCRIPT_RANGES`, and anything not covered by a range is reported as `"Common"`.
fn detect_char_script(ch: char) -> &'static str {
    if ch.is_ascii() {
        return if ch.is_ascii_alphabetic() {
            "Latin"
        } else {
            "Common"
        };
    }
    let code = u32::from(ch);
    // Count of ranges that start at or before `code`; relies on the table being sorted
    // by start. The last of those is the only range that can contain `code`.
    let preceding = SCRIPT_RANGES.partition_point(|&(start, _, _)| start <= code);
    match preceding.checked_sub(1).map(|idx| SCRIPT_RANGES[idx]) {
        Some((_, end, script)) if code <= end => script,
        _ => "Common",
    }
}
/// Maps a script name to the language code used as that script's default.
///
/// Returns `None` for script names without an entry. Matching is exact and
/// case-sensitive.
fn script_to_lang(script: &str) -> Option<&'static str> {
    // (script name, default language code)
    const DEFAULT_LANGS: &[(&str, &str)] = &[
        ("Thai", "th"),
        ("Lao", "lo"),
        ("Myanmar", "my"),
        ("Khmer", "km"),
        ("Georgian", "ka"),
        ("Armenian", "hy"),
        ("Tibetan", "bo"),
        ("Ethiopic", "am"),
        ("Bengali", "bn"),
        ("Tamil", "ta"),
        ("Telugu", "te"),
        ("Kannada", "kn"),
        ("Malayalam", "ml"),
        ("Gujarati", "gu"),
        ("Gurmukhi", "pa"),
        ("Oriya", "or"),
        ("Sinhala", "si"),
        ("Hangul", "ko"),
        ("Hebrew", "he"),
        ("Arabic", "ar"),
        ("Thaana", "dv"),
        ("Javanese", "jv"),
        ("Mongolian", "mn"),
        ("Devanagari", "hi"),
        ("Cyrillic", "ru"),
        ("Han", "zh"),
        ("Hiragana", "ja"),
        ("Katakana", "ja"),
        ("Greek", "el"),
        ("Balinese", "ban"),
        ("Bamum", "bax"),
        ("Buginese", "bug"),
        ("Cherokee", "chr"),
        ("Cham", "cjm"),
        ("Coptic", "cop"),
        ("Lisu", "lis"),
        ("MeeteiMayek", "mni"),
        ("NKo", "nqo"),
        ("NewTaiLue", "khb"),
        ("OlChiki", "sat"),
        ("Sundanese", "su"),
        ("Syriac", "syr"),
        ("TaiLe", "tdd"),
        ("TaiTham", "nod"),
        ("Tagalog", "tl"),
        ("Tifinagh", "tzm"),
        ("Vai", "vai"),
    ];
    DEFAULT_LANGS
        .iter()
        .find(|&&(name, _)| name == script)
        .map(|&(_, lang)| lang)
}
/// Returns `true` for scripts that get a per-character discriminator pass before
/// falling back to the script's default language: `"Cyrillic"` and `"Arabic"`.
fn is_ambiguous_script(script: &str) -> bool {
    ["Cyrillic", "Arabic"].contains(&script)
}
/// Returns the language code implied by `ch` when it is one of the discriminating
/// characters listed for `script`, or `None` otherwise.
///
/// Only `"Cyrillic"` (yielding `"uk"`, `"sr"` or `"mn"`) and `"Arabic"` (yielding
/// `"fa"`) have discriminators; every other script name returns `None`.
fn lookup_discriminator(ch: char, script: &str) -> Option<&'static str> {
    let lang = match (script, ch) {
        (
            "Cyrillic",
            '\u{0404}' | '\u{0406}' | '\u{0407}' | '\u{0490}' | '\u{0454}' | '\u{0456}'
            | '\u{0457}' | '\u{0491}',
        ) => "uk",
        (
            "Cyrillic",
            '\u{0402}' | '\u{0408}' | '\u{0409}' | '\u{040A}' | '\u{040B}' | '\u{040F}'
            | '\u{0452}' | '\u{0458}' | '\u{0459}' | '\u{045A}' | '\u{045B}' | '\u{045F}',
        ) => "sr",
        ("Cyrillic", '\u{04AE}' | '\u{04AF}' | '\u{04E8}' | '\u{04E9}') => "mn",
        ("Arabic", '\u{067E}' | '\u{0686}' | '\u{0698}' | '\u{06AF}') => "fa",
        _ => return None,
    };
    Some(lang)
}
/// Returns the language code implied by a discriminating Latin-script character:
/// `"vi"`, `"tr"` or `"de"`. Any other character returns `None`.
fn lookup_latin_discriminator(ch: char) -> Option<&'static str> {
    if matches!(ch, '\u{01A0}' | '\u{01A1}' | '\u{01AF}' | '\u{01B0}') {
        Some("vi")
    } else if matches!(ch, '\u{0130}' | '\u{0131}') {
        Some("tr")
    } else if matches!(ch, '\u{00DF}' | '\u{1E9E}') {
        Some("de")
    } else {
        None
    }
}
/// Like `discriminate_by_chars_detailed`, but returns only the language code and
/// drops the character that triggered the match.
fn discriminate_by_chars(text: &str, script: &str) -> Option<&'static str> {
    let (lang, _evidence) = discriminate_by_chars_detailed(text, script)?;
    Some(lang)
}
/// Scans `text` for the first character that discriminates a language within `script`.
///
/// At most `crate::limits::SCAN_LIMIT` characters are examined. For `"Latin"` the Latin
/// discriminator table is used; any other script name goes through
/// `lookup_discriminator`. Returns the language code together with the character that
/// produced it, or `None` when no examined character matches.
fn discriminate_by_chars_detailed(text: &str, script: &str) -> Option<(&'static str, char)> {
    let use_latin_table = script == "Latin";
    text.chars()
        .take(crate::limits::SCAN_LIMIT)
        .find_map(|candidate| {
            let lang = if use_latin_table {
                lookup_latin_discriminator(candidate)
            } else {
                lookup_discriminator(candidate, script)
            };
            lang.map(|code| (code, candidate))
        })
}
pub(crate) fn resolve_auto_lang(text: &str) -> Option<String> {
let mut primary_script: Option<&str> = None;
for ch in text.chars() {
let script = detect_char_script(ch);
if script != "Common" && script != "Inherited" && script != "Latin" {
primary_script = Some(script);
break;
}
}
match primary_script {
Some(script) if is_ambiguous_script(script) => {
let lang = discriminate_by_chars(text, script).or_else(|| script_to_lang(script));
lang.map(str::to_owned)
}
Some(script) => {
script_to_lang(script).map(str::to_owned)
}
None => {
if text.is_ascii() {
None
} else {
discriminate_by_chars(text, "Latin").map(str::to_owned)
}
}
}
}
pub(crate) fn inspect_auto_lang(
text: &str,
) -> (
Option<&'static str>,
Option<String>,
&'static str,
Vec<String>,
) {
let mut primary_script: Option<&'static str> = None;
for ch in text.chars() {
let script = detect_char_script(ch);
if script != "Common" && script != "Inherited" && script != "Latin" {
primary_script = Some(script);
break;
}
}
match primary_script {
Some(script) if is_ambiguous_script(script) => {
match discriminate_by_chars_detailed(text, script) {
Some((lang, ch)) => (
Some(script),
Some(lang.to_owned()),
"discriminator",
vec![ch.to_string()],
),
None => (
Some(script),
script_to_lang(script).map(str::to_owned),
"script_default",
vec![],
),
}
}
Some(script) => (
Some(script),
script_to_lang(script).map(str::to_owned),
"unambiguous_script",
vec![],
),
None => {
if text.is_ascii() {
(None, None, "no_detection", vec![])
} else {
match discriminate_by_chars_detailed(text, "Latin") {
Some((lang, ch)) => (
None,
Some(lang.to_owned()),
"latin_discriminator",
vec![ch.to_string()],
),
None => (None, None, "no_detection", vec![]),
}
}
}
}
}
/// Unit tests for script detection and automatic language resolution.
#[cfg(test)]
mod tests {
    use super::*;

    /// The range lookup relies on `SCRIPT_RANGES` being sorted and non-overlapping.
    #[test]
    fn test_script_ranges_sorted() {
        for i in 1..SCRIPT_RANGES.len() {
            let (prev_start, prev_end, prev_script) = SCRIPT_RANGES[i - 1];
            let (curr_start, _, _) = SCRIPT_RANGES[i];
            assert!(
                curr_start > prev_end,
                "SCRIPT_RANGES not sorted or overlapping at index {i}: \
                 ({prev_start:#X}..{prev_end:#X}, {prev_script:?}) vs ({curr_start:#X}..)"
            );
        }
    }

    #[test]
    fn test_detect_latin() {
        let scripts = detect_scripts("hello");
        assert_eq!(scripts, vec!["Latin"]);
    }

    #[test]
    fn test_mixed_script() {
        assert!(is_mixed_script("hello мир"));
    }

    #[test]
    fn test_single_script() {
        assert!(!is_mixed_script("hello world"));
    }

    #[test]
    fn test_detect_bengali() {
        let scripts = detect_scripts("বাংলা");
        assert_eq!(scripts, vec!["Bengali"]);
    }

    #[test]
    fn test_detect_tamil() {
        let scripts = detect_scripts("தமிழ்");
        assert_eq!(scripts, vec!["Tamil"]);
    }

    #[test]
    fn test_detect_telugu() {
        let scripts = detect_scripts("తెలుగు");
        assert_eq!(scripts, vec!["Telugu"]);
    }

    #[test]
    fn test_detect_kannada() {
        let scripts = detect_scripts("ಕನ್ನಡ");
        assert_eq!(scripts, vec!["Kannada"]);
    }

    #[test]
    fn test_detect_malayalam() {
        let scripts = detect_scripts("മലയാളം");
        assert_eq!(scripts, vec!["Malayalam"]);
    }

    #[test]
    fn test_detect_gujarati() {
        let scripts = detect_scripts("ગુજરાતી");
        assert_eq!(scripts, vec!["Gujarati"]);
    }

    #[test]
    fn test_detect_gurmukhi() {
        let scripts = detect_scripts("ਗੁਰਮੁਖੀ");
        assert_eq!(scripts, vec!["Gurmukhi"]);
    }

    #[test]
    fn test_detect_thai() {
        let scripts = detect_scripts("ภาษาไทย");
        assert_eq!(scripts, vec!["Thai"]);
    }

    #[test]
    fn test_detect_lao() {
        let scripts = detect_scripts("ພາສາລາວ");
        assert_eq!(scripts, vec!["Lao"]);
    }

    #[test]
    fn test_detect_myanmar() {
        let scripts = detect_scripts("မြန်မာ");
        assert_eq!(scripts, vec!["Myanmar"]);
    }

    #[test]
    fn test_detect_tibetan() {
        let scripts = detect_scripts("བོད་སྐད");
        assert_eq!(scripts, vec!["Tibetan"]);
    }

    #[test]
    fn test_detect_sinhala() {
        let scripts = detect_scripts("සිංහල");
        assert_eq!(scripts, vec!["Sinhala"]);
    }

    #[test]
    fn test_detect_khmer() {
        let scripts = detect_scripts("ភាសាខ្មែរ");
        assert_eq!(scripts, vec!["Khmer"]);
    }

    #[test]
    fn test_detect_georgian() {
        let scripts = detect_scripts("ქართული");
        assert_eq!(scripts, vec!["Georgian"]);
    }

    #[test]
    fn test_detect_armenian() {
        let scripts = detect_scripts("Հայերեն");
        assert_eq!(scripts, vec!["Armenian"]);
    }

    #[test]
    fn test_detect_ethiopic() {
        let scripts = detect_scripts("አማርኛ");
        assert_eq!(scripts, vec!["Ethiopic"]);
    }

    #[test]
    fn test_detect_hangul() {
        let scripts = detect_scripts("한국어");
        assert_eq!(scripts, vec!["Hangul"]);
    }

    #[test]
    fn test_detect_han() {
        let scripts = detect_scripts("中文");
        assert_eq!(scripts, vec!["Han"]);
    }

    #[test]
    fn test_detect_arabic() {
        let scripts = detect_scripts("العربية");
        assert_eq!(scripts, vec!["Arabic"]);
    }

    #[test]
    fn test_detect_hebrew() {
        let scripts = detect_scripts("עברית");
        assert_eq!(scripts, vec!["Hebrew"]);
    }

    #[test]
    fn test_detect_oriya() {
        let scripts = detect_scripts("ଓଡ଼ିଆ");
        assert_eq!(scripts, vec!["Oriya"]);
    }

    #[test]
    fn test_detect_coptic() {
        let scripts = detect_scripts("Ⲙⲉⲧⲣⲉⲙⲛⲕⲏⲙⲉ");
        assert_eq!(scripts, vec!["Coptic"]);
    }

    #[test]
    fn test_inherited_combining_marks() {
        let scripts = detect_scripts("\u{0301}");
        assert!(scripts.is_empty());
    }

    #[test]
    fn test_detect_syriac() {
        assert_eq!(detect_char_script('\u{0710}'), "Syriac");
        assert_eq!(detect_char_script('\u{074F}'), "Syriac");
    }

    #[test]
    fn test_detect_thaana() {
        assert_eq!(detect_char_script('\u{0780}'), "Thaana");
        assert_eq!(detect_char_script('\u{07BF}'), "Thaana");
    }

    #[test]
    fn test_detect_nko() {
        assert_eq!(detect_char_script('\u{07C1}'), "NKo");
        assert_eq!(detect_char_script('\u{07FF}'), "NKo");
    }

    #[test]
    fn test_detect_mongolian() {
        assert_eq!(detect_char_script('\u{1820}'), "Mongolian");
        assert_eq!(detect_char_script('\u{18AF}'), "Mongolian");
    }

    #[test]
    fn test_detect_cherokee() {
        assert_eq!(detect_char_script('\u{13A0}'), "Cherokee");
        assert_eq!(detect_char_script('\u{13FF}'), "Cherokee");
    }

    #[test]
    fn test_detect_canadian_aboriginal() {
        assert_eq!(detect_char_script('\u{1401}'), "CanadianAboriginal");
        assert_eq!(detect_char_script('\u{167F}'), "CanadianAboriginal");
    }

    #[test]
    fn test_detect_ogham() {
        assert_eq!(detect_char_script('\u{1681}'), "Ogham");
        assert_eq!(detect_char_script('\u{169F}'), "Ogham");
    }

    #[test]
    fn test_detect_runic() {
        assert_eq!(detect_char_script('\u{16A0}'), "Runic");
        assert_eq!(detect_char_script('\u{16FF}'), "Runic");
    }

    #[test]
    fn test_detect_tai_le() {
        assert_eq!(detect_char_script('\u{1950}'), "TaiLe");
        assert_eq!(detect_char_script('\u{197F}'), "TaiLe");
    }

    #[test]
    fn test_detect_new_tai_lue() {
        assert_eq!(detect_char_script('\u{1980}'), "NewTaiLue");
        assert_eq!(detect_char_script('\u{19DF}'), "NewTaiLue");
    }

    #[test]
    fn test_detect_balinese() {
        assert_eq!(detect_char_script('\u{1B05}'), "Balinese");
        assert_eq!(detect_char_script('\u{1B7F}'), "Balinese");
    }

    #[test]
    fn test_detect_javanese() {
        assert_eq!(detect_char_script('\u{A984}'), "Javanese");
        assert_eq!(detect_char_script('\u{A9DF}'), "Javanese");
    }

    #[test]
    fn test_detect_vai() {
        assert_eq!(detect_char_script('\u{A500}'), "Vai");
        assert_eq!(detect_char_script('\u{A63F}'), "Vai");
    }

    #[test]
    fn test_latin_block_boundaries() {
        assert_eq!(detect_char_script('A'), "Latin");
        assert_eq!(detect_char_script('Z'), "Latin");
        assert_eq!(detect_char_script('a'), "Latin");
        assert_eq!(detect_char_script('z'), "Latin");
        assert_eq!(detect_char_script('\u{00C0}'), "Latin");
        assert_eq!(detect_char_script('\u{024F}'), "Latin");
        assert_eq!(detect_char_script('\u{0250}'), "Latin");
        assert_eq!(detect_char_script('\u{02AF}'), "Latin");
        assert_eq!(detect_char_script('\u{1E00}'), "Latin");
        assert_eq!(detect_char_script('\u{1EFF}'), "Latin");
    }

    #[test]
    fn test_greek_block_boundaries() {
        assert_eq!(detect_char_script('\u{0370}'), "Greek");
        assert_eq!(detect_char_script('\u{03FF}'), "Greek");
        assert_eq!(detect_char_script('\u{1F00}'), "Greek");
        assert_eq!(detect_char_script('\u{1FFF}'), "Greek");
    }

    #[test]
    fn test_cyrillic_block_boundaries() {
        assert_eq!(detect_char_script('\u{0400}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{04FF}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{0500}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{052F}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{2DE0}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{2DFF}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{A640}'), "Cyrillic");
        assert_eq!(detect_char_script('\u{A69F}'), "Cyrillic");
    }

    #[test]
    fn test_arabic_block_boundaries() {
        assert_eq!(detect_char_script('\u{0600}'), "Arabic");
        assert_eq!(detect_char_script('\u{06FF}'), "Arabic");
        assert_eq!(detect_char_script('\u{0750}'), "Arabic");
        assert_eq!(detect_char_script('\u{077F}'), "Arabic");
        assert_eq!(detect_char_script('\u{08A0}'), "Arabic");
        assert_eq!(detect_char_script('\u{08FF}'), "Arabic");
        assert_eq!(detect_char_script('\u{FB50}'), "Arabic");
        assert_eq!(detect_char_script('\u{FE70}'), "Arabic");
        assert_eq!(detect_char_script('\u{FEFF}'), "Arabic");
    }

    #[test]
    fn test_han_supplementary_planes() {
        assert_eq!(detect_char_script('\u{4E00}'), "Han");
        assert_eq!(detect_char_script('\u{9FFF}'), "Han");
        assert_eq!(detect_char_script('\u{3400}'), "Han");
        assert_eq!(detect_char_script('\u{4DBF}'), "Han");
        assert_eq!(detect_char_script('\u{20000}'), "Han");
        assert_eq!(detect_char_script('\u{2A6DF}'), "Han");
        assert_eq!(detect_char_script('\u{2A700}'), "Han");
        assert_eq!(detect_char_script('\u{30000}'), "Han");
    }

    #[test]
    fn test_hangul_block_boundaries() {
        assert_eq!(detect_char_script('\u{1100}'), "Hangul");
        assert_eq!(detect_char_script('\u{11FF}'), "Hangul");
        assert_eq!(detect_char_script('\u{3130}'), "Hangul");
        assert_eq!(detect_char_script('\u{318F}'), "Hangul");
        assert_eq!(detect_char_script('\u{AC00}'), "Hangul");
        assert_eq!(detect_char_script('\u{D7AF}'), "Hangul");
    }

    #[test]
    fn test_common_detection() {
        assert_eq!(detect_char_script('0'), "Common");
        assert_eq!(detect_char_script(' '), "Common");
        assert_eq!(detect_char_script('!'), "Common");
    }

    #[test]
    fn test_inherited_combining_diacriticals() {
        assert_eq!(detect_char_script('\u{0300}'), "Inherited");
        assert_eq!(detect_char_script('\u{036F}'), "Inherited");
    }

    #[test]
    fn test_inherited_combining_extended() {
        assert_eq!(detect_char_script('\u{1AB0}'), "Inherited");
        assert_eq!(detect_char_script('\u{1AFF}'), "Inherited");
    }

    #[test]
    fn test_inherited_combining_supplement() {
        assert_eq!(detect_char_script('\u{1DC0}'), "Inherited");
        assert_eq!(detect_char_script('\u{1DFF}'), "Inherited");
    }

    #[test]
    fn test_inherited_combining_symbols() {
        assert_eq!(detect_char_script('\u{20D0}'), "Inherited");
        assert_eq!(detect_char_script('\u{20FF}'), "Inherited");
    }

    #[test]
    fn test_inherited_combining_half_marks() {
        assert_eq!(detect_char_script('\u{FE20}'), "Inherited");
        assert_eq!(detect_char_script('\u{FE2F}'), "Inherited");
    }

    #[test]
    fn test_script_order_preserved() {
        let scripts = detect_scripts("hello Москва");
        assert_eq!(scripts, vec!["Latin", "Cyrillic"]);
    }

    #[test]
    fn test_three_scripts_detected() {
        let scripts = detect_scripts("abc мир 日本");
        assert_eq!(scripts.len(), 3);
        assert_eq!(scripts[0], "Latin");
        assert_eq!(scripts[1], "Cyrillic");
        assert_eq!(scripts[2], "Han");
    }

    #[test]
    fn test_empty_string_no_scripts() {
        let scripts = detect_scripts("");
        assert!(scripts.is_empty());
    }

    #[test]
    fn test_digits_only_no_scripts() {
        let scripts = detect_scripts("12345");
        assert!(scripts.is_empty());
    }

    #[test]
    fn test_syriac_supplement() {
        assert_eq!(detect_char_script('\u{0860}'), "Syriac");
        assert_eq!(detect_char_script('\u{086F}'), "Syriac");
    }

    #[test]
    fn test_latin_ligatures_in_alphabetic_pf() {
        assert_eq!(detect_char_script('\u{FB00}'), "Latin");
        assert_eq!(detect_char_script('\u{FB01}'), "Latin");
        assert_eq!(detect_char_script('\u{FB02}'), "Latin");
        assert_eq!(detect_char_script('\u{FB03}'), "Latin");
        assert_eq!(detect_char_script('\u{FB04}'), "Latin");
        assert_eq!(detect_char_script('\u{FB05}'), "Latin");
        assert_eq!(detect_char_script('\u{FB06}'), "Latin");
    }

    #[test]
    fn test_armenian_ligatures_in_alphabetic_pf() {
        assert_eq!(detect_char_script('\u{FB13}'), "Armenian");
        assert_eq!(detect_char_script('\u{FB14}'), "Armenian");
        assert_eq!(detect_char_script('\u{FB15}'), "Armenian");
        assert_eq!(detect_char_script('\u{FB16}'), "Armenian");
        assert_eq!(detect_char_script('\u{FB17}'), "Armenian");
    }

    #[test]
    fn test_latin_ligature_fi_detected_as_latin_in_text() {
        // Written as an escape so the input really is the U+FB01 ligature rather than
        // the two ASCII letters "f" and "i", and survives editors that normalize text.
        let scripts = detect_scripts("\u{FB01}");
        assert_eq!(scripts, vec!["Latin"]);
    }

    #[test]
    fn test_armenian_ligature_detected_in_text() {
        // U+FB13 ARMENIAN SMALL LIGATURE MEN NOW.
        let scripts = detect_scripts("\u{FB13}");
        assert_eq!(scripts, vec!["Armenian"]);
    }

    #[test]
    fn test_mixed_latin_and_armenian_ligatures() {
        // U+FB01 (Latin "fi" ligature) followed by U+FB13 (Armenian ligature).
        let scripts = detect_scripts("\u{FB01}\u{FB13}");
        assert_eq!(scripts, vec!["Latin", "Armenian"]);
    }

    #[test]
    fn test_devanagari_extended_range() {
        assert_eq!(detect_char_script('\u{A8E0}'), "Devanagari");
        assert_eq!(detect_char_script('\u{A8FF}'), "Devanagari");
    }

    #[test]
    fn test_ethiopic_extended() {
        assert_eq!(detect_char_script('\u{2D80}'), "Ethiopic");
        assert_eq!(detect_char_script('\u{2DDF}'), "Ethiopic");
    }

    #[test]
    fn test_ethiopic_extended_a() {
        assert_eq!(detect_char_script('\u{AB00}'), "Ethiopic");
        assert_eq!(detect_char_script('\u{AB2F}'), "Ethiopic");
    }

    #[test]
    fn test_cherokee_supplement_range() {
        assert_eq!(detect_char_script('\u{AB70}'), "Cherokee");
        assert_eq!(detect_char_script('\u{ABBF}'), "Cherokee");
    }

    #[test]
    fn test_canadian_aboriginal_extended() {
        assert_eq!(detect_char_script('\u{18B0}'), "CanadianAboriginal");
        assert_eq!(detect_char_script('\u{18FF}'), "CanadianAboriginal");
    }

    #[test]
    fn test_georgian_extended() {
        assert_eq!(detect_char_script('\u{1C90}'), "Georgian");
        assert_eq!(detect_char_script('\u{1CBF}'), "Georgian");
    }

    #[test]
    fn test_myanmar_extended_a_range() {
        assert_eq!(detect_char_script('\u{AA60}'), "Myanmar");
        assert_eq!(detect_char_script('\u{AA7F}'), "Myanmar");
    }

    #[test]
    fn test_khmer_symbols_range() {
        assert_eq!(detect_char_script('\u{19E0}'), "Khmer");
        assert_eq!(detect_char_script('\u{19FF}'), "Khmer");
    }

    #[test]
    fn test_resolve_auto_lang_thai() {
        assert_eq!(resolve_auto_lang("ภาษาไทย"), Some("th".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_new_scripts() {
        let cases: &[(char, &str)] = &[
            ('\u{0710}', "syr"),
            ('\u{07CA}', "nqo"),
            ('\u{13A0}', "chr"),
            ('\u{1700}', "tl"),
            ('\u{1950}', "tdd"),
            ('\u{1980}', "khb"),
            ('\u{1A00}', "bug"),
            ('\u{1A20}', "nod"),
            ('\u{1B05}', "ban"),
            ('\u{1B83}', "su"),
            ('\u{1C5A}', "sat"),
            ('\u{2C80}', "cop"),
            ('\u{2D30}', "tzm"),
            ('\u{A4D0}', "lis"),
            ('\u{A500}', "vai"),
            ('\u{A6A0}', "bax"),
            ('\u{AA00}', "cjm"),
            ('\u{AAE0}', "mni"),
        ];
        for &(ch, lang) in cases {
            assert_eq!(
                resolve_auto_lang(&ch.to_string()),
                Some(lang.to_owned()),
                "U+{:04X} should resolve to {lang}",
                ch as u32
            );
        }
    }

    #[test]
    fn test_resolve_auto_lang_latin_only() {
        assert_eq!(resolve_auto_lang("hello"), None);
    }

    #[test]
    fn test_resolve_auto_lang_empty() {
        assert_eq!(resolve_auto_lang(""), None);
    }

    #[test]
    fn test_resolve_auto_lang_accented_latin() {
        assert_eq!(resolve_auto_lang("café"), None);
    }

    #[test]
    fn test_resolve_auto_lang_mixed_latin_cyrillic() {
        assert_eq!(resolve_auto_lang("Hello Москва"), Some("ru".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_hiragana() {
        assert_eq!(resolve_auto_lang("こんにちは"), Some("ja".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_han() {
        assert_eq!(resolve_auto_lang("中文"), Some("zh".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_hangul() {
        assert_eq!(resolve_auto_lang("한국어"), Some("ko".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_arabic() {
        assert_eq!(resolve_auto_lang("العربية"), Some("ar".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_hebrew() {
        assert_eq!(resolve_auto_lang("עברית"), Some("he".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_georgian() {
        assert_eq!(resolve_auto_lang("ქართული"), Some("ka".to_owned()));
    }

    #[test]
    fn test_resolve_auto_lang_armenian() {
        assert_eq!(resolve_auto_lang("Հայերեն"), Some("hy".to_owned()));
    }

    /// Runic is detected as a script but has no default language.
    #[test]
    fn test_resolve_auto_lang_unmapped_script() {
        assert_eq!(resolve_auto_lang("\u{16A0}"), None);
    }

    #[test]
    fn test_discriminate_ukrainian_by_exclusive_chars() {
        assert_eq!(
            resolve_auto_lang("Київ — столиця України"),
            Some("uk".to_owned())
        );
    }

    #[test]
    fn test_discriminate_serbian_by_exclusive_chars() {
        assert_eq!(resolve_auto_lang("Ђорђе и Ћирилица"), Some("sr".to_owned()));
    }

    #[test]
    fn test_discriminate_persian_by_exclusive_chars() {
        assert_eq!(resolve_auto_lang("پارسی زبان"), Some("fa".to_owned()));
    }

    #[test]
    fn test_discriminate_vietnamese_by_exclusive_chars() {
        assert_eq!(
            resolve_auto_lang("Việt Nam có nhiều người"),
            Some("vi".to_owned())
        );
    }

    #[test]
    fn test_discriminate_turkish_by_exclusive_chars() {
        assert_eq!(
            resolve_auto_lang("İstanbul güzel bir şehır"),
            Some("tr".to_owned())
        );
    }

    #[test]
    fn test_discriminate_german_by_exclusive_chars() {
        assert_eq!(
            resolve_auto_lang("Straße nach Süden"),
            Some("de".to_owned())
        );
    }

    /// The first discriminating character in the text decides the language.
    #[test]
    fn test_discriminate_first_hit_wins() {
        assert_eq!(resolve_auto_lang("їћ"), Some("uk".to_owned()));
    }

    #[test]
    fn test_discriminate_cyrillic_no_exclusive_chars() {
        assert_eq!(resolve_auto_lang("Москва"), Some("ru".to_owned()));
    }

    #[test]
    fn test_discriminate_arabic_no_exclusive_chars() {
        assert_eq!(resolve_auto_lang("العربية"), Some("ar".to_owned()));
    }

    #[test]
    fn test_discriminate_latin_no_exclusive_chars() {
        assert_eq!(resolve_auto_lang("café"), None);
    }

    #[test]
    fn test_discriminate_latin_ascii_only() {
        assert_eq!(resolve_auto_lang("hello"), None);
    }
}