#[cfg(feature = "text_layout_hyphenation")]
use hyphenation::Language as HyphenationLanguage;
#[cfg(feature = "text_layout_hyphenation")]
pub use hyphenation::Language;
/// Languages that the detection functions in this module can report.
///
/// This local definition is compiled only when the `text_layout_hyphenation`
/// feature is disabled; with the feature enabled, `Language` is re-exported
/// from the `hyphenation` crate instead.
// NOTE(review): presumably the variant names mirror `hyphenation::Language` so
// that both build configurations expose the same identifiers — confirm.
#[cfg(not(feature = "text_layout_hyphenation"))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
// NOTE(review): the reason for silencing `dead_code` is not evident from this file.
#[allow(dead_code)]
pub enum Language {
EnglishUS,
French,
German1996,
Spanish,
Portuguese,
Estonian,
Hungarian,
Polish,
Czech,
Slovak,
Latvian,
Lithuanian,
Romanian,
Turkish,
Croatian,
Icelandic,
Welsh,
NorwegianBokmal,
Swedish,
Russian,
Ukrainian,
Belarusian,
Bulgarian,
Macedonian,
SerbianCyrillic,
Mongolian,
SlavonicChurch,
GreekMono,
GreekPoly,
Coptic,
Hindi,
Bengali,
Assamese,
Marathi,
Sanskrit,
Gujarati,
Panjabi,
Kannada,
Malayalam,
Oriya,
Tamil,
Telugu,
Georgian,
Ethiopic,
Thai,
Chinese,
}
use rust_fontconfig::UnicodeRange;
/// Writing systems distinguished by `detect_script` and `detect_char_script`.
///
/// Every variant has a matching `is_*` predicate in this file and a list of
/// code-point ranges returned by `Script::get_unicode_ranges`.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub enum Script {
Arabic,
Bengali,
Cyrillic,
Devanagari,
Ethiopic,
Georgian,
Greek,
Gujarati,
Gurmukhi,
Hangul,
Hebrew,
Hiragana,
Kannada,
Katakana,
Khmer,
Latin,
Malayalam,
Mandarin,
Myanmar,
Oriya,
Sinhala,
Tamil,
Telugu,
Thai,
}
impl Script {
pub fn get_unicode_ranges(&self) -> Vec<UnicodeRange> {
match self {
Script::Arabic => vec![
UnicodeRange {
start: 0x0600,
end: 0x06FF,
},
UnicodeRange {
start: 0x0750,
end: 0x07FF,
},
UnicodeRange {
start: 0x08A0,
end: 0x08FF,
},
UnicodeRange {
start: 0xFB50,
end: 0xFDFF,
},
UnicodeRange {
start: 0xFE70,
end: 0xFEFF,
},
UnicodeRange {
start: 0x10E60,
end: 0x10E7F,
},
UnicodeRange {
start: 0x1EE00,
end: 0x1EEFF,
},
],
Script::Bengali => vec![UnicodeRange {
start: 0x0980,
end: 0x09FF,
}],
Script::Cyrillic => vec![
UnicodeRange {
start: 0x0400,
end: 0x0484,
},
UnicodeRange {
start: 0x0487,
end: 0x052F,
},
UnicodeRange {
start: 0x2DE0,
end: 0x2DFF,
},
UnicodeRange {
start: 0xA640,
end: 0xA69D,
},
UnicodeRange {
start: 0x1D2B,
end: 0x1D2B,
},
UnicodeRange {
start: 0x1D78,
end: 0x1D78,
},
UnicodeRange {
start: 0xA69F,
end: 0xA69F,
},
],
Script::Devanagari => vec![
UnicodeRange {
start: 0x0900,
end: 0x097F,
},
UnicodeRange {
start: 0xA8E0,
end: 0xA8FF,
},
UnicodeRange {
start: 0x1CD0,
end: 0x1CFF,
},
],
Script::Ethiopic => vec![
UnicodeRange {
start: 0x1200,
end: 0x139F,
},
UnicodeRange {
start: 0x2D80,
end: 0x2DDF,
},
UnicodeRange {
start: 0xAB00,
end: 0xAB2F,
},
],
Script::Georgian => vec![UnicodeRange {
start: 0x10A0,
end: 0x10FF,
}],
Script::Greek => vec![UnicodeRange {
start: 0x0370,
end: 0x03FF,
}],
Script::Gujarati => vec![UnicodeRange {
start: 0x0A80,
end: 0x0AFF,
}],
Script::Gurmukhi => vec![UnicodeRange {
start: 0x0A00,
end: 0x0A7F,
}],
Script::Hangul => vec![
UnicodeRange {
start: 0xAC00,
end: 0xD7AF,
},
UnicodeRange {
start: 0x1100,
end: 0x11FF,
},
UnicodeRange {
start: 0x3130,
end: 0x318F,
},
UnicodeRange {
start: 0x3200,
end: 0x32FF,
},
UnicodeRange {
start: 0xA960,
end: 0xA97F,
},
UnicodeRange {
start: 0xD7B0,
end: 0xD7FF,
},
UnicodeRange {
start: 0xFF00,
end: 0xFFEF,
},
],
Script::Hebrew => vec![UnicodeRange {
start: 0x0590,
end: 0x05FF,
}],
Script::Hiragana => vec![UnicodeRange {
start: 0x3040,
end: 0x309F,
}],
Script::Kannada => vec![UnicodeRange {
start: 0x0C80,
end: 0x0CFF,
}],
Script::Katakana => vec![UnicodeRange {
start: 0x30A0,
end: 0x30FF,
}],
Script::Khmer => vec![
UnicodeRange {
start: 0x1780,
end: 0x17FF,
},
UnicodeRange {
start: 0x19E0,
end: 0x19FF,
},
],
Script::Latin => vec![
UnicodeRange {
start: 0x0041,
end: 0x005A,
}, UnicodeRange {
start: 0x0061,
end: 0x007A,
}, UnicodeRange {
start: 0x0080,
end: 0x00FF,
},
UnicodeRange {
start: 0x0100,
end: 0x017F,
},
UnicodeRange {
start: 0x0180,
end: 0x024F,
},
UnicodeRange {
start: 0x0250,
end: 0x02AF,
},
UnicodeRange {
start: 0x1D00,
end: 0x1D7F,
},
UnicodeRange {
start: 0x1D80,
end: 0x1DBF,
},
UnicodeRange {
start: 0x1E00,
end: 0x1EFF,
},
UnicodeRange {
start: 0x2100,
end: 0x214F,
},
UnicodeRange {
start: 0x2C60,
end: 0x2C7F,
},
UnicodeRange {
start: 0xA720,
end: 0xA7FF,
},
UnicodeRange {
start: 0xAB30,
end: 0xAB6F,
},
],
Script::Malayalam => vec![UnicodeRange {
start: 0x0D00,
end: 0x0D7F,
}],
Script::Mandarin => vec![
UnicodeRange {
start: 0x2E80,
end: 0x2E99,
},
UnicodeRange {
start: 0x2E9B,
end: 0x2EF3,
},
UnicodeRange {
start: 0x2F00,
end: 0x2FD5,
},
UnicodeRange {
start: 0x3005,
end: 0x3005,
},
UnicodeRange {
start: 0x3007,
end: 0x3007,
},
UnicodeRange {
start: 0x3021,
end: 0x3029,
},
UnicodeRange {
start: 0x3038,
end: 0x303B,
},
UnicodeRange {
start: 0x3400,
end: 0x4DB5,
},
UnicodeRange {
start: 0x4E00,
end: 0x9FCC,
},
UnicodeRange {
start: 0xF900,
end: 0xFA6D,
},
UnicodeRange {
start: 0xFA70,
end: 0xFAD9,
},
],
Script::Myanmar => vec![UnicodeRange {
start: 0x1000,
end: 0x109F,
}],
Script::Oriya => vec![UnicodeRange {
start: 0x0B00,
end: 0x0B7F,
}],
Script::Sinhala => vec![UnicodeRange {
start: 0x0D80,
end: 0x0DFF,
}],
Script::Tamil => vec![UnicodeRange {
start: 0x0B80,
end: 0x0BFF,
}],
Script::Telugu => vec![UnicodeRange {
start: 0x0C00,
end: 0x0C7F,
}],
Script::Thai => vec![UnicodeRange {
start: 0x0E00,
end: 0x0E7F,
}],
}
}
}
/// Returns `true` for the ASCII characters that `detect_script` skips.
///
/// Three spans qualify: `U+0000..=U+0040` (everything up to and including
/// `@`), `U+005B..=U+0060` (`[` through the backtick) and `U+007B..=U+007E`
/// (`{` through `~`). ASCII letters, `U+007F` and all non-ASCII characters
/// yield `false`.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    let up_to_at_sign = ch <= '\u{0040}';
    let between_letter_cases = ('\u{005B}'..='\u{0060}').contains(&ch);
    let after_lowercase = ('\u{007B}'..='\u{007E}').contains(&ch);

    up_to_at_sign || between_letter_cases || after_lowercase
}
/// One row of the script-detection table: the script, the predicate that
/// decides whether a character belongs to it, and a running match count.
type ScriptCounter = (Script, fn(char) -> bool, usize);
/// Determines the dominant script of `text`.
///
/// Every character that is not a stop character (see `is_stop_char`) is
/// credited to the first script in the table whose predicate accepts it.
/// As soon as one script's tally exceeds half of the total character count
/// of `text` (stop characters included in that total), it is returned
/// immediately. Otherwise the script with the highest tally wins.
///
/// Returns `None` when no character matched any script, which includes the
/// empty string.
pub fn detect_script(text: &str) -> Option<Script> {
    let mut tallies: [ScriptCounter; 24] = [
        (Script::Latin, is_latin, 0),
        (Script::Cyrillic, is_cyrillic, 0),
        (Script::Arabic, is_arabic, 0),
        (Script::Mandarin, is_mandarin, 0),
        (Script::Devanagari, is_devanagari, 0),
        (Script::Hebrew, is_hebrew, 0),
        (Script::Ethiopic, is_ethiopic, 0),
        (Script::Georgian, is_georgian, 0),
        (Script::Bengali, is_bengali, 0),
        (Script::Hangul, is_hangul, 0),
        (Script::Hiragana, is_hiragana, 0),
        (Script::Katakana, is_katakana, 0),
        (Script::Greek, is_greek, 0),
        (Script::Kannada, is_kannada, 0),
        (Script::Tamil, is_tamil, 0),
        (Script::Thai, is_thai, 0),
        (Script::Gujarati, is_gujarati, 0),
        (Script::Gurmukhi, is_gurmukhi, 0),
        (Script::Telugu, is_telugu, 0),
        (Script::Malayalam, is_malayalam, 0),
        (Script::Oriya, is_oriya, 0),
        (Script::Myanmar, is_myanmar, 0),
        (Script::Sinhala, is_sinhala, 0),
        (Script::Khmer, is_khmer, 0),
    ];

    // A tally above this threshold is an absolute majority, so the scan can
    // stop early.
    let majority = text.chars().count() / 2;

    for ch in text.chars().filter(|c| !is_stop_char(*c)) {
        // Only the first predicate that accepts the character is credited.
        let hit = tallies.iter().position(|&(_, belongs, _)| belongs(ch));
        let Some(idx) = hit else {
            continue;
        };

        let entry = &mut tallies[idx];
        entry.2 += 1;
        if entry.2 > majority {
            return Some(entry.0);
        }

        // Move the matched row one step towards the front so that scripts
        // seen often are tested earlier for the following characters.
        if idx > 0 {
            tallies.swap(idx - 1, idx);
        }
    }

    // `max_by_key` returns the last of several equally large entries, so ties
    // go to the row that currently sits furthest back in the table.
    tallies
        .iter()
        .max_by_key(|entry| entry.2)
        .filter(|winner| winner.2 != 0)
        .map(|winner| winner.0)
}
/// Classifies a single character.
///
/// The predicates are tried in the fixed order listed below and the script
/// of the first one that accepts `ch` is returned. `None` means that no
/// predicate matched.
pub fn detect_char_script(ch: char) -> Option<Script> {
    let classifiers: [(Script, fn(char) -> bool); 24] = [
        (Script::Latin, is_latin),
        (Script::Cyrillic, is_cyrillic),
        (Script::Arabic, is_arabic),
        (Script::Mandarin, is_mandarin),
        (Script::Devanagari, is_devanagari),
        (Script::Hebrew, is_hebrew),
        (Script::Ethiopic, is_ethiopic),
        (Script::Georgian, is_georgian),
        (Script::Bengali, is_bengali),
        (Script::Hangul, is_hangul),
        (Script::Hiragana, is_hiragana),
        (Script::Katakana, is_katakana),
        (Script::Greek, is_greek),
        (Script::Kannada, is_kannada),
        (Script::Tamil, is_tamil),
        (Script::Thai, is_thai),
        (Script::Gujarati, is_gujarati),
        (Script::Gurmukhi, is_gurmukhi),
        (Script::Telugu, is_telugu),
        (Script::Malayalam, is_malayalam),
        (Script::Oriya, is_oriya),
        (Script::Myanmar, is_myanmar),
        (Script::Sinhala, is_sinhala),
        (Script::Khmer, is_khmer),
    ];

    classifiers
        .iter()
        .find_map(|&(script, belongs)| if belongs(ch) { Some(script) } else { None })
}
/// Picks between Bengali and Assamese for text in the Bengali script.
///
/// The presence of `U+09F0` or `U+09F1` anywhere in `text` selects
/// `Language::Assamese`; otherwise the result is `Language::Bengali`.
fn detect_bengali_language(text: &str) -> Language {
    let has_assamese_marker = text
        .chars()
        .any(|c| c == '\u{09F0}' || c == '\u{09F1}');

    if has_assamese_marker {
        Language::Assamese
    } else {
        Language::Bengali
    }
}
/// Guesses the language of Cyrillic-script `text` from marker letters.
///
/// The text is scanned from the start and the first marker letter found
/// decides the result. Only the lowercase forms listed below are recognised.
/// Text without any marker is reported as `Language::Russian`.
fn detect_cyrillic_language(text: &str) -> Language {
    text.chars()
        .find_map(|letter| match letter {
            '\u{0460}'..='\u{047F}' => Some(Language::SlavonicChurch),
            'ѓ' | 'ќ' | 'ѕ' => Some(Language::Macedonian),
            'ў' => Some(Language::Belarusian),
            'є' | 'і' | 'ї' | 'ґ' => Some(Language::Ukrainian),
            'ө' | 'ү' | 'һ' => Some(Language::Mongolian),
            'ј' | 'љ' | 'њ' | 'ћ' | 'ђ' | 'џ' => Some(Language::SerbianCyrillic),
            'щ' => Some(Language::Bulgarian),
            _ => None,
        })
        .unwrap_or(Language::Russian)
}
/// Guesses the language of Devanagari-script `text`.
///
/// Scanning from the start, the first marker found decides: `U+0933` selects
/// `Language::Marathi`, and any code point in `U+1CD0..=U+1CFF` selects
/// `Language::Sanskrit`. Text without either marker is `Language::Hindi`.
fn detect_devanagari_language(text: &str) -> Language {
    for symbol in text.chars() {
        if symbol == '\u{0933}' {
            return Language::Marathi;
        }
        if ('\u{1CD0}'..='\u{1CFF}').contains(&symbol) {
            return Language::Sanskrit;
        }
    }
    Language::Hindi
}
/// Guesses the language variant of Greek-script `text`.
///
/// Scanning from the start, the first marker found decides: a code point in
/// `U+2C80..=U+2CFF` selects `Language::Coptic`, and one in
/// `U+1F00..=U+1FFF` selects `Language::GreekPoly`. Text without either
/// marker is reported as `Language::GreekMono`.
fn detect_greek_language(text: &str) -> Language {
    for c in text.chars() {
        match c {
            '\u{2C80}'..='\u{2CFF}' => return Language::Coptic,
            '\u{1F00}'..='\u{1FFF}' => return Language::GreekPoly,
            _ => {}
        }
    }
    Language::GreekMono
}
/// Guesses the language of Latin-script `text` from its diacritics.
///
/// The text is scanned from the start. Most marker letters decide the result
/// as soon as they are met. Three letters are ambiguous and are only
/// recorded: `ç`, `õ` and `ã`. If the scan finishes without an early
/// decision, the recorded letters are combined:
///
/// * only `ç` was seen: `Language::French`
/// * only `õ` was seen: `Language::Estonian`
/// * any other combination with at least one of them: `Language::Portuguese`
/// * none of them: `Language::EnglishUS`
///
/// Only the lowercase forms listed below are recognised.
fn detect_latin_language(text: &str) -> Language {
    let mut saw_c_cedilla = false;
    let mut saw_o_tilde = false;
    let mut saw_a_tilde = false;

    for letter in text.chars() {
        match letter {
            'ß' | 'ä' | 'ö' | 'ü' => return Language::German1996,
            'ő' | 'ű' => return Language::Hungarian,
            'ł' => return Language::Polish,
            'ř' | 'ů' => return Language::Czech,
            'ľ' | 'ĺ' | 'ŕ' => return Language::Slovak,
            'ā' | 'ē' | 'ģ' | 'ī' | 'ķ' | 'ļ' | 'ņ' | 'ō' | 'ū' => return Language::Latvian,
            'ą' | 'ę' | 'ė' | 'į' | 'ų' => return Language::Lithuanian,
            'ă' | 'ș' | 'ț' => return Language::Romanian,
            'ğ' | 'ı' | 'ş' => return Language::Turkish,
            'đ' => return Language::Croatian,
            'þ' | 'ð' => return Language::Icelandic,
            'ŵ' | 'ŷ' => return Language::Welsh,
            'æ' | 'ø' => return Language::NorwegianBokmal,
            'å' => return Language::Swedish,
            'ñ' | 'á' | 'é' | 'í' | 'ó' | 'ú' => return Language::Spanish,
            // Ambiguous letters: remember them and decide after the scan.
            'ç' => saw_c_cedilla = true,
            'õ' => saw_o_tilde = true,
            'ã' => saw_a_tilde = true,
            _ => {}
        }
    }

    match (saw_c_cedilla, saw_o_tilde, saw_a_tilde) {
        (false, false, false) => Language::EnglishUS,
        (true, false, false) => Language::French,
        (false, true, false) => Language::Estonian,
        _ => Language::Portuguese,
    }
}
pub fn script_to_language(script: Script, text: &str) -> Language {
match script {
Script::Ethiopic => Language::Ethiopic,
Script::Georgian => Language::Georgian,
Script::Gujarati => Language::Gujarati,
Script::Gurmukhi => Language::Panjabi,
Script::Kannada => Language::Kannada,
Script::Malayalam => Language::Malayalam,
Script::Mandarin => Language::Chinese,
Script::Oriya => Language::Oriya,
Script::Tamil => Language::Tamil,
Script::Telugu => Language::Telugu,
Script::Thai => Language::Thai,
Script::Bengali => detect_bengali_language(text),
Script::Cyrillic => detect_cyrillic_language(text),
Script::Devanagari => detect_devanagari_language(text),
Script::Greek => detect_greek_language(text),
Script::Latin => detect_latin_language(text),
Script::Myanmar => Language::Thai,
Script::Khmer => Language::Thai,
Script::Sinhala => Language::Hindi,
Script::Arabic => Language::Chinese,
Script::Hebrew => Language::Chinese,
Script::Hangul => Language::Chinese,
Script::Hiragana => Language::Chinese,
Script::Katakana => Language::Chinese,
}
}
/// Returns `true` when `ch` is one of the code points this module counts as
/// Cyrillic: four inclusive spans plus three isolated code points.
pub fn is_cyrillic(ch: char) -> bool {
    const CYRILLIC_SPANS: [(char, char); 4] = [
        ('\u{0400}', '\u{0484}'),
        ('\u{0487}', '\u{052F}'),
        ('\u{2DE0}', '\u{2DFF}'),
        ('\u{A640}', '\u{A69D}'),
    ];
    const CYRILLIC_SINGLES: [char; 3] = ['\u{1D2B}', '\u{1D78}', '\u{A69F}'];

    CYRILLIC_SINGLES.contains(&ch)
        || CYRILLIC_SPANS
            .iter()
            .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Returns `true` when `ch` is an ASCII letter or lies in one of the
/// non-ASCII spans this module counts as Latin.
///
/// ASCII digits, punctuation and control characters are not Latin here.
pub fn is_latin(ch: char) -> bool {
    const LATIN_SPANS: [(char, char); 13] = [
        ('a', 'z'),
        ('A', 'Z'),
        ('\u{0080}', '\u{00FF}'),
        ('\u{0100}', '\u{017F}'),
        ('\u{0180}', '\u{024F}'),
        ('\u{0250}', '\u{02AF}'),
        ('\u{1D00}', '\u{1D7F}'),
        ('\u{1D80}', '\u{1DBF}'),
        ('\u{1E00}', '\u{1EFF}'),
        ('\u{2100}', '\u{214F}'),
        ('\u{2C60}', '\u{2C7F}'),
        ('\u{A720}', '\u{A7FF}'),
        ('\u{AB30}', '\u{AB6F}'),
    ];

    LATIN_SPANS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Returns `true` when `ch` lies in one of the seven inclusive spans this
/// module counts as Arabic.
pub fn is_arabic(ch: char) -> bool {
    const ARABIC_SPANS: [(char, char); 7] = [
        ('\u{0600}', '\u{06FF}'),
        ('\u{0750}', '\u{07FF}'),
        ('\u{08A0}', '\u{08FF}'),
        ('\u{FB50}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFF}'),
        ('\u{10E60}', '\u{10E7F}'),
        ('\u{1EE00}', '\u{1EEFF}'),
    ];

    ARABIC_SPANS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Returns `true` when `ch` lies in `U+0900..=U+097F`, `U+A8E0..=U+A8FF` or
/// `U+1CD0..=U+1CFF`, the spans this module counts as Devanagari.
pub fn is_devanagari(ch: char) -> bool {
    [
        '\u{0900}'..='\u{097F}',
        '\u{A8E0}'..='\u{A8FF}',
        '\u{1CD0}'..='\u{1CFF}',
    ]
    .iter()
    .any(|span| span.contains(&ch))
}
/// Returns `true` when `ch` lies in `U+1200..=U+139F`, `U+2D80..=U+2DDF` or
/// `U+AB00..=U+AB2F`, the spans this module counts as Ethiopic.
pub fn is_ethiopic(ch: char) -> bool {
    [
        '\u{1200}'..='\u{139F}',
        '\u{2D80}'..='\u{2DDF}',
        '\u{AB00}'..='\u{AB2F}',
    ]
    .iter()
    .any(|span| span.contains(&ch))
}
/// Returns `true` when `ch` lies in `U+0590..=U+05FF`, the span this module
/// counts as Hebrew.
pub fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+10A0..=U+10FF`, the span this module
/// counts as Georgian.
pub fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
/// Returns `true` when `ch` lies in one of the inclusive spans this module
/// counts as `Script::Mandarin`.
///
/// `U+3005` and `U+3007` are listed as one-element spans.
pub fn is_mandarin(ch: char) -> bool {
    const MANDARIN_SPANS: [(char, char); 11] = [
        ('\u{2E80}', '\u{2E99}'),
        ('\u{2E9B}', '\u{2EF3}'),
        ('\u{2F00}', '\u{2FD5}'),
        ('\u{3005}', '\u{3005}'),
        ('\u{3007}', '\u{3007}'),
        ('\u{3021}', '\u{3029}'),
        ('\u{3038}', '\u{303B}'),
        ('\u{3400}', '\u{4DB5}'),
        ('\u{4E00}', '\u{9FCC}'),
        ('\u{F900}', '\u{FA6D}'),
        ('\u{FA70}', '\u{FAD9}'),
    ];

    MANDARIN_SPANS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Returns `true` when `ch` lies in `U+0980..=U+09FF`, the span this module
/// counts as Bengali.
pub fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+3040..=U+309F`, the span this module
/// counts as Hiragana.
pub fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+30A0..=U+30FF`, the span this module
/// counts as Katakana.
pub fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
/// Returns `true` when `ch` lies in one of the seven inclusive spans this
/// module counts as Hangul.
pub fn is_hangul(ch: char) -> bool {
    const HANGUL_SPANS: [(char, char); 7] = [
        ('\u{AC00}', '\u{D7AF}'),
        ('\u{1100}', '\u{11FF}'),
        ('\u{3130}', '\u{318F}'),
        ('\u{3200}', '\u{32FF}'),
        ('\u{A960}', '\u{A97F}'),
        ('\u{D7B0}', '\u{D7FF}'),
        ('\u{FF00}', '\u{FFEF}'),
    ];

    HANGUL_SPANS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
/// Returns `true` when `ch` lies in `U+0370..=U+03FF`, the span this module
/// counts as Greek.
pub fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0C80..=U+0CFF`, the span this module
/// counts as Kannada.
pub fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0B80..=U+0BFF`, the span this module
/// counts as Tamil.
pub fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0E00..=U+0E7F`, the span this module
/// counts as Thai.
pub fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0A80..=U+0AFF`, the span this module
/// counts as Gujarati.
pub fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0A00..=U+0A7F`, the span this module
/// counts as Gurmukhi.
pub fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0C00..=U+0C7F`, the span this module
/// counts as Telugu.
pub fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0D00..=U+0D7F`, the span this module
/// counts as Malayalam.
pub fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0B00..=U+0B7F`, the span this module
/// counts as Oriya.
pub fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+1000..=U+109F`, the span this module
/// counts as Myanmar.
pub fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+0D80..=U+0DFF`, the span this module
/// counts as Sinhala.
pub fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
/// Returns `true` when `ch` lies in `U+1780..=U+17FF` or `U+19E0..=U+19FF`,
/// the spans this module counts as Khmer.
pub fn is_khmer(ch: char) -> bool {
    let in_first_span = ('\u{1780}'..='\u{17FF}').contains(&ch);
    let in_second_span = ('\u{19E0}'..='\u{19FF}').contains(&ch);

    in_first_span || in_second_span
}