#![doc = include_str!("../README.md")]
use std::fmt;
use std::str::FromStr;
/// Writing systems that the converter can read from or write to.
///
/// Each variant is identified by an upper-case tag (see [`Script::as_str`]),
/// which is also what the `Display` implementation prints.
///
/// NOTE(review): the expansions of the tags are not spelled out in this file;
/// the per-variant notes below only describe what the conversion tables and
/// rules in this file show.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// Tag `UAS`: the Arabic-letter script (see `UAS_GROUP1`).
    Uas,
    /// Tag `ULS`: a Latin script using digraphs such as `ng`, `ch`, `sh`, `zh`, `gh`.
    Uls,
    /// Tag `UYS`: a Latin script using letters such as `ə`, `ⱨ`, `ⱪ`, `ƣ`.
    Uys,
    /// Tag `CTS`: the Latin script every conversion in this file pivots through.
    Cts,
    /// Tag `UCS`: the Cyrillic-letter script (see `UCS_GROUP1`).
    Ucs,
    /// Tag `XJUS`: an ASCII script in which letter case is significant.
    Xjus,
    /// Tag `UZLS`: a Latin script using `o‘`, `u‘` and `g‘`.
    Uzls,
    /// Tag `IPA`: phonetic output (see `IPA_GROUP1`); only ever a target.
    Ipa,
}

impl Script {
    /// Returns the canonical upper-case tag of this script.
    fn as_str(self) -> &'static str {
        match self {
            Script::Uas => "UAS",
            Script::Uls => "ULS",
            Script::Uys => "UYS",
            Script::Cts => "CTS",
            Script::Ucs => "UCS",
            Script::Xjus => "XJUS",
            Script::Uzls => "UZLS",
            Script::Ipa => "IPA",
        }
    }
}

impl fmt::Display for Script {
    /// Writes the canonical tag, e.g. `UAS`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let tag = self.as_str();
        f.write_str(tag)
    }
}
/// Error returned when a string does not name any known [`Script`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseScriptError {
    /// The rejected input, stored exactly as it was passed to the parser.
    input: String,
}

impl fmt::Display for ParseScriptError {
    /// Formats as `unknown script: <input>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("unknown script: ")?;
        f.write_str(&self.input)
    }
}

impl std::error::Error for ParseScriptError {}
impl FromStr for Script {
type Err = ParseScriptError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.trim().to_ascii_uppercase().as_str() {
"UAS" => Ok(Self::Uas),
"ULS" => Ok(Self::Uls),
"UYS" => Ok(Self::Uys),
"CTS" => Ok(Self::Cts),
"UCS" => Ok(Self::Ucs),
"XJUS" | "XJUSS" => Ok(Self::Xjus),
"UZLS" => Ok(Self::Uzls),
"IPA" => Ok(Self::Ipa),
_ => Err(ParseScriptError {
input: s.to_string(),
}),
}
}
}
/// Errors produced while converting text between scripts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// No conversion route exists from `source` to `target`.
    UnsupportedConversion { source: Script, target: Script },
}

impl fmt::Display for Error {
    /// Formats as `conversion from <source> to <target> not supported`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Error::UnsupportedConversion { source, target } => write!(
                f,
                "conversion from {} to {} not supported",
                source, target
            ),
        }
    }
}

impl std::error::Error for Error {}
/// A converter that remembers a source/target [`Script`] pair between calls.
///
/// All of the work is delegated to the free function `convert`; this type
/// only stores which pair to use.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UgMultiScriptConverter {
    /// Script the input text is written in.
    source_script: Script,
    /// Script the output text should be written in.
    target_script: Script,
}

impl UgMultiScriptConverter {
    /// Creates a converter for the given script pair.
    ///
    /// The pair is not validated here; an unsupported pair is only reported
    /// when a conversion is attempted.
    pub fn new(source_script: Script, target_script: Script) -> Self {
        Self {
            source_script,
            target_script,
        }
    }

    /// Returns the script currently used as the conversion source.
    pub fn source_script(&self) -> Script {
        self.source_script
    }

    /// Returns the script currently used as the conversion target.
    pub fn target_script(&self) -> Script {
        self.target_script
    }

    /// Converts `text` using the stored script pair.
    ///
    /// # Errors
    ///
    /// Returns [`Error::UnsupportedConversion`] when there is no route
    /// between the stored scripts.
    pub fn convert(&self, text: &str) -> Result<String, Error> {
        convert(text, self.source_script, self.target_script)
    }

    /// Replaces the stored script pair and then converts `text` with it.
    ///
    /// The new pair stays stored even if the conversion fails.
    ///
    /// # Errors
    ///
    /// Returns [`Error::UnsupportedConversion`] when there is no route
    /// between the given scripts.
    pub fn convert_with(
        &mut self,
        text: &str,
        source_script: Script,
        target_script: Script,
    ) -> Result<String, Error> {
        self.source_script = source_script;
        self.target_script = target_script;
        self.convert(text)
    }

    /// Returns `true` when `text` contains at least one character in the
    /// range U+0621..=U+06FF.
    ///
    /// NOTE(review): despite "pure" in the name, this does not require every
    /// character to be in that range; mixed text also returns `true`.
    /// Confirm whether an all-characters check was intended.
    pub fn is_pure_uyghur_script(text: &str) -> bool {
        text.chars()
            .any(|ch| ('\u{0621}'..='\u{06ff}').contains(&ch))
    }
}
pub fn convert(text: &str, source_script: Script, target_script: Script) -> Result<String, Error> {
if source_script == target_script {
return Ok(text.to_string());
}
match (source_script, target_script) {
(Script::Uas, Script::Cts) => Ok(uas_to_cts(text, false)),
(Script::Uas, Script::Uls) => Ok(cts_to_uls(&uas_to_cts(text, true))),
(Script::Uas, Script::Ucs) => Ok(cts_to_ucs(&uas_to_cts(text, true))),
(Script::Uas, Script::Uys) => Ok(cts_to_uys(&uas_to_cts(text, true))),
(Script::Uas, Script::Uzls) => Ok(cts_to_uzls(&uas_to_cts(text, true))),
(Script::Uls, Script::Cts) => Ok(uls_to_cts(text)),
(Script::Uls, Script::Uas) => Ok(cts_to_uas(&uls_to_cts(text))),
(Script::Uls, Script::Ucs) => Ok(cts_to_ucs(&uls_to_cts(text))),
(Script::Uls, Script::Uys) => Ok(cts_to_uys(&uls_to_cts(text))),
(Script::Uys, Script::Cts) => Ok(uys_to_cts(text)),
(Script::Uys, Script::Uas) => Ok(cts_to_uas(&uys_to_cts(text))),
(Script::Uys, Script::Uls) => Ok(cts_to_uls(&uys_to_cts(text))),
(Script::Uys, Script::Ucs) => Ok(cts_to_ucs(&uys_to_cts(text))),
(Script::Ucs, Script::Cts) => Ok(ucs_to_cts(text)),
(Script::Ucs, Script::Uas) => Ok(cts_to_uas(&ucs_to_cts(text))),
(Script::Ucs, Script::Uls) => Ok(cts_to_uls(&ucs_to_cts(text))),
(Script::Ucs, Script::Uys) => Ok(cts_to_uys(&ucs_to_cts(text))),
(Script::Xjus, Script::Cts) => Ok(xjus_to_cts(text)),
(Script::Xjus, Script::Uas) => Ok(cts_to_uas(&xjus_to_cts(text))),
(Script::Uzls, Script::Cts) => Ok(uzls_to_cts(text)),
(Script::Cts, Script::Uas) => Ok(cts_to_uas(text)),
(Script::Cts, Script::Uls) => Ok(cts_to_uls(text)),
(Script::Cts, Script::Uys) => Ok(cts_to_uys(text)),
(Script::Cts, Script::Ipa) => Ok(cts_to_ipa(text)),
(Script::Cts, Script::Uzls) => Ok(cts_to_uzls(text)),
(Script::Cts, Script::Xjus) => Ok(cts_to_xjus(text)),
(Script::Cts, Script::Ucs) => Ok(cts_to_ucs(text)),
_ => Err(Error::UnsupportedConversion {
source: source_script,
target: target_script,
}),
}
}
/// Arabic-script (UAS) side of the letter mapping tables.
///
/// Entries are matched by index with `CTS_GROUP1`: `uas_to_cts` and
/// `cts_to_uas` hand both tables to `replace_via_table`, which pairs them
/// position by position. Keep the order and length in sync with that table.
/// The last four entries are punctuation marks rather than letters.
const UAS_GROUP1: &[&str] = &[
"ا", "ە", "ب", "پ", "ت", "ج", "چ", "خ", "د", "ر", "ز", "ژ", "س", "ش", "ف", "ڭ", "ل", "لا", "م",
"ھ", "و", "ۇ", "ۆ", "ۈ", "ۋ", "ې", "ى", "ي", "ق", "ك", "گ", "ن", "غ", "؟", "،", "؛", "٭",
];
/// CTS (pivot Latin script) side of the letter mapping tables.
///
/// Entries are matched by index with `UAS_GROUP1`, `UCS_GROUP1` and
/// `IPA_GROUP1`; the conversion functions pass this table together with one
/// of those to `replace_via_table`, which pairs them position by position.
/// Keep the order and length in sync with the other tables.
const CTS_GROUP1: &[&str] = &[
"a", "e", "b", "p", "t", "c", "ç", "x", "d", "r", "z", "j", "s", "ş", "f", "ñ", "l", "la", "m",
"h", "o", "u", "ö", "ü", "v", "é", "i", "y", "q", "k", "g", "n", "ğ", "?", ",", ";", "*",
];
/// Cyrillic-script (UCS) side of the letter mapping tables.
///
/// Entries are matched by index with `CTS_GROUP1`: `ucs_to_cts` and
/// `cts_to_ucs` hand both tables to `replace_via_table`, which pairs them
/// position by position. Keep the order and length in sync with that table.
/// The letters `я` and `ю` are not listed here; `ucs_to_cts` and
/// `cts_to_ucs` handle them with separate replacements.
const UCS_GROUP1: &[&str] = &[
"а", "ә", "б", "п", "т", "җ", "ч", "х", "д", "р", "з", "ж", "с", "ш", "ф", "ң", "л", "ла", "м",
"һ", "о", "у", "ө", "ү", "в", "е", "и", "й", "қ", "к", "г", "н", "ғ", "?", ",", ";", "*",
];
/// Phonetic (IPA) side of the letter mapping tables.
///
/// Entries are matched by index with `CTS_GROUP1`. Unlike the other target
/// tables, many values here are themselves plain Latin letters that also
/// occur as CTS keys (for example `ü` maps to `y` while `y` maps to `j`), so
/// the order in which replacements are applied matters; `cts_to_ipa` filters
/// the table before handing it to `replace_via_table` for that reason.
///
/// NOTE(review): the `la` entry maps to a Latin `la`; the `a` in it is only
/// turned into `ɑ` because `replace_via_table` later applies the `a` rule to
/// the already-replaced text.
const IPA_GROUP1: &[&str] = &[
"ɑ", "æ", "b", "p", "t", "dʒ", "tʃ", "χ", "d", "r", "z", "ʒ", "s", "ʃ", "f", "ŋ", "l", "la",
"m", "h", "o", "u", "ø", "y", "w", "ɛ", "i", "j", "q", "k", "ɡ", "n", "ʁ", "?", ",", ";", "*",
];
/// Rewrites `text` by replacing every `from[i]` with `to[i]`.
///
/// The pairs are applied one after another over the whole string, longest
/// source (in bytes) first; pairs of equal length keep their table order.
/// Because each pair runs over the output of the previous ones, text produced
/// by an earlier replacement can be rewritten again by a later pair.
///
/// If the tables differ in length, the surplus entries of the longer one are
/// ignored.
fn replace_via_table(text: &str, from: &[&str], to: &[&str]) -> String {
    let mut table: Vec<(&str, &str)> = from
        .iter()
        .zip(to)
        .map(|(&src, &dst)| (src, dst))
        .collect();
    // Stable sort: descending by source length, ties stay in table order.
    table.sort_by_key(|&(src, _)| std::cmp::Reverse(src.len()));
    table
        .into_iter()
        .fold(text.to_string(), |acc, (src, dst)| acc.replace(src, dst))
}
/// Returns `true` when `ch` is one of the lower-case letters of the CTS
/// alphabet used in this file (vowels and consonants alike).
///
/// Upper-case letters, digits, punctuation and letters outside that alphabet
/// (for example `w`) are rejected.
fn is_cts_letter(ch: char) -> bool {
    const LETTERS: [char; 32] = [
        'a', 'e', 'u', 'o', 'ö', 'ü', 'b', 'p', 't', 'c', 'ç', 'x', 'd', 'r', 'z', 'j', 's', 'ş',
        'f', 'ñ', 'l', 'm', 'h', 'v', 'é', 'i', 'y', 'q', 'k', 'g', 'n', 'ğ',
    ];
    LETTERS.contains(&ch)
}
/// Returns `true` when `ch` is one of the eight lower-case CTS vowels
/// (`a`, `e`, `é`, `i`, `o`, `u`, `ö`, `ü`).
fn is_cts_vowel(ch: char) -> bool {
    const VOWELS: [char; 8] = ['a', 'e', 'é', 'i', 'o', 'u', 'ö', 'ü'];
    VOWELS.contains(&ch)
}
/// Returns `true` when `ch` is one of the lower-case CTS consonants.
///
/// `cts_to_uas` uses this to decide whether a vowel directly follows a
/// consonant.
fn is_cts_uas_consonant(ch: char) -> bool {
    const CONSONANTS: [char; 24] = [
        'b', 'p', 't', 'c', 'ç', 'x', 'd', 'r', 'z', 'j', 's', 'ş', 'f', 'ñ', 'l', 'm', 'h', 'v',
        'y', 'q', 'k', 'g', 'n', 'ğ',
    ];
    CONSONANTS.contains(&ch)
}
/// Returns `true` when `ch` is one of the eight Arabic-script vowel letters
/// recognised by this file.
fn is_uas_vowel(ch: char) -> bool {
    const VOWELS: [char; 8] = ['ا', 'ە', 'ې', 'ى', 'و', 'ۇ', 'ۆ', 'ۈ'];
    VOWELS.contains(&ch)
}
fn revise_cts(text: &str, keep_apostrophes: bool) -> String {
let mut stripped = String::with_capacity(text.len());
let mut prev: Option<char> = None;
for ch in text.chars() {
if ch == 'ئ' && prev.map_or(true, |p| !is_cts_letter(p)) {
prev = Some(ch);
continue;
}
stripped.push(ch);
prev = Some(ch);
}
let mut out = String::with_capacity(stripped.len());
let mut prev: Option<char> = None;
for ch in stripped.chars() {
if ch == 'ئ' {
if !keep_apostrophes && prev.is_some_and(is_cts_vowel) {
prev = Some(ch);
continue;
}
out.push('\'');
prev = Some(ch);
continue;
}
out.push(ch);
prev = Some(ch);
}
out
}
fn revise_uas(text: &str) -> String {
let mut out = String::with_capacity(text.len());
let mut prev: Option<char> = None;
for ch in text.chars() {
if is_uas_vowel(ch)
&& prev.map_or(true, |p| p == '-' || p.is_whitespace() || is_uas_vowel(p))
{
out.push('ئ');
}
out.push(ch);
prev = Some(ch);
}
out
}
/// Converts Arabic-script text to CTS.
///
/// Letters are mapped through `UAS_GROUP1` → `CTS_GROUP1`; the remaining `ئ`
/// markers are then resolved by `revise_cts`, to which `keep_apostrophes` is
/// forwarded unchanged.
fn uas_to_cts(text: &str, keep_apostrophes: bool) -> String {
    revise_cts(
        &replace_via_table(text, UAS_GROUP1, CTS_GROUP1),
        keep_apostrophes,
    )
}
/// Converts ULS text to CTS.
///
/// The input is lower-cased and the rewrite rules below are then applied in
/// order, each over the whole string. The order matters: for instance `j`
/// must become `c` before `zh` is turned into `j`.
fn uls_to_cts(text: &str) -> String {
    const RULES: [(&str, &str); 11] = [
        ("j", "c"),
        ("ng", "ñ"),
        // NOTE(review): this yields `ng'` with a trailing apostrophe, whereas
        // `uzls_to_cts` maps `n'g` to plain `ng` — confirm which is intended.
        ("n'g", "ng'"),
        ("'ng", "ñ"),
        ("ch", "ç"),
        ("zh", "j"),
        ("sh", "ş"),
        ("'gh", "ğ"),
        ("gh", "ğ"),
        ("w", "v"),
        // Repeats an earlier rule; kept to preserve the original sequence.
        ("ch", "ç"),
    ];
    RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to))
}
/// Converts UYS text to CTS.
///
/// The input is lower-cased and the rewrite rules below are then applied in
/// order, each over the whole string. The order matters because several
/// letters swap roles (for example `e` → `é` must run before `ə` → `e`, and
/// `x` → `ş` before `h` → `x`).
fn uys_to_cts(text: &str) -> String {
    const RULES: [(&str, &str); 14] = [
        ("e", "é"),
        ("ə", "e"),
        ("j", "c"),
        ("q", "ç"),
        ("ⱬ", "j"),
        ("x", "ş"),
        ("h", "x"),
        ("ⱨ", "h"),
        ("ng", "ñ"),
        ("ø", "ö"),
        // Maps the letter to itself; kept to preserve the original sequence.
        ("ü", "ü"),
        ("w", "v"),
        ("ⱪ", "q"),
        ("ƣ", "ğ"),
    ];
    RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to))
}
/// Converts Cyrillic-script text to CTS.
///
/// The input is lower-cased and mapped through `UCS_GROUP1` → `CTS_GROUP1`;
/// afterwards the letters `я` and `ю`, which are not in the table, are
/// expanded to `ya` and `yu`.
fn ucs_to_cts(text: &str) -> String {
    let lowered = text.to_lowercase();
    let latin = replace_via_table(&lowered, UCS_GROUP1, CTS_GROUP1);
    [("я", "ya"), ("ю", "yu")]
        .iter()
        .fold(latin, |acc, &(cyrillic, expansion)| {
            acc.replace(cyrillic, expansion)
        })
}
/// Converts XJUS text to CTS.
///
/// XJUS is case-sensitive, so the input is not lower-cased. The replacements
/// run in order over the whole string and are the inverse of the rules in
/// `cts_to_xjus`; the remaining `ئ` markers (written `v` in XJUS) are then
/// resolved by `revise_cts`.
///
/// The order matters wherever a produced letter is also a source letter:
///
/// * `c` → `ç` must run before `j` → `c`;
/// * `j` → `c` must run before `J` → `j`, otherwise the `j` produced from `J`
///   would immediately be turned into `c`;
/// * `x` → `ş` before `H` → `x`, `e` → `é` before `A` → `e`, and
///   `v` → `ئ` before `w` → `v`.
fn xjus_to_cts(text: &str) -> String {
    let mapped = text
        .replace('v', "ئ")
        .replace('c', "ç")
        .replace('j', "c")
        // Deliberately after `j` → `c`, so the new `j` is left alone.
        .replace('J', "j")
        .replace('x', "ş")
        .replace('H', "x")
        .replace('N', "ñ")
        .replace('O', "ö")
        .replace('U', "ü")
        .replace('e', "é")
        .replace('A', "e")
        .replace('G', "ğ")
        .replace('w', "v");
    revise_cts(&mapped, false)
}
/// Converts UZLS text to CTS.
///
/// The rewrite rules below are applied in order, each over the whole string,
/// and the result is passed through `revise_cts`. The input is not
/// lower-cased. The order matters because the vowels rotate
/// (`e` → `é`, then `a` → `e`, then `o` → `a`).
fn uzls_to_cts(text: &str) -> String {
    const RULES: [(&str, &str); 12] = [
        ("ch", "ç"),
        ("sh", "ş"),
        ("s'h", "sh"),
        ("ng", "ñ"),
        ("n'g", "ng"),
        ("g‘", "ğ"),
        ("o‘", "ö"),
        ("u‘", "ü"),
        ("e", "é"),
        ("a", "e"),
        ("o", "a"),
        ("j", "c"),
    ];
    let latin = RULES
        .iter()
        .fold(text.to_string(), |acc, &(from, to)| acc.replace(from, to));
    revise_cts(&latin, false)
}
/// Converts CTS text to the Arabic script.
///
/// Steps:
///
/// 1. insert `ئ` before every CTS vowel that does not directly follow a CTS
///    consonant (including a vowel at the very start);
/// 2. map the letters through `CTS_GROUP1` → `UAS_GROUP1`;
/// 3. remove all apostrophes;
/// 4. run `revise_uas` over the result.
///
/// The input is not lower-cased.
fn cts_to_uas(text: &str) -> String {
    let mut marked = String::with_capacity(text.len() * 2);
    let mut after_consonant = false;
    for letter in text.chars() {
        if !after_consonant && is_cts_vowel(letter) {
            marked.push('ئ');
        }
        marked.push(letter);
        after_consonant = is_cts_uas_consonant(letter);
    }

    let arabic = replace_via_table(&marked, CTS_GROUP1, UAS_GROUP1);
    let without_apostrophes: String = arabic.chars().filter(|&c| c != '\'').collect();
    revise_uas(&without_apostrophes)
}
/// Converts CTS text to ULS.
///
/// The input is lower-cased and the rewrite rules below are then applied in
/// order, each over the whole string. The first group splits letter pairs
/// that would otherwise read as ULS digraphs (`ng` → `n'g`, ...); the second
/// group maps the CTS letters to their ULS spellings.
fn cts_to_uls(text: &str) -> String {
    const RULES: [(&str, &str); 14] = [
        ("ng", "n'g"),
        ("sh", "s'h"),
        ("ch", "c'h"),
        ("zh", "z'h"),
        ("gh", "g'h"),
        // Repeats the first rule; kept to preserve the original sequence.
        ("ng", "n'g"),
        ("nğ", "n'gh"),
        ("ñ", "ng"),
        ("j", "zh"),
        ("c", "j"),
        ("ç", "ch"),
        ("ş", "sh"),
        ("ğ", "gh"),
        ("v", "w"),
    ];
    RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to))
}
/// Converts CTS text to UYS.
///
/// The input is lower-cased and the rewrite rules below are then applied in
/// order, each over the whole string. The order matters because several
/// letters swap roles (for example `e` → `ə` must run before `é` → `e`, and
/// `h` → `ⱨ` before `x` → `h`).
fn cts_to_uys(text: &str) -> String {
    const RULES: [(&str, &str); 14] = [
        ("ng", "n'g"),
        ("e", "ə"),
        ("j", "ⱬ"),
        ("c", "j"),
        ("q", "ⱪ"),
        ("ç", "q"),
        ("h", "ⱨ"),
        ("x", "h"),
        ("ş", "x"),
        ("ñ", "ng"),
        ("ö", "ø"),
        ("v", "w"),
        ("é", "e"),
        ("ğ", "ƣ"),
    ];
    RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to))
}
/// Converts CTS text to its IPA rendering.
///
/// Most letters are mapped through `CTS_GROUP1` → `IPA_GROUP1`. Two pairs
/// collide, because the IPA value of `ü` is `y` while `y` itself maps to `j`.
/// `replace_via_table` applies its pairs one after another, so leaving both
/// in the table would turn `ü` into `j`. Both pairs are therefore taken out
/// of the table and applied afterwards in a safe order: `y` → `j` first, then
/// `ü` → `y`.
///
/// Previously only the `y` pair was removed and never re-applied, so `y` and
/// `ü` both came out as `y`.
///
/// The input is not lower-cased.
fn cts_to_ipa(text: &str) -> String {
    let (cts, ipa): (Vec<&str>, Vec<&str>) = CTS_GROUP1
        .iter()
        .copied()
        .zip(IPA_GROUP1.iter().copied())
        .filter(|(src, _)| !matches!(*src, "y" | "ü"))
        .unzip();

    // After the table pass no `j` or `y` has been produced (source `j` has
    // already become `ʒ`), so these two steps cannot interfere with it.
    replace_via_table(text, &cts, &ipa)
        .replace('y', "j")
        .replace('ü', "y")
}
/// Converts CTS text to UZLS.
///
/// The input is lower-cased and the rewrite rules below are then applied in
/// order, each over the whole string. The order matters because the vowels
/// rotate (`a` → `o`, then `e` → `a`, and only later `é` → `e`).
fn cts_to_uzls(text: &str) -> String {
    const RULES: [(&str, &str); 11] = [
        ("a", "o"),
        ("e", "a"),
        ("c", "j"),
        // Maps the letter to itself; kept to preserve the original sequence.
        ("q", "q"),
        ("ç", "ch"),
        ("ş", "sh"),
        ("ñ", "ng"),
        ("ö", "o‘"),
        ("ü", "u‘"),
        ("é", "e"),
        ("ğ", "g‘"),
    ];
    RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to))
}
/// Converts CTS text to XJUS.
///
/// Steps:
///
/// 1. lower-case the input and apply the letter rules below in order (the
///    order matters, e.g. `e` → `A` must run before `é` → `e`);
/// 2. insert `v` before every vowel that does not directly follow a
///    consonant (including a vowel at the very start);
/// 3. remove all apostrophes.
fn cts_to_xjus(text: &str) -> String {
    const RULES: [(&str, &str); 12] = [
        ("e", "A"),
        ("x", "H"),
        ("j", "J"),
        ("c", "j"),
        ("ç", "c"),
        ("ş", "x"),
        ("ñ", "N"),
        ("ö", "O"),
        ("ü", "U"),
        ("é", "e"),
        ("ğ", "G"),
        ("v", "w"),
    ];
    // Vowels and consonants as they look after the letter rules have run.
    const VOWELS: [char; 8] = ['a', 'A', 'e', 'i', 'o', 'u', 'O', 'U'];
    const CONSONANTS: [char; 24] = [
        'b', 'p', 't', 'c', 'x', 'd', 'r', 'z', 'j', 'J', 's', 'f', 'N', 'l', 'm', 'h', 'H', 'y',
        'q', 'k', 'g', 'n', 'G', 'w',
    ];

    let lettered = RULES
        .iter()
        .fold(text.to_lowercase(), |acc, &(from, to)| acc.replace(from, to));

    let mut result = String::with_capacity(lettered.len() * 2);
    let mut after_consonant = false;
    for symbol in lettered.chars() {
        if !after_consonant && VOWELS.contains(&symbol) {
            result.push('v');
        }
        result.push(symbol);
        after_consonant = CONSONANTS.contains(&symbol);
    }

    // Apostrophes only served as separators for the vowel check above.
    result.retain(|c| c != '\'');
    result
}
/// Converts CTS text to the Cyrillic script.
///
/// The input is lower-cased, the sequences `ya` and `yu` are contracted to
/// `я` and `ю`, and the remaining letters are mapped through
/// `CTS_GROUP1` → `UCS_GROUP1`.
fn cts_to_ucs(text: &str) -> String {
    let contracted = [("ya", "я"), ("yu", "ю")]
        .iter()
        .fold(text.to_lowercase(), |acc, &(latin, cyrillic)| {
            acc.replace(latin, cyrillic)
        });
    replace_via_table(&contracted, CTS_GROUP1, UCS_GROUP1)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Script tags parse regardless of ASCII case, including the `XJUSS` alias.
    #[test]
    fn parses_scripts_case_insensitively() {
        assert_eq!("uas".parse::<Script>(), Ok(Script::Uas));
        assert_eq!("XJUSS".parse::<Script>(), Ok(Script::Xjus));
    }

    /// Arabic-script text is detected; plain Latin text is not.
    #[test]
    fn detects_arabic_script() {
        assert!(UgMultiScriptConverter::is_pure_uyghur_script("ئاپ"));
        assert!(!UgMultiScriptConverter::is_pure_uyghur_script("ap"));
    }

    /// A word-initial `ئ` is dropped when converting to CTS.
    #[test]
    fn converts_uas_to_cts() {
        assert_eq!(
            convert("ئاپ", Script::Uas, Script::Cts),
            Ok("ap".to_string())
        );
    }

    /// A word-initial vowel gains `ئ` when converting to the Arabic script.
    #[test]
    fn converts_cts_to_uas() {
        assert_eq!(
            convert("ap", Script::Cts, Script::Uas),
            Ok("ئاپ".to_string())
        );
    }

    /// ULS `j` and `ng` become CTS `c` and `ñ`.
    #[test]
    fn converts_uls_to_cts() {
        assert_eq!(
            convert("jang", Script::Uls, Script::Cts),
            Ok("cañ".to_string())
        );
    }

    /// `ya` and `yu` contract to single Cyrillic letters.
    #[test]
    fn converts_cts_to_ucs() {
        assert_eq!(
            convert("ya yu", Script::Cts, Script::Ucs),
            Ok("я ю".to_string())
        );
    }

    /// Pairs without a conversion route report both scripts in the error.
    #[test]
    fn rejects_unsupported_pairs() {
        assert_eq!(
            convert("text", Script::Uzls, Script::Uas),
            Err(Error::UnsupportedConversion {
                source: Script::Uzls,
                target: Script::Uas,
            })
        );
    }
}