use crate::unicode::{general_category, nfd, Group};
use alloc::string::String;
/// Returns `s` with every character of the `Mark` general-category group removed.
///
/// The input is first passed through [`nfd`] so that marks become separate
/// characters (presumably canonical decomposition), the marks are dropped, and
/// the remaining characters are passed through `crate::unicode::nfc` before
/// being collected into the result.
#[must_use]
pub fn remove_diacritics(s: &str) -> String {
    let is_mark = |c: char| matches!(general_category(c).group(), Group::Mark);
    let bases = nfd(s.chars()).filter(|&c| !is_mark(c));
    crate::unicode::nfc(bases).collect()
}
/// A set of literal string-replacement rules applied with longest-source-first
/// matching.
#[derive(Debug, Clone)]
pub struct Transform {
    /// `(source, replacement)` pairs, ordered by descending source length
    /// (counted in `char`s).
    rules: alloc::vec::Vec<(String, String)>,
}

impl Transform {
    /// Parses a rule list of the form `"src > dst; src > dst; ..."`.
    ///
    /// Rules are separated by `;` and each rule is split at its first `>`.
    /// Both sides are trimmed. Entries without a `>` or with an empty source
    /// are silently skipped; an empty replacement is allowed.
    ///
    /// Returns `None` when no usable rule remains.
    #[must_use]
    pub fn parse(rules: &str) -> Option<Transform> {
        let mut table: alloc::vec::Vec<(String, String)> = rules
            .split(';')
            .filter_map(|rule| rule.split_once('>'))
            .map(|(lhs, rhs)| (lhs.trim(), rhs.trim()))
            .filter(|(lhs, _)| !lhs.is_empty())
            .map(|(lhs, rhs)| (String::from(lhs), String::from(rhs)))
            .collect();
        if table.is_empty() {
            return None;
        }
        // Stable sort, longest source first, so `apply` prefers the longest
        // match and ties keep their order of appearance in the rule text.
        table.sort_by(|a, b| b.0.chars().count().cmp(&a.0.chars().count()));
        Some(Transform { rules: table })
    }

    /// Rewrites `s` from left to right.
    ///
    /// At each position the first rule (in stored order) whose source is a
    /// prefix of the remaining input is applied and its source is consumed;
    /// if no rule matches, one character is copied through unchanged.
    #[must_use]
    pub fn apply(&self, s: &str) -> String {
        let mut out = String::with_capacity(s.len());
        let mut rest = s;
        while let Some(first) = rest.chars().next() {
            let hit = self
                .rules
                .iter()
                .find_map(|(src, dst)| rest.strip_prefix(src.as_str()).map(|tail| (dst, tail)));
            match hit {
                Some((dst, tail)) => {
                    out.push_str(dst);
                    rest = tail;
                }
                None => {
                    out.push(first);
                    rest = &rest[first.len_utf8()..];
                }
            }
        }
        out
    }
}
/// Chains the three transliterators in this module: [`cyrillic_to_latin`],
/// then [`greek_to_latin`], then [`latin_ascii`].
///
/// Characters that none of the three stages has a mapping for are passed
/// through unchanged, so the result is not guaranteed to be pure ASCII.
#[must_use]
pub fn any_ascii(s: &str) -> String {
    let latin = cyrillic_to_latin(s);
    let latin = greek_to_latin(&latin);
    latin_ascii(&latin)
}
/// Latin replacement for a lowercase Cyrillic letter, or `None` when `c` is
/// not in the table.
fn cyrillic_letter(c: char) -> Option<&'static str> {
    Some(match c {
        'а' => "a",
        'б' => "b",
        'в' => "v",
        'г' => "g",
        'д' => "d",
        'е' => "e",
        'ё' => "ë",
        'ж' => "ž",
        'з' => "z",
        'и' => "i",
        'й' => "j",
        'к' => "k",
        'л' => "l",
        'м' => "m",
        'н' => "n",
        'о' => "o",
        'п' => "p",
        'р' => "r",
        'с' => "s",
        'т' => "t",
        'у' => "u",
        'ф' => "f",
        'х' => "h",
        'ц' => "c",
        'ч' => "č",
        'ш' => "š",
        'щ' => "ŝ",
        'ъ' => "ʺ",
        'ы' => "y",
        'ь' => "ʹ",
        'э' => "è",
        'ю' => "û",
        'я' => "â",
        'і' => "ì",
        'ї' => "ï",
        'є' => "ê",
        'ґ' => "g",
        'ђ' => "đ",
        'ј' => "j",
        'љ' => "lj",
        'њ' => "nj",
        'ћ' => "ć",
        'џ' => "dž",
        'ѕ' => "ẑ",
        _ => return None,
    })
}

/// Transliterates Cyrillic letters in `s` to Latin; every other character is
/// copied through unchanged.
///
/// Lowercase letters are replaced from a fixed table. A capital letter is
/// replaced by the cased form of its lowercase counterpart's replacement:
///
/// * if the character immediately before or after it in `s` is uppercase,
///   the whole replacement is uppercased (`ЉУБАВ` → `LJUBAV`);
/// * otherwise only the first letter is uppercased (`Љубав` → `Ljubav`).
///
/// This is the same neighbour rule `greek_to_latin` uses, and it only matters
/// for letters whose replacement is a digraph (`Љ`, `Њ`, `Џ`).
#[must_use]
pub fn cyrillic_to_latin(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut prev_upper = false;
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if let Some(latin) = cyrillic_letter(c) {
            out.push_str(latin);
        } else if let Some(latin) = c.to_lowercase().next().and_then(cyrillic_letter) {
            // `c` is the capital form of a table entry.
            let next_upper = chars.peek().is_some_and(|n| n.is_uppercase());
            let mut letters = latin.chars();
            if let Some(first) = letters.next() {
                out.extend(first.to_uppercase());
                if prev_upper || next_upper {
                    out.extend(letters.flat_map(char::to_uppercase));
                } else {
                    out.push_str(letters.as_str());
                }
            }
        } else {
            out.push(c);
        }
        prev_upper = c.is_uppercase();
    }
    out
}
/// Latin replacement for a lowercase Greek letter, or `None` when `c` is not
/// one of the letters in the table (capital letters are not in the table).
fn greek_letter(c: char) -> Option<&'static str> {
    let latin = match c {
        'α' => "a",
        'β' => "v",
        'γ' => "g",
        'δ' => "d",
        'ε' => "e",
        'ζ' => "z",
        'η' | 'ι' => "i",
        'θ' => "th",
        'κ' => "k",
        'λ' => "l",
        'μ' => "m",
        'ν' => "n",
        'ξ' => "x",
        'ο' | 'ω' => "o",
        'π' => "p",
        'ρ' => "r",
        // Medial and final sigma share one replacement.
        'σ' | 'ς' => "s",
        'τ' => "t",
        'υ' => "y",
        'φ' => "f",
        'χ' => "ch",
        'ψ' => "ps",
        _ => return None,
    };
    Some(latin)
}
#[must_use]
pub fn greek_to_latin(s: &str) -> String {
let chars: alloc::vec::Vec<char> = nfd(s.chars())
.filter(|&c| !matches!(general_category(c).group(), Group::Mark))
.collect();
let mut out = String::with_capacity(s.len());
for (i, &c) in chars.iter().enumerate() {
if let Some(latin) = greek_letter(c) {
out.push_str(latin);
} else if c.is_uppercase() && greek_letter(c.to_lowercase().next().unwrap_or(c)).is_some() {
let latin = greek_letter(c.to_lowercase().next().unwrap()).unwrap();
let all_caps = chars.get(i + 1).is_some_and(|n| n.is_uppercase())
|| (i > 0 && chars[i - 1].is_uppercase());
if all_caps {
out.extend(latin.chars().flat_map(char::to_uppercase));
} else {
let mut it = latin.chars();
if let Some(f) = it.next() {
out.extend(f.to_uppercase());
out.push_str(it.as_str());
}
}
} else {
out.push(c);
}
}
out
}
/// Folds Latin letters and common typographic punctuation to ASCII.
///
/// The input is passed through [`nfd`] and every character of the `Mark`
/// general-category group is dropped. The remaining characters are then
/// replaced from a fixed table (stroked letters, ligatures, curly quotes,
/// dashes, ellipsis, special spaces, guillemets). ASCII characters and any
/// character without a table entry are copied through unchanged, so the
/// result may still contain non-ASCII characters.
#[must_use]
pub fn latin_ascii(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let bases = nfd(s.chars()).filter(|&c| !matches!(general_category(c).group(), Group::Mark));
    for ch in bases {
        let replacement = match ch {
            // Letters with strokes and other single-letter folds.
            'Ø' => "O",
            'ø' => "o",
            'Đ' | 'Ð' => "D",
            'đ' | 'ð' => "d",
            'Ł' => "L",
            'ł' => "l",
            'Ħ' => "H",
            'ħ' => "h",
            'Ŧ' => "T",
            'ŧ' => "t",
            'ı' => "i",
            'İ' => "I",
            'ʼn' => "n",
            // Letters and ligatures that expand to two ASCII letters.
            'Þ' => "Th",
            'þ' => "th",
            'Æ' => "AE",
            'æ' => "ae",
            'Œ' => "OE",
            'œ' => "oe",
            'ß' => "ss",
            'ẞ' => "SS",
            'Ŋ' => "NG",
            'ŋ' => "ng",
            'IJ' => "IJ",
            'ij' => "ij",
            // Typographic punctuation and spaces.
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{2032}' => "'",
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{2033}' => "\"",
            '\u{2013}' | '\u{2014}' | '\u{2212}' => "-",
            '\u{2026}' => "...",
            '\u{00A0}' | '\u{2007}' | '\u{2009}' | '\u{202F}' => " ",
            '\u{00AB}' | '\u{00BB}' => "\"",
            // ASCII and everything unmapped is kept as-is.
            _ => {
                out.push(ch);
                continue;
            }
        };
        out.push_str(replacement);
    }
    out
}