use crate::unicode::{general_category, nfd, Group};
use alloc::string::String;
/// Strips combining marks from `s`.
///
/// The input is canonically decomposed, every character whose general
/// category belongs to the `Mark` group is dropped, and the remainder is
/// recomposed before being collected into a new `String`.
#[must_use]
pub fn remove_diacritics(s: &str) -> String {
    /// True when `c` belongs to the `Mark` general-category group.
    fn is_mark(c: char) -> bool {
        matches!(general_category(c).group(), Group::Mark)
    }

    let base_chars = nfd(s.chars()).filter(|&c| !is_mark(c));
    crate::unicode::nfc(base_chars).collect()
}
/// One parsed rewrite rule of a [`Transform`].
///
/// Built by `Transform::parse` from text shaped like
/// `before { source } after > target`, where the braces and both context
/// parts are optional.
#[derive(Debug, Clone)]
struct Rule {
// Text the already-produced output must end with for the rule to fire
// (empty means no constraint).
before: String,
// Source pattern text: either a literal string or a bracketed set such as
// `[a-z]` (stored without its trailing quantifier).
source: String,
// Text that must immediately follow the match in the remaining input
// (empty means no constraint).
after: String,
// Replacement text; every `$0` in it is replaced by the matched input.
target: String,
// Inclusive character ranges when `source` is a bracketed set, `None` when
// `source` is a literal.
set: Option<alloc::vec::Vec<(char, char)>>,
// Quantifier character (`+`, `*` or `?`) that followed a bracketed set, if
// any.
quantifier: Option<char>,
}
/// Parses a bracketed character set such as `[a-z0-9_]` into inclusive
/// `(low, high)` ranges.
///
/// Returns `None` unless `s` starts with `[` and ends with `]`. Inside the
/// brackets, whitespace in item position is skipped, `x-y` yields the range
/// `(x, y)`, and any other character yields the single-character range
/// `(c, c)`. There is no escape or negation syntax.
fn parse_set(s: &str) -> Option<alloc::vec::Vec<(char, char)>> {
    let body = s.strip_prefix('[')?.strip_suffix(']')?;
    let items: alloc::vec::Vec<char> = body.chars().collect();
    let mut ranges = alloc::vec::Vec::new();
    let mut pending: &[char] = &items;
    loop {
        pending = match pending {
            [] => break,
            // Whitespace between items is insignificant.
            [c, tail @ ..] if c.is_whitespace() => tail,
            // `lo-hi`: needs a character on both sides of the dash.
            [lo, '-', hi, tail @ ..] => {
                ranges.push((*lo, *hi));
                tail
            }
            // Anything else (including a dangling `-`) stands for itself.
            [c, tail @ ..] => {
                ranges.push((*c, *c));
                tail
            }
        };
    }
    Some(ranges)
}
/// A rule-based text transform: a list of rewrite rules created by
/// `Transform::parse` and executed by `Transform::apply`.
#[derive(Debug, Clone)]
pub struct Transform {
// Parsed rules. `parse` sorts them by descending source-text length and
// `apply` tries them in that order at every input position.
rules: alloc::vec::Vec<Rule>,
}
impl Transform {
    /// Parses a `;`-separated list of rewrite rules.
    ///
    /// Each rule has the shape `before { source } after > target`; the braces
    /// and both context parts are optional. `source` is either a literal
    /// string or a bracketed set (`[a-z]`) optionally followed by one of the
    /// quantifiers `+`, `*` or `?`. Segments without `>` and rules with an
    /// empty source are skipped.
    ///
    /// Returns `None` when no rule could be parsed. The resulting rules are
    /// ordered by descending source-text length (a stable sort, so rules of
    /// equal length keep their written order).
    #[must_use]
    pub fn parse(rules: &str) -> Option<Transform> {
        let mut compiled: alloc::vec::Vec<Rule> = rules
            .split(';')
            .filter_map(|text| {
                let (lhs, target) = text.split_once('>')?;
                // Optional `before {` context.
                let (before, body) = lhs
                    .split_once('{')
                    .map_or(("", lhs), |(ctx, tail)| (ctx.trim(), tail));
                // Optional `} after` context.
                let (source, after) = body
                    .split_once('}')
                    .map_or((body.trim(), ""), |(src, ctx)| (src.trim(), ctx.trim()));
                // A trailing `+`, `*` or `?` only counts as a quantifier when
                // it directly follows a closing bracket.
                let quantified = source
                    .strip_suffix(['+', '*', '?'])
                    .filter(|prefix| prefix.ends_with(']'));
                let (pattern, quantifier) = match quantified {
                    Some(prefix) => (prefix, source.chars().next_back()),
                    None => (source, None),
                };
                if pattern.is_empty() {
                    return None;
                }
                Some(Rule {
                    before: before.into(),
                    source: pattern.into(),
                    after: after.into(),
                    target: target.trim().into(),
                    set: parse_set(pattern),
                    quantifier,
                })
            })
            .collect();

        if compiled.is_empty() {
            return None;
        }
        compiled.sort_by_key(|r| core::cmp::Reverse(r.source.chars().count()));
        Some(Transform { rules: compiled })
    }

    /// Returns the number of bytes at the start of `rest` matched by `rule`'s
    /// source, or `None` when it does not match.
    ///
    /// A match is never empty: a set with `*` or `?` that matches zero
    /// characters is reported as no match.
    fn match_len(rule: &Rule, rest: &str) -> Option<usize> {
        let ranges = match rule.set.as_deref() {
            Some(ranges) => ranges,
            None => {
                // Literal source: plain prefix comparison.
                let literal = rule.source.as_str();
                return if rest.starts_with(literal) {
                    Some(literal.len())
                } else {
                    None
                };
            }
        };
        let member = |c: char| ranges.iter().any(|&(lo, hi)| lo <= c && c <= hi);
        // No quantifier and `?` consume at most one character; `+` and `*`
        // consume as many leading set members as there are.
        let max_chars = match rule.quantifier {
            None | Some('?') => 1,
            Some(_) => usize::MAX,
        };
        let len: usize = rest
            .chars()
            .take(max_chars)
            .take_while(|&c| member(c))
            .map(char::len_utf8)
            .sum();
        (len > 0).then_some(len)
    }

    /// Applies the rules to `s` from left to right and returns the result.
    ///
    /// At every position the first rule (in stored order) whose `before`
    /// context matches the end of the output so far, whose source matches the
    /// remaining input, and whose `after` context follows the match is used:
    /// its target is appended (with `$0` replaced by the matched text) and the
    /// match is consumed. When no rule fires, one character is copied through
    /// unchanged.
    #[must_use]
    pub fn apply(&self, s: &str) -> String {
        let mut out = String::with_capacity(s.len());
        let mut rest = s;
        while let Some(next) = rest.chars().next() {
            let hit = self.rules.iter().find_map(|rule| {
                if !out.ends_with(rule.before.as_str()) {
                    return None;
                }
                let mlen = Self::match_len(rule, rest)?;
                rest[mlen..]
                    .starts_with(rule.after.as_str())
                    .then_some((rule, mlen))
            });
            match hit {
                Some((rule, mlen)) => {
                    let (matched, tail) = rest.split_at(mlen);
                    if rule.target.contains("$0") {
                        out.push_str(&rule.target.replace("$0", matched));
                    } else {
                        out.push_str(&rule.target);
                    }
                    rest = tail;
                }
                None => {
                    out.push(next);
                    rest = &rest[next.len_utf8()..];
                }
            }
        }
        out
    }
}
/// Runs every script transliterator in this module over `s`, in the order
/// Georgian, Armenian, Cyrillic, Greek, and finally the Latin-to-ASCII
/// folding.
///
/// Characters that none of the stages map are passed through unchanged, so
/// the result is not guaranteed to consist solely of ASCII.
#[must_use]
pub fn any_ascii(s: &str) -> String {
    let text = georgian_to_latin(s);
    let text = armenian_to_latin(&text);
    let text = cyrillic_to_latin(&text);
    let text = greek_to_latin(&text);
    latin_ascii(&text)
}
/// Transliterates Cyrillic letters in `s` to Latin letters.
///
/// Lowercase letters are looked up in a fixed table (the accented outputs
/// resemble ISO 9 — NOTE(review): confirm the intended standard). The listed
/// capital letters are transliterated through their lowercase form, and only
/// the first character of the result is uppercased, so `Љ` becomes `Lj`.
/// Every other character, including Cyrillic letters missing from the table,
/// is copied through unchanged.
#[must_use]
pub fn cyrillic_to_latin(s: &str) -> String {
    /// Latin rendering of a lowercase Cyrillic letter, if it is in the table.
    fn lower_to_latin(c: char) -> Option<&'static str> {
        Some(match c {
            'а' => "a",
            'б' => "b",
            'в' => "v",
            'г' => "g",
            'д' => "d",
            'е' => "e",
            'ё' => "ë",
            'ж' => "ž",
            'з' => "z",
            'и' => "i",
            'й' => "j",
            'к' => "k",
            'л' => "l",
            'м' => "m",
            'н' => "n",
            'о' => "o",
            'п' => "p",
            'р' => "r",
            'с' => "s",
            'т' => "t",
            'у' => "u",
            'ф' => "f",
            'х' => "h",
            'ц' => "c",
            'ч' => "č",
            'ш' => "š",
            'щ' => "ŝ",
            'ъ' => "ʺ",
            'ы' => "y",
            'ь' => "ʹ",
            'э' => "è",
            'ю' => "û",
            'я' => "â",
            'і' => "ì",
            'ї' => "ï",
            'є' => "ê",
            'ґ' => "g",
            'ђ' => "đ",
            'ј' => "j",
            'љ' => "lj",
            'њ' => "nj",
            'ћ' => "ć",
            'џ' => "dž",
            'ѕ' => "ẑ",
            _ => return None,
        })
    }

    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        if let Some(latin) = lower_to_latin(c) {
            out.push_str(latin);
            continue;
        }
        // Only these capitals are transliterated; they go through the
        // lowercase table without allocating an intermediate string.
        let is_mapped_capital = matches!(
            c,
            'А'..='Я' | 'Ё' | 'І' | 'Ї' | 'Є' | 'Ґ' | 'Ђ' | 'Ј' | 'Љ' | 'Њ' | 'Ћ' | 'Џ' | 'Ѕ'
        );
        let latin = if is_mapped_capital {
            c.to_lowercase().next().and_then(lower_to_latin)
        } else {
            None
        };
        match latin {
            Some(latin) => {
                // Title-case: uppercase the first character only.
                let mut tail = latin.chars();
                if let Some(first) = tail.next() {
                    out.extend(first.to_uppercase());
                    out.push_str(tail.as_str());
                }
            }
            None => out.push(c),
        }
    }
    out
}
/// Looks up the Latin rendering of a lowercase Greek letter.
///
/// Both forms of sigma map to `s`. Returns `None` for every other character,
/// including capital and accented Greek letters.
fn greek_letter(c: char) -> Option<&'static str> {
    let latin = match c {
        'α' => "a",
        'β' => "v",
        'γ' => "g",
        'δ' => "d",
        'ε' => "e",
        'ζ' => "z",
        'η' | 'ι' => "i",
        'θ' => "th",
        'κ' => "k",
        'λ' => "l",
        'μ' => "m",
        'ν' => "n",
        'ξ' => "x",
        'ο' | 'ω' => "o",
        'π' => "p",
        'ρ' => "r",
        'σ' | 'ς' => "s",
        'τ' => "t",
        'υ' => "y",
        'φ' => "f",
        'χ' => "ch",
        'ψ' => "ps",
        _ => return None,
    };
    Some(latin)
}
#[must_use]
pub fn greek_to_latin(s: &str) -> String {
let chars: alloc::vec::Vec<char> = nfd(s.chars())
.filter(|&c| !matches!(general_category(c).group(), Group::Mark))
.collect();
let mut out = String::with_capacity(s.len());
for (i, &c) in chars.iter().enumerate() {
if let Some(latin) = greek_letter(c) {
out.push_str(latin);
} else if c.is_uppercase() && greek_letter(c.to_lowercase().next().unwrap_or(c)).is_some() {
let latin = greek_letter(c.to_lowercase().next().unwrap()).unwrap();
let all_caps = chars.get(i + 1).is_some_and(|n| n.is_uppercase())
|| (i > 0 && chars[i - 1].is_uppercase());
if all_caps {
out.extend(latin.chars().flat_map(char::to_uppercase));
} else {
let mut it = latin.chars();
if let Some(f) = it.next() {
out.extend(f.to_uppercase());
out.push_str(it.as_str());
}
}
} else {
out.push(c);
}
}
out
}
/// Looks up the Latin rendering of a lowercase Armenian letter (including the
/// `և` ligature, rendered `ev`).
///
/// Returns `None` for every other character, capital Armenian letters
/// included.
fn armenian_letter(c: char) -> Option<&'static str> {
    let latin = match c {
        'ա' => "a",
        'բ' => "b",
        'գ' => "g",
        'դ' => "d",
        'ե' | 'է' | 'ը' => "e",
        'զ' => "z",
        'թ' | 'տ' => "t",
        'ժ' => "zh",
        'ի' => "i",
        'լ' => "l",
        'խ' => "kh",
        'ծ' | 'ց' => "ts",
        'կ' | 'ք' => "k",
        'հ' => "h",
        'ձ' => "dz",
        'ղ' => "gh",
        'ճ' | 'չ' => "ch",
        'մ' => "m",
        'յ' => "y",
        'ն' => "n",
        'շ' => "sh",
        'ո' | 'օ' => "o",
        'պ' | 'փ' => "p",
        'ջ' => "j",
        'ռ' | 'ր' => "r",
        'ս' => "s",
        'վ' => "v",
        'ւ' => "w",
        'ֆ' => "f",
        'և' => "ev",
        _ => return None,
    };
    Some(latin)
}
#[must_use]
pub fn armenian_to_latin(s: &str) -> String {
let mut out = String::with_capacity(s.len());
for c in s.chars() {
if let Some(latin) = armenian_letter(c) {
out.push_str(latin);
} else if let Some(lower) = c
.to_lowercase()
.next()
.filter(|&l| armenian_letter(l).is_some())
{
let latin = armenian_letter(lower).unwrap();
let mut it = latin.chars();
if let Some(f) = it.next() {
out.extend(f.to_uppercase());
out.push_str(it.as_str());
}
} else {
out.push(c);
}
}
out
}
/// Transliterates the Georgian letters listed in the table below to Latin
/// letters.
///
/// Every character that is not in the table is copied through unchanged.
#[must_use]
pub fn georgian_to_latin(s: &str) -> String {
    /// Latin rendering of a Georgian letter, if it is in the table.
    fn romanize(c: char) -> Option<&'static str> {
        let latin = match c {
            'ა' => "a",
            'ბ' => "b",
            'გ' => "g",
            'დ' => "d",
            'ე' => "e",
            'ვ' => "v",
            'ზ' => "z",
            'თ' | 'ტ' => "t",
            'ი' => "i",
            'კ' | 'ქ' => "k",
            'ლ' => "l",
            'მ' => "m",
            'ნ' => "n",
            'ო' => "o",
            'პ' | 'ფ' => "p",
            'ჟ' => "zh",
            'რ' => "r",
            'ს' => "s",
            'უ' => "u",
            'ღ' => "gh",
            'ყ' => "q",
            'შ' => "sh",
            'ჩ' | 'ჭ' => "ch",
            'ც' | 'წ' => "ts",
            'ძ' => "dz",
            'ხ' => "kh",
            'ჯ' => "j",
            'ჰ' => "h",
            _ => return None,
        };
        Some(latin)
    }

    s.chars()
        .fold(String::with_capacity(s.len()), |mut acc, c| {
            match romanize(c) {
                Some(latin) => acc.push_str(latin),
                None => acc.push(c),
            }
            acc
        })
}
/// Folds Latin-script text and common punctuation towards ASCII.
///
/// The input is canonically decomposed and every character in the `Mark`
/// general-category group is dropped, which removes accents from letters that
/// decompose. ASCII characters are kept as they are; a fixed table then
/// replaces letters without a decomposition (`Ø`, `Ł`, `ß`, `Æ`, ...), typographic
/// quotes, dashes, the ellipsis and several non-breaking or narrow spaces with
/// ASCII text. Any other character is copied through unchanged, so the result
/// may still contain non-ASCII characters.
#[must_use]
pub fn latin_ascii(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let base_chars = nfd(s.chars()).filter(|&c| !matches!(general_category(c).group(), Group::Mark));
    for c in base_chars {
        if c.is_ascii() {
            out.push(c);
            continue;
        }
        let ascii: &str = match c {
            // Single-letter replacements.
            'Ø' => "O",
            'ø' => "o",
            'Đ' | 'Ð' => "D",
            'đ' | 'ð' => "d",
            'Ł' => "L",
            'ł' => "l",
            'Ħ' => "H",
            'ħ' => "h",
            'Ŧ' => "T",
            'ŧ' => "t",
            'ı' => "i",
            // NOTE(review): if `nfd` performs standard canonical
            // decomposition, `İ` arrives here as `I` plus a combining mark and
            // this arm is never taken — confirm.
            'İ' => "I",
            'ʼn' => "n",
            // Multi-letter replacements.
            'Þ' => "Th",
            'þ' => "th",
            'Æ' => "AE",
            'æ' => "ae",
            'Œ' => "OE",
            'œ' => "oe",
            'ß' => "ss",
            'ẞ' => "SS",
            'Ŋ' => "NG",
            'ŋ' => "ng",
            'IJ' => "IJ",
            'ij' => "ij",
            // Punctuation and spacing.
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{2032}' => "'",
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{2033}' | '\u{00AB}' | '\u{00BB}' => "\"",
            '\u{2013}' | '\u{2014}' | '\u{2212}' => "-",
            '\u{2026}' => "...",
            '\u{00A0}' | '\u{2007}' | '\u{2009}' | '\u{202F}' => " ",
            // No mapping: keep the character as it is.
            _ => {
                out.push(c);
                continue;
            }
        };
        out.push_str(ascii);
    }
    out
}