/// Transliterates Ukrainian Cyrillic `text` into Latin script using the
/// letter table of Cabinet of Ministers of Ukraine resolution No. 55 (2010).
///
/// Rules applied:
/// - `є`, `ї`, `й`, `ю`, `я` become `ye`, `yi`, `y`, `yu`, `ya` at the start
///   of a word and `ie`, `i`, `i`, `iu`, `ia` everywhere else;
/// - the pair `зг` becomes `zgh` wherever it occurs (`Розгон` → `Rozghon`),
///   so it cannot be confused with `zh`, the rendering of `ж`;
/// - the soft sign and the apostrophes `'` and `’` are dropped;
/// - any character without a mapping is copied to the output unchanged.
///
/// The case of each Cyrillic letter is carried over to its Latin rendering
/// through `letter_case` and `push_cased`.
pub fn transliterate_kmu55(text: &str) -> String {
    // `щ` (2 bytes) -> `shch` (4 bytes) is the widest expansion in the table.
    let mut output = String::with_capacity(text.len() * 2);
    let mut index = 0usize;
    while index < text.len() {
        let mut upcoming = text[index..].chars();
        let ch = match upcoming.next() {
            Some(ch) => ch,
            None => break,
        };
        let ch_len = ch.len_utf8();

        // The `зг` digraph is handled before the single-letter table, at any
        // position in the word. Each half keeps its own case, which yields
        // `ZGH`, `Zgh`, `zgh` (and `zGH` for the odd mixed-case input).
        if matches!(ch, 'З' | 'з') {
            if let Some(second) = upcoming.next() {
                if matches!(second, 'Г' | 'г') {
                    output.push(if ch == 'З' { 'Z' } else { 'z' });
                    output.push_str(if second == 'Г' { "GH" } else { "gh" });
                    index += ch_len + second.len_utf8();
                    continue;
                }
            }
        }

        let case = letter_case(text, index, ch);
        let mapped: Option<&'static str> = {
            // Picks the word-initial or the in-word rendering of an iotated
            // letter; the word-boundary scan only runs for those letters.
            let by_position = |initial: &'static str, medial: &'static str| -> &'static str {
                if is_word_start(text, index) {
                    initial
                } else {
                    medial
                }
            };
            match ch {
                'А' | 'а' => Some("a"),
                'Б' | 'б' => Some("b"),
                'В' | 'в' => Some("v"),
                'Г' | 'г' => Some("h"),
                'Ґ' | 'ґ' => Some("g"),
                'Д' | 'д' => Some("d"),
                'Е' | 'е' => Some("e"),
                'Є' | 'є' => Some(by_position("ye", "ie")),
                'Ж' | 'ж' => Some("zh"),
                'З' | 'з' => Some("z"),
                'И' | 'и' => Some("y"),
                'І' | 'і' => Some("i"),
                'Ї' | 'ї' => Some(by_position("yi", "i")),
                'Й' | 'й' => Some(by_position("y", "i")),
                'К' | 'к' => Some("k"),
                'Л' | 'л' => Some("l"),
                'М' | 'м' => Some("m"),
                'Н' | 'н' => Some("n"),
                'О' | 'о' => Some("o"),
                'П' | 'п' => Some("p"),
                'Р' | 'р' => Some("r"),
                'С' | 'с' => Some("s"),
                'Т' | 'т' => Some("t"),
                'У' | 'у' => Some("u"),
                'Ф' | 'ф' => Some("f"),
                'Х' | 'х' => Some("kh"),
                'Ц' | 'ц' => Some("ts"),
                'Ч' | 'ч' => Some("ch"),
                'Ш' | 'ш' => Some("sh"),
                'Щ' | 'щ' => Some("shch"),
                'Ю' | 'ю' => Some(by_position("yu", "iu")),
                'Я' | 'я' => Some(by_position("ya", "ia")),
                // The soft sign and apostrophes have no Latin counterpart.
                'Ь' | 'ь' => Some(""),
                '\'' | '’' => Some(""),
                _ => None,
            }
        };

        match mapped {
            Some(base) => push_cased(&mut output, base, case),
            // Not a Ukrainian letter: pass it through untouched.
            None => output.push(ch),
        }
        index += ch_len;
    }
    output
}
/// How the Latin rendering of a single Cyrillic letter should be cased.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LetterCase {
    /// Every character of the rendering stays lowercase.
    Lower,
    /// Only the first character of the rendering is uppercased.
    Capitalized,
    /// Every character of the rendering is uppercased.
    Upper,
}
fn letter_case(text: &str, idx: usize, ch: char) -> LetterCase {
if !ch.is_uppercase() {
return LetterCase::Lower;
}
let is_upper_word = next_letter_is_uppercase(text, idx + ch.len_utf8());
if is_upper_word {
LetterCase::Upper
} else {
LetterCase::Capitalized
}
}
/// Appends `base` to `output`, re-cased as requested by `case`.
///
/// `Lower` appends `base` verbatim, `Upper` uppercases every character, and
/// `Capitalized` uppercases the first character and lowercases the rest.
fn push_cased(output: &mut String, base: &str, case: LetterCase) {
    match case {
        LetterCase::Lower => output.push_str(base),
        LetterCase::Upper => output.extend(base.chars().flat_map(char::to_uppercase)),
        LetterCase::Capitalized => {
            let mut letters = base.chars();
            if let Some(head) = letters.next() {
                output.extend(head.to_uppercase());
            }
            output.extend(letters.flat_map(char::to_lowercase));
        }
    }
}
/// Reports whether byte offset `idx` of `text` is the beginning of a word.
///
/// Apostrophes (`'` and `’`) directly before `idx` are skipped. The position
/// counts as a word start when nothing else precedes it, or when the nearest
/// remaining character is a word separator or any other non-alphabetic
/// character.
fn is_word_start(text: &str, idx: usize) -> bool {
    let nearest = text[..idx]
        .chars()
        .rev()
        .find(|&c| !matches!(c, '\'' | '’'));
    match nearest {
        // Start of text, or only apostrophes before this position.
        None => true,
        Some(c) => is_word_separator(c) || !c.is_alphabetic(),
    }
}
/// Returns `true` when `ch` separates words: any whitespace character, or
/// one of the dash, punctuation, bracket, quote and slash characters below.
fn is_word_separator(ch: char) -> bool {
    const PUNCTUATION: &str = "-–—.,:;!?()[]{}\"«»/\\";
    if ch.is_whitespace() {
        return true;
    }
    PUNCTUATION.contains(ch)
}
/// Looks forward from byte offset `idx` of `text` for the next letter of the
/// current word and reports whether it is uppercase.
///
/// Apostrophes and other non-alphabetic, non-separator characters are
/// skipped. Returns `false` when a word separator or the end of `text` is
/// reached before any letter.
fn next_letter_is_uppercase(text: &str, idx: usize) -> bool {
    if idx >= text.len() {
        return false;
    }
    text[idx..]
        .chars()
        .take_while(|&c| !is_word_separator(c))
        .find(|c| c.is_alphabetic())
        .map_or(false, char::is_uppercase)
}