// ukraine 1.1.0 - Docs.rs

/// Transliterates Ukrainian Cyrillic text to Latin according to KMU №55.
///
/// Implements the Ukrainian national transliteration system approved by
/// Cabinet of Ministers resolution №55 (2010):
/// <https://www.kmu.gov.ua/npas/243262567>
///
/// Rules applied by this function:
///
/// * The digraph "зг" is rendered as "zgh" in every position of a word, so
///   that it cannot be confused with "zh", the rendering of "ж".
/// * "є", "ї", "й", "ю" and "я" use one spelling at the start of a word
///   ("ye", "yi", "y", "yu", "ya") and another elsewhere ("ie", "i", "i",
///   "iu", "ia").
/// * The soft sign and the apostrophes `'` and `’` are dropped.
/// * Any character without a mapping is copied to the output unchanged.
///
/// The letter case of every replacement is chosen per letter by
/// `letter_case` and applied by `push_cased`.
///
/// # Examples
/// ```rust
/// use ukraine::latin::transliterate_kmu55;
/// # fn main() {
/// let original = "Слава Україні. Героям слава!";
/// let transliterated = transliterate_kmu55(original);
/// assert_eq!(transliterated, "Slava Ukraini. Heroiam slava!");
/// # }
/// ```
pub fn transliterate_kmu55(text: &str) -> String {
    // Supported casings of the "зг" digraph with their Latin spellings.
    // The resolution's own examples are "Згорани — Zghorany" and
    // "Розгон — Rozghon", i.e. the rule is not limited to the word start.
    const ZGH_DIGRAPHS: [(&str, &str); 3] = [("ЗГ", "ZGH"), ("Зг", "Zgh"), ("зг", "zgh")];

    let mut output = String::with_capacity(text.len() * 2);
    let mut index = 0usize;

    // `index` always sits on a char boundary and never exceeds `text.len()`,
    // so the slice is valid and becomes empty exactly at the end of input.
    while let Some(ch) = text[index..].chars().next() {
        let slice = &text[index..];

        let digraph = ZGH_DIGRAPHS
            .iter()
            .find(|&&(cyrillic, _)| slice.starts_with(cyrillic));
        if let Some(&(cyrillic, latin)) = digraph {
            output.push_str(latin);
            // Advance by the digraph's actual UTF-8 length.
            index += cyrillic.len();
            continue;
        }

        let case = letter_case(text, index, ch);

        // Picks the word-initial or the in-word spelling of a letter.
        let positional = |initial: &'static str, medial: &'static str| -> &'static str {
            if is_word_start(text, index) {
                initial
            } else {
                medial
            }
        };

        let mapped = match ch {
            'А' | 'а' => Some("a"),
            'Б' | 'б' => Some("b"),
            'В' | 'в' => Some("v"),
            'Г' | 'г' => Some("h"),
            'Ґ' | 'ґ' => Some("g"),
            'Д' | 'д' => Some("d"),
            'Е' | 'е' => Some("e"),
            'Є' | 'є' => Some(positional("ye", "ie")),
            'Ж' | 'ж' => Some("zh"),
            'З' | 'з' => Some("z"),
            'И' | 'и' => Some("y"),
            'І' | 'і' => Some("i"),
            'Ї' | 'ї' => Some(positional("yi", "i")),
            'Й' | 'й' => Some(positional("y", "i")),
            'К' | 'к' => Some("k"),
            'Л' | 'л' => Some("l"),
            'М' | 'м' => Some("m"),
            'Н' | 'н' => Some("n"),
            'О' | 'о' => Some("o"),
            'П' | 'п' => Some("p"),
            'Р' | 'р' => Some("r"),
            'С' | 'с' => Some("s"),
            'Т' | 'т' => Some("t"),
            'У' | 'у' => Some("u"),
            'Ф' | 'ф' => Some("f"),
            'Х' | 'х' => Some("kh"),
            'Ц' | 'ц' => Some("ts"),
            'Ч' | 'ч' => Some("ch"),
            'Ш' | 'ш' => Some("sh"),
            'Щ' | 'щ' => Some("shch"),
            'Ю' | 'ю' => Some(positional("yu", "iu")),
            'Я' | 'я' => Some(positional("ya", "ia")),
            // The soft sign and apostrophes have no Latin counterpart.
            'Ь' | 'ь' => Some(""),
            '\'' | '’' => Some(""),
            _ => None,
        };

        match mapped {
            Some(base) => push_cased(&mut output, base, case),
            // Not a Ukrainian letter: pass it through untouched.
            None => output.push(ch),
        }

        index += ch.len_utf8();
    }

    output
}

/// Casing to apply to the Latin replacement of a single Cyrillic letter.
#[derive(Copy, Clone)]
enum LetterCase {
    /// Emit the replacement exactly as given (the mapping table is lowercase).
    Lower,
    /// Uppercase the first character of the replacement, lowercase the rest.
    Capitalized,
    /// Uppercase every character of the replacement.
    Upper,
}

fn letter_case(text: &str, idx: usize, ch: char) -> LetterCase {
    if !ch.is_uppercase() {
        return LetterCase::Lower;
    }

    let is_upper_word = next_letter_is_uppercase(text, idx + ch.len_utf8());
    if is_upper_word {
        LetterCase::Upper
    } else {
        LetterCase::Capitalized
    }
}

/// Appends `base` to `output`, re-cased according to `case`.
///
/// `Lower` appends `base` verbatim, `Upper` uppercases every character, and
/// `Capitalized` uppercases the first character and lowercases the rest.
fn push_cased(output: &mut String, base: &str, case: LetterCase) {
    match case {
        LetterCase::Lower => output.push_str(base),
        LetterCase::Upper => output.extend(base.chars().flat_map(char::to_uppercase)),
        LetterCase::Capitalized => {
            let mut tail = base.chars();
            // An empty replacement (soft sign, apostrophe) appends nothing.
            if let Some(head) = tail.next() {
                output.extend(head.to_uppercase());
            }
            output.extend(tail.flat_map(char::to_lowercase));
        }
    }
}

/// Reports whether the character at byte offset `idx` of `text` begins a word.
///
/// Apostrophes (`'` and `’`) directly before `idx` are skipped. The position
/// counts as a word start when nothing else precedes it, or when the nearest
/// remaining character is a word separator or any other non-alphabetic
/// character. It is not a word start when that character is alphabetic.
fn is_word_start(text: &str, idx: usize) -> bool {
    text[..idx]
        .chars()
        .rev()
        .find(|&prev| !matches!(prev, '\'' | '’'))
        .map_or(true, |prev| is_word_separator(prev) || !prev.is_alphabetic())
}

/// Returns `true` for characters that delimit words: any Unicode whitespace
/// plus the dashes, punctuation marks, brackets, quotes and slashes listed
/// below.
fn is_word_separator(ch: char) -> bool {
    const PUNCTUATION: &[char] = &[
        '-', '–', '—', '.', ',', ':', ';', '!', '?', '(', ')', '[', ']', '{', '}', '"', '«', '»',
        '/', '\\',
    ];

    ch.is_whitespace() || PUNCTUATION.contains(&ch)
}

/// Reports whether the next letter of the current word, searched from byte
/// offset `idx` of `text`, is uppercase.
///
/// Apostrophes and other non-letter, non-separator characters are skipped.
/// Returns `false` when a word separator or the end of `text` is reached
/// before any letter, or when `idx` is at or past the end of `text`.
fn next_letter_is_uppercase(text: &str, idx: usize) -> bool {
    if idx >= text.len() {
        return false;
    }

    text[idx..]
        .chars()
        .filter(|&c| !matches!(c, '\'' | '’'))
        // The first letter or separator decides the answer.
        .find(|&c| c.is_alphabetic() || is_word_separator(c))
        .map_or(false, |c| c.is_alphabetic() && c.is_uppercase())
}