pinyin-converter 0.1.0

Fast, dictionary-backed Chinese to Pinyin conversion for Rust and the command line.
Documentation
use crate::{ToneStyle, YuStyle};

/// Formats every whitespace-separated syllable of `phrase` with [`format_syllable`].
///
/// The formatted syllables are joined with single spaces, so leading, trailing and
/// repeated whitespace in `phrase` does not appear in the result.
pub(crate) fn format_phrase(phrase: &str, tone_style: ToneStyle, yu_style: YuStyle) -> String {
    let mut formatted = String::with_capacity(phrase.len());
    for (index, syllable) in phrase.split_whitespace().enumerate() {
        // Separator goes between syllables only, never before the first one.
        if index > 0 {
            formatted.push(' ');
        }
        formatted.push_str(&format_syllable(syllable, tone_style, yu_style));
    }
    formatted
}

/// Formats one tone-marked pinyin syllable according to `tone_style` and `yu_style`.
///
/// * `ToneStyle::Mark` keeps the tone marks and only rewrites the `ü` letters as
///   requested by `yu_style`.
/// * `ToneStyle::Number` strips the tone marks and appends the tone digit when the
///   tone is in `1..=4`; any other tone value appends nothing.
/// * `ToneStyle::None` strips the tone marks and appends nothing.
pub(crate) fn format_syllable(syllable: &str, tone_style: ToneStyle, yu_style: YuStyle) -> String {
    match tone_style {
        // Going through `normalize` here would discard the tone marks and make this
        // style indistinguishable from `ToneStyle::None`.
        ToneStyle::Mark => render_marked(syllable, yu_style),
        ToneStyle::Number => {
            let (base, tone) = normalize(syllable);
            let mut rendered = render_base(&base, yu_style);
            if (1..=4).contains(&tone) {
                rendered.push(char::from_digit(u32::from(tone), 10).expect("tone is a digit"));
            }
            rendered
        }
        ToneStyle::None => render_base(&normalize(syllable).0, yu_style),
    }
}

/// Rewrites the `ü` letters of a tone-marked syllable for `yu_style`, leaving every
/// other character, including its tone mark, untouched.
///
/// For `YuStyle::U` and `YuStyle::Yu` a tone mark on `ü` is carried over to the `u`
/// that replaces it. For `YuStyle::V` the mark is dropped because `v` has no
/// tone-marked form.
fn render_marked(syllable: &str, yu_style: YuStyle) -> String {
    if yu_style == YuStyle::Umlaut {
        return syllable.to_string();
    }

    let mut rendered = String::with_capacity(syllable.len());
    for ch in syllable.chars() {
        // `marked_u` is a plain `u` carrying the same tone mark as `ch`.
        let marked_u = match ch {
            'ü' => 'u',
            'ǖ' => 'ū',
            'ǘ' => 'ú',
            'ǚ' => 'ǔ',
            'ǜ' => 'ù',
            _ => {
                rendered.push(ch);
                continue;
            }
        };
        match yu_style {
            YuStyle::Umlaut => rendered.push(ch),
            YuStyle::V => rendered.push('v'),
            YuStyle::Yu => {
                rendered.push('y');
                rendered.push(marked_u);
            }
            YuStyle::U => rendered.push(marked_u),
        }
    }
    rendered
}

/// Returns the first whitespace-delimited token of `phrase`.
///
/// When `phrase` contains no token at all (it is empty or consists only of
/// whitespace), `phrase` itself is returned unchanged.
pub(crate) fn first_pronunciation(phrase: &str) -> &str {
    match phrase.split_whitespace().next() {
        Some(first) => first,
        None => phrase,
    }
}

/// Splits `phrase` on whitespace and returns the non-empty pieces in order.
///
/// The returned slices borrow from `phrase`; an empty or all-whitespace phrase
/// yields an empty vector.
pub(crate) fn split_phrase(phrase: &str) -> Vec<&str> {
    let mut syllables = Vec::new();
    syllables.extend(phrase.split_whitespace());
    syllables
}

/// Reduces `token` to the text used when building initials.
///
/// The token is formatted without tones and with `ü` written as `v`, then stripped
/// of everything that is not an ASCII letter or digit. If any digit remains, the
/// whole stripped text is returned; otherwise only its first character is returned
/// (or an empty string when nothing is left).
pub(crate) fn initials_token(token: &str) -> String {
    let cleaned: String = format_phrase(token, ToneStyle::None, YuStyle::V)
        .chars()
        .filter(char::is_ascii_alphanumeric)
        .collect();

    // `cleaned` is pure ASCII at this point, so scanning bytes is equivalent to
    // scanning chars.
    let has_digit = cleaned.bytes().any(|byte| byte.is_ascii_digit());
    if has_digit {
        cleaned
    } else {
        cleaned.chars().take(1).collect()
    }
}

/// Turns `token` into a lowercase ASCII slug fragment.
///
/// The token is formatted without tones and with `ü` written as `v`; every character
/// that is not an ASCII letter or digit is then dropped and the rest is lowercased.
pub(crate) fn slug_token(token: &str) -> String {
    let plain = format_phrase(token, ToneStyle::None, YuStyle::V);
    let mut slug = String::with_capacity(plain.len());
    for ch in plain.chars() {
        if ch.is_ascii_alphanumeric() {
            slug.push(ch.to_ascii_lowercase());
        }
    }
    slug
}

/// Splits a tone-marked pinyin syllable into its unmarked base and its tone number.
///
/// Returns `(base, tone)`: `base` is `syllable` with every recognised tone mark
/// removed (`ü` and `ê` keep their own diacritic), and `tone` is 1–4 taken from the
/// last tone mark found, or 5 when the syllable carries none.
///
/// Besides the tone-marked vowels, the syllabic nasals (`ń`, `ň`, `ǹ`, `ḿ`), the
/// precomposed `ê` forms (`ế`, `ề`) and the four combining tone marks are recognised.
/// The combining marks are needed for syllables such as `m̄`, `m̀`, `ê̄` and `ê̌`, which
/// have no precomposed spelling. Characters that are not recognised are copied to
/// `base` unchanged.
fn normalize(syllable: &str) -> (String, u8) {
    let mut tone = 5;
    let mut base = String::with_capacity(syllable.len());

    for ch in syllable.chars() {
        // `plain` is the letter to keep, or `None` for a bare combining mark that
        // only contributes a tone.
        let (plain, mark_tone) = match ch {
            'ā' => (Some('a'), 1),
            'á' => (Some('a'), 2),
            'ǎ' => (Some('a'), 3),
            'à' => (Some('a'), 4),
            'ē' => (Some('e'), 1),
            'é' => (Some('e'), 2),
            'ě' => (Some('e'), 3),
            'è' => (Some('e'), 4),
            'ī' => (Some('i'), 1),
            'í' => (Some('i'), 2),
            'ǐ' => (Some('i'), 3),
            'ì' => (Some('i'), 4),
            'ō' => (Some('o'), 1),
            'ó' => (Some('o'), 2),
            'ǒ' => (Some('o'), 3),
            'ò' => (Some('o'), 4),
            'ū' => (Some('u'), 1),
            'ú' => (Some('u'), 2),
            'ǔ' => (Some('u'), 3),
            'ù' => (Some('u'), 4),
            'ǖ' => (Some('ü'), 1),
            'ǘ' => (Some('ü'), 2),
            'ǚ' => (Some('ü'), 3),
            'ǜ' => (Some('ü'), 4),
            // Syllabic nasals.
            '\u{0144}' => (Some('n'), 2), // n with acute
            '\u{0148}' => (Some('n'), 3), // n with caron
            '\u{01F9}' => (Some('n'), 4), // n with grave
            '\u{1E3F}' => (Some('m'), 2), // m with acute
            // Precomposed tone-marked ê.
            '\u{1EBF}' => (Some('\u{00EA}'), 2), // ê with acute
            '\u{1EC1}' => (Some('\u{00EA}'), 4), // ê with grave
            // Combining tone marks: macron, acute, caron, grave.
            '\u{0304}' => (None, 1),
            '\u{0301}' => (None, 2),
            '\u{030C}' => (None, 3),
            '\u{0300}' => (None, 4),
            _ => {
                base.push(ch);
                continue;
            }
        };
        tone = mark_tone;
        base.extend(plain);
    }

    (base, tone)
}

/// Rewrites every `ü` in `base` the way `yu_style` asks for.
///
/// `Umlaut` returns the text as it is; `V`, `Yu` and `U` replace each `ü` with
/// `v`, `yu` and `u` respectively.
fn render_base(base: &str, yu_style: YuStyle) -> String {
    let replacement = match yu_style {
        // Nothing to substitute, so hand back a plain copy.
        YuStyle::Umlaut => return base.to_string(),
        YuStyle::V => "v",
        YuStyle::Yu => "yu",
        YuStyle::U => "u",
    };
    base.replace('ü', replacement)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tone digits go after the whole syllable, and `ü` is written as `v`.
    #[test]
    fn formats_tone_numbers_at_the_end() {
        assert_eq!(
            format_syllable("hǎo", ToneStyle::Number, YuStyle::V),
            "hao3"
        );
        assert_eq!(format_syllable("lǚ", ToneStyle::Number, YuStyle::V), "lv3");
    }

    /// Mark style with the umlaut spelling returns the syllable untouched.
    #[test]
    fn keeps_umlaut_for_mark_style_by_default() {
        assert_eq!(
            format_syllable("lǚ", ToneStyle::Mark, YuStyle::Umlaut),
            "lǚ"
        );
        assert_eq!(format_syllable("lǚ", ToneStyle::None, YuStyle::Yu), "lyu");
    }

    /// Slugs are toneless, ASCII-only and lowercase.
    #[test]
    fn builds_slug_tokens() {
        assert_eq!(slug_token("lǚ"), "lv");
        assert_eq!(slug_token("Hello!"), "hello");
    }
}