//! pinyin-converter 0.1.0
//!
//! Fast, dictionary-backed Chinese to Pinyin conversion for Rust and the command line.
//! See the crate documentation for usage details.
use std::collections::{HashMap, HashSet};

// Dictionary data generated by the build script into OUT_DIR and embedded into
// the binary at compile time. The word and char tables are split into ten
// shards (presumably to keep the generated files manageable — the loader just
// concatenates them).
const WORD_FILES: [&str; 10] = [
    include_str!(concat!(env!("OUT_DIR"), "/data/words_0.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_1.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_2.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_3.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_4.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_5.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_6.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_7.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_8.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/words_9.txt")),
];

// Per-character readings, sharded the same way as WORD_FILES.
const CHAR_FILES: [&str; 10] = [
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_0.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_1.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_2.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_3.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_4.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_5.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_6.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_7.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_8.txt")),
    include_str!(concat!(env!("OUT_DIR"), "/data/chars_9.txt")),
];

// Surname-specific readings (key: value lines, same format as the tables above).
const SURNAMES: &str = include_str!(concat!(env!("OUT_DIR"), "/data/surnames.txt"));
// One heteronymous character per line; only the first char of each line is used.
const HETERONYMS: &str = include_str!(concat!(env!("OUT_DIR"), "/data/heteronyms.txt"));

/// In-memory pinyin dictionary assembled from the build-script-generated data files.
#[derive(Debug, Default)]
pub(crate) struct Lexicon {
    // Word -> space-separated pinyin syllables.
    words: HashMap<String, String>,
    // Single character (stored as a one-char String key) -> reading(s);
    // heteronyms carry multiple space-separated readings in one value.
    chars: HashMap<String, String>,
    // Surname-specific readings; override the default ones in surname mode.
    surnames: HashMap<String, String>,
    // Characters known to have more than one reading.
    heteronyms: HashSet<char>,
}

impl Lexicon {
    /// Builds the lexicon by parsing every embedded data file.
    pub(crate) fn new() -> Self {
        // Only the first character of each heteronym line matters; blank
        // lines are skipped by filter_map.
        let heteronyms = HETERONYMS
            .lines()
            .filter_map(|line| line.chars().next())
            .collect::<HashSet<char>>();

        Self {
            words: load_map(WORD_FILES),
            chars: load_map(CHAR_FILES),
            surnames: load_map([SURNAMES]),
            heteronyms,
        }
    }

    /// Full entry list for default conversion, longest keys first.
    /// Precedence on key collisions (last insert wins): chars < surnames < words.
    pub(crate) fn default_entries(&self) -> Vec<(String, String)> {
        let mut merged = self.chars.clone();

        merged.extend(
            self.surnames
                .iter()
                .map(|(word, pinyin)| (word.clone(), pinyin.clone())),
        );
        merged.extend(
            self.words
                .iter()
                .map(|(word, pinyin)| (word.clone(), pinyin.clone())),
        );

        into_sorted_entries(merged)
    }

    /// Entry list without surname readings: chars overlaid with words.
    pub(crate) fn plain_entries(&self) -> Vec<(String, String)> {
        let mut merged = self.chars.clone();

        merged.extend(
            self.words
                .iter()
                .map(|(word, pinyin)| (word.clone(), pinyin.clone())),
        );

        into_sorted_entries(merged)
    }

    /// Entry list for surname mode. Surnames are inserted last, so their
    /// readings take precedence over both chars and words.
    pub(crate) fn surname_entries(&self) -> Vec<(String, String)> {
        let mut merged = self.chars.clone();

        merged.extend(
            self.words
                .iter()
                .map(|(word, pinyin)| (word.clone(), pinyin.clone())),
        );
        merged.extend(
            self.surnames
                .iter()
                .map(|(word, pinyin)| (word.clone(), pinyin.clone())),
        );

        into_sorted_entries(merged)
    }

    /// Returns the reading(s) for a single character, or `None` if unknown.
    /// Heteronyms come back as one space-separated string.
    pub(crate) fn char_pinyin(&self, ch: char) -> Option<&str> {
        // Encode the char into a stack buffer instead of allocating a String
        // per lookup; HashMap<String, _> supports &str keys via Borrow<str>.
        let mut buf = [0u8; 4];
        self.chars.get(ch.encode_utf8(&mut buf)).map(String::as_str)
    }

    /// Looks up the surname reading for an exact key, or `None` if absent.
    pub(crate) fn surname_pinyin(&self, text: &str) -> Option<&str> {
        self.surnames.get(text).map(|pinyin| pinyin.as_str())
    }

    /// Finds the longest surname that `input` starts with and returns it as a
    /// slice of `input` (not of the stored key), or `None` if no surname matches.
    pub(crate) fn longest_surname_prefix<'a>(&self, input: &'a str) -> Option<&'a str> {
        let mut best_len: Option<usize> = None;
        for surname in self.surnames.keys() {
            if input.starts_with(surname.as_str()) {
                // Track the longest byte length seen; prefix lengths always
                // fall on char boundaries of `input`.
                let len = surname.len();
                best_len = Some(best_len.map_or(len, |current| current.max(len)));
            }
        }
        best_len.map(|len| &input[..len])
    }

    /// Returns the individual readings of a heteronymous character, split on
    /// whitespace. `None` when the character is not a known heteronym or has
    /// no entry in the char table.
    pub(crate) fn heteronyms(&self, ch: char) -> Option<Vec<&str>> {
        if self.heteronyms.contains(&ch) {
            let pinyin = self.char_pinyin(ch)?;
            Some(pinyin.split_whitespace().collect())
        } else {
            None
        }
    }
}

/// Parses every `key: value` line across the given embedded files into one
/// map. Later files overwrite duplicate keys from earlier ones.
fn load_map<const N: usize>(files: [&str; N]) -> HashMap<String, String> {
    let mut map = HashMap::new();
    for file in files {
        map.extend(file.lines().filter_map(parse_line));
    }
    map
}

/// Parses one dictionary row of the form `key: value`.
/// Returns `None` for blank lines, `#` comments, rows without a colon, and
/// rows whose key or value is empty after trimming.
fn parse_line(line: &str) -> Option<(String, String)> {
    let row = line.trim();
    if row.is_empty() || row.starts_with('#') {
        return None;
    }

    // Split on the first colon only; values may themselves contain colons.
    let (raw_key, raw_value) = row.split_once(':')?;
    match (raw_key.trim(), raw_value.trim()) {
        ("", _) | (_, "") => None,
        (key, value) => Some((key.to_owned(), value.to_owned())),
    }
}

/// Converts the merged map into a Vec ordered longest-key-first (by char
/// count, so multi-byte characters count once), with ties broken by ascending
/// key order. Longest-first lets a greedy matcher try whole words before
/// falling back to single characters.
fn into_sorted_entries(map: HashMap<String, String>) -> Vec<(String, String)> {
    let mut entries = map.into_iter().collect::<Vec<_>>();
    // Keys come from a HashMap and are unique, so the comparator is a total
    // order and an unstable sort is safe — and faster, with no allocation.
    entries.sort_unstable_by(|left, right| {
        right
            .0
            .chars()
            .count()
            .cmp(&left.0.chars().count())
            .then_with(|| left.0.cmp(&right.0))
    });
    entries
}

#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn parses_generated_rows() {
        assert_eq!(
            parse_line("中国: zhōng guó"),
            Some(("中国".to_string(), "zhōng guó".to_string()))
        );
        assert_eq!(parse_line("broken"), None);
    }

    #[test]
    fn loads_core_sources() {
        let lexicon = Lexicon::new();
        // NOTE(review): the literals on the next two lines were mis-encoded
        // (empty char/string literals, which do not compile). Reconstructed
        // from the expected pinyin: '中' reads "zhōng zhòng", and '单' as a
        // surname reads "shàn" — confirm against the generated data.
        assert_eq!(lexicon.char_pinyin('中'), Some("zhōng zhòng"));
        assert_eq!(lexicon.surname_pinyin("单"), Some("shàn"));
        assert!(
            lexicon
                .default_entries()
                .iter()
                .any(|(word, _)| word == "你好")
        );
    }

    #[test]
    fn finds_longest_surname_prefix() {
        let lexicon = Lexicon::new();
        assert_eq!(lexicon.longest_surname_prefix("尉迟恭"), Some("尉迟"));
        assert_eq!(lexicon.longest_surname_prefix("张三"), None);
    }
}