Skip to main content

inputx_pinyin/
encode.rs

1//! Reverse lookup — `char → Vec<pinyin>`.
2//!
3//! Powered by traversing the embedded dict on-demand. v0.1 builds the
4//! reverse index lazily on first call (small bootstrap → cheap); v0.2 will
5//! materialize it at compile time once the dict grows.
6
7use std::collections::HashMap;
8use std::sync::OnceLock;
9
10use crate::dict::PinyinDict;
11
/// The pinyin readings recorded for one character, kept in first-seen
/// order from the dict traversal.
type CharReadings = Vec<String>;

/// Process-wide reverse index (`char` → readings), filled in lazily by the
/// first lookup and kept for the rest of the program. It is never rebuilt
/// because the embedded dict is immutable in v0.1.
static REVERSE: OnceLock<HashMap<char, CharReadings>> = OnceLock::new();
19
20fn reverse_index() -> &'static HashMap<char, CharReadings> {
21    REVERSE.get_or_init(|| {
22        let dict = PinyinDict::embedded();
23        // We can't access the FST through the public PinyinDict API after
24        // construction, so re-walk the same bytes. The embedded ctor is
25        // cheap; this only runs once.
26        let mut map: HashMap<char, CharReadings> = HashMap::new();
27        let raw = build_walk(&dict);
28        for (pinyin, word) in raw {
29            // Single-char entries contribute the most reliable readings.
30            // Multi-char entries' readings are only assigned via segmentation,
31            // which is out of scope for v0.1's reverse index.
32            let mut chars = word.chars();
33            let (Some(first), None) = (chars.next(), chars.next()) else {
34                continue;
35            };
36            let readings = map.entry(first).or_default();
37            if !readings.contains(&pinyin) {
38                readings.push(pinyin);
39            }
40        }
41        map
42    })
43}
44
45fn build_walk(dict: &PinyinDict) -> Vec<(String, String)> {
46    let mut out = Vec::new();
47    // Re-open the embedded bytes through the public prefix("") to get the full set.
48    // (PinyinDict doesn't expose `Map` directly to keep the surface minimal.)
49    out.extend(dict.prefix(""));
50    // Sanity: prefix("") returns everything sorted by (pinyin, word).
51    out
52}
53
54/// Pinyin readings for a single Han character. Returns an empty `Vec` if the
55/// character isn't in the bootstrap dict (most chars won't be in v0.1; v0.2
56/// expands coverage to ~67k via Unihan + corpus pipeline).
57pub fn char_to_pinyin(c: char) -> Vec<String> {
58    reverse_index().get(&c).cloned().unwrap_or_default()
59}
60
61/// Number of Han characters with at least one reading in the reverse index.
62/// Useful for sanity checks; not a meaningful coverage metric in v0.1.
63pub fn covered_char_count() -> usize {
64    reverse_index().len()
65}
66
#[cfg(test)]
mod tests {
    use super::*;

    /// Everyday characters must list their most common reading.
    #[test]
    fn common_chars_have_readings() {
        let cases = [('我', "wo"), ('你', "ni"), ('好', "hao"), ('中', "zhong")];
        for (c, want) in cases {
            let readings = char_to_pinyin(c);
            let found = readings.iter().any(|p| p.as_str() == want);
            assert!(
                found,
                "{c} should include reading {want:?}, got {readings:?}"
            );
        }
    }

    /// Characters absent from the dict produce an empty result.
    #[test]
    fn unknown_char_yields_empty() {
        // Private Use Area codepoints are reserved for application-specific
        // assignments, so they are guaranteed never to appear in Unihan.
        // Archaic CJK codepoints are not a safe choice here: the full v0.2
        // dict covers Ext B-G, so e.g. 𤴓 (U+24D13) now has readings.
        assert!(char_to_pinyin('\u{E000}').is_empty());
        assert!(char_to_pinyin('\u{F8FF}').is_empty());
    }

    /// The index should hold a plausible number of single-char entries.
    #[test]
    fn covered_count_reasonable() {
        let n = covered_char_count();
        assert!(n >= 50, "expected ≥50 single-char entries, got {n}");
    }
}