inputx_pinyin/encode.rs
1//! Reverse lookup — `char → Vec<pinyin>`.
2//!
3//! Powered by traversing the embedded dict on-demand. v0.1 builds the
4//! reverse index lazily on first call (small bootstrap → cheap); v0.2 will
5//! materialize it at compile time once the dict grows.
6
7use std::collections::HashMap;
8use std::sync::OnceLock;
9
10use crate::dict::PinyinDict;
11
12/// A single character's pinyin readings, in the order they were first
13/// encountered while traversing the dict.
14type CharReadings = Vec<String>;
15
16/// Lazy global reverse index. Populated on first call; never invalidated
17/// (the embedded dict is immutable in v0.1).
18static REVERSE: OnceLock<HashMap<char, CharReadings>> = OnceLock::new();
19
20fn reverse_index() -> &'static HashMap<char, CharReadings> {
21 REVERSE.get_or_init(|| {
22 let dict = PinyinDict::embedded();
23 // We can't access the FST through the public PinyinDict API after
24 // construction, so re-walk the same bytes. The embedded ctor is
25 // cheap; this only runs once.
26 let mut map: HashMap<char, CharReadings> = HashMap::new();
27 let raw = build_walk(&dict);
28 for (pinyin, word) in raw {
29 // Single-char entries contribute the most reliable readings.
30 // Multi-char entries' readings are only assigned via segmentation,
31 // which is out of scope for v0.1's reverse index.
32 let mut chars = word.chars();
33 let (Some(first), None) = (chars.next(), chars.next()) else {
34 continue;
35 };
36 let readings = map.entry(first).or_default();
37 if !readings.contains(&pinyin) {
38 readings.push(pinyin);
39 }
40 }
41 map
42 })
43}
44
45fn build_walk(dict: &PinyinDict) -> Vec<(String, String)> {
46 let mut out = Vec::new();
47 // Re-open the embedded bytes through the public prefix("") to get the full set.
48 // (PinyinDict doesn't expose `Map` directly to keep the surface minimal.)
49 out.extend(dict.prefix(""));
50 // Sanity: prefix("") returns everything sorted by (pinyin, word).
51 out
52}
53
54/// Pinyin readings for a single Han character. Returns an empty `Vec` if the
55/// character isn't in the bootstrap dict (most chars won't be in v0.1; v0.2
56/// expands coverage to ~67k via Unihan + corpus pipeline).
57pub fn char_to_pinyin(c: char) -> Vec<String> {
58 reverse_index().get(&c).cloned().unwrap_or_default()
59}
60
61/// Number of Han characters with at least one reading in the reverse index.
62/// Useful for sanity checks; not a meaningful coverage metric in v0.1.
63pub fn covered_char_count() -> usize {
64 reverse_index().len()
65}
66
#[cfg(test)]
mod tests {
    use super::*;

    /// Everyday characters must list their expected reading.
    #[test]
    fn common_chars_have_readings() {
        let expectations = [('我', "wo"), ('你', "ni"), ('好', "hao"), ('中', "zhong")];
        for (c, want) in expectations {
            let readings = char_to_pinyin(c);
            let found = readings.iter().any(|p| p.as_str() == want);
            assert!(
                found,
                "{c} should include reading {want:?}, got {readings:?}"
            );
        }
    }

    /// Characters absent from the dict produce an empty reading list.
    #[test]
    fn unknown_char_yields_empty() {
        // Both probes are Private Use Area codepoints, which are reserved for
        // application-specific assignments and so are not expected to appear
        // in the dict. Archaic CJK codepoints (e.g. U+24D13) are avoided
        // because, per the original note, the full v0.2 dict gives them
        // readings.
        assert!(char_to_pinyin('\u{E000}').is_empty());
        assert!(char_to_pinyin('\u{F8FF}').is_empty());
    }

    /// The reverse index should hold at least 50 single-character entries.
    #[test]
    fn covered_count_reasonable() {
        let n = covered_char_count();
        let enough = n >= 50;
        assert!(enough, "expected ≥50 single-char entries, got {n}");
    }
}