Skip to main content

inputx_wubi_data/
table.rs

1//! Thin adapter over [`inputx_wubi::WubiDict`] (the embedded FST in the
2//! sibling [`inputx-wubi`](https://crates.io/crates/inputx-wubi) crate). The dict
3//! instance is process-global via `OnceLock`; L0 (per-user learning) state
4//! therefore persists across `Session` instances within one process.
5
6use std::sync::OnceLock;
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use inputx_wubi::{L0Snapshot, WubiDict};
10
11static DICT: OnceLock<WubiDict> = OnceLock::new();
12
13fn dict() -> &'static WubiDict {
14    DICT.get_or_init(WubiDict::embedded)
15}
16
/// First code point treated as "rare CJK": `U+20000`, where CJK Extension B
/// begins. Characters at or above this value sit in blocks for which most
/// consumer fonts on iOS / Android ship no glyphs, so committing them into
/// typical apps renders as `?`. [`lookup`] drops such candidates by default;
/// power users can bring them back with [`set_show_rare`].
const RARE_CODEPOINT_THRESHOLD: u32 = 0x2_0000;
23
/// Process-wide "show rare characters" switch; starts out `false`.
///
/// While it is `false`, `lookup` discards every candidate that contains a
/// rare CJK character. Industry-standard CJK IMEs (Apple, Sogou, Baidu) hide
/// such candidates silently for the same reason: the dictionary knows them,
/// but the host UI cannot render them, so offering them is worse than
/// leaving them out.
static SHOW_RARE: AtomicBool = AtomicBool::new(false);
30
31pub fn set_show_rare(show: bool) {
32    SHOW_RARE.store(show, Ordering::Relaxed);
33}
34
35pub fn show_rare() -> bool {
36    SHOW_RARE.load(Ordering::Relaxed)
37}
38
39/// Force-init the embedded `WubiDict` and exercise common lookup paths so
40/// the OS faults the FST's `.rodata` pages into RAM and any internal `fst::Map`
41/// streamer state is primed. Idempotent — relies on `OnceLock::get_or_init`
42/// for the dict, and `WubiDict::lookup` for the page-touch effect. Called
43/// from `Session::warmup` so a host can off-load the cold-path cost to a
44/// background thread at startup instead of paying it on the user's first
45/// keystroke. ~100-300ms on iPhone cold; <1ms idempotent.
46pub fn warmup() {
47    let d = dict();
48    // 13 wubi codes spanning all 5 key zones (横/竖/撇/捺/折) — the FST
49    // is laid out alphabetically, so this touches pages across the whole
50    // .rodata range, not just one bucket.
51    for code in &[
52        "g", "h", "j", "k", "l", "m", "a", "s", "d", "f", "p", "q", "wq",
53    ] {
54        let _ = d.lookup(code);
55    }
56}
57
58/// `true` iff every character in `word` is below the rare-CJK threshold
59/// (`U+20000` — start of CJK Extension B). Pinyin-side composer also calls
60/// this so the user-facing rare-char toggle applies uniformly to both
61/// engines (item 54).
62pub fn is_displayable(word: &str) -> bool {
63    word.chars().all(|c| (c as u32) < RARE_CODEPOINT_THRESHOLD)
64}
65
66/// Exact lookup for `code`. Returns the candidates ranked by L0/L1, with
67/// rare CJK candidates filtered unless `show_rare()` is `true`.
68pub fn lookup(code: &str) -> Vec<String> {
69    let mut all = dict().lookup(code);
70    if !SHOW_RARE.load(Ordering::Relaxed) {
71        all.retain(|w| is_displayable(w));
72    }
73    all
74}
75
76/// Scored variant of [`lookup`]. Returns `(word, score)` tuples for
77/// the composite cross-engine merge. Rare-CJK filter applied here too.
78pub fn lookup_with_scores(code: &str) -> Vec<(String, f64)> {
79    let mut all: Vec<(String, f64)> = Vec::new();
80    dict().lookup_with_scores_into(code, &mut all);
81    if !SHOW_RARE.load(Ordering::Relaxed) {
82        all.retain(|(w, _)| is_displayable(w));
83    }
84    all
85}
86
87/// Layer-aware variant: each candidate also carries its origin Layer
88/// (Jianma1/2/3, Zigen, Phrase, Auto). Composite dispatch uses the
89/// layer tag to make context-aware ranking decisions — e.g. demoting
90/// low-confidence Auto / Phrase wubi candidates when the buffer shape
91/// suggests pinyin intent, while keeping high-confidence Jianma simcodes
92/// untouched (the 伙-rule: wubi simcodes always lead at their code).
93pub fn lookup_with_layer(code: &str) -> Vec<(String, f64, inputx_wubi::Layer)> {
94    let mut all: Vec<(String, f64, inputx_wubi::Layer)> = Vec::new();
95    dict().lookup_with_layer_into(code, &mut all);
96    if !SHOW_RARE.load(Ordering::Relaxed) {
97        all.retain(|(w, _, _)| is_displayable(w));
98    }
99    all
100}
101
102/// Per-code lookup exposing raw frequency (separate from layer.base ·
103/// pref) — used by the v1.4.7 composite hot path for orthodox
104/// score decomposition into (log_prior_q4 = Q4·ln(1+freq),
105/// log_likelihood_q4 = Q4·ln(layer.base() · pref · demotes)). Rare-CJK
106/// filter applied uniformly with `lookup_with_layer`.
107///
108/// v1.4.7 sub-phase A4 step 2: data source is `wubi_idf_reader()`
109/// (cement-owned `IdfReader` over `EMBEDDED_WUBI_IDF`) instead of the
110/// facade `WubiDict::lookup_with_freq_layer_into`. The IDF entry
111/// carries `raw_freq` losslessly and `Layer` via the engine_tag bits
112/// of `EntryFlags`. Output is byte-equivalent to the previous facade
113/// fill modulo the rare-CJK retain pass.
114pub fn lookup_with_freq_layer(
115    code: &str,
116) -> Vec<(String, inputx_wubi::Layer, u64)> {
117    let reader = crate::wubi_idf_reader();
118    let entries = reader.lookup(code.as_bytes());
119    let mut all: Vec<(String, inputx_wubi::Layer, u64)> = entries
120        .into_iter()
121        .map(|e| {
122            (
123                e.word.to_string(),
124                crate::layer_from_idf_tag(e.flags.engine_tag()),
125                e.raw_freq as u64,
126            )
127        })
128        .collect();
129    if !SHOW_RARE.load(Ordering::Relaxed) {
130        all.retain(|(w, _, _)| is_displayable(w));
131    }
132    all
133}
134
135/// Prefix-prediction lookup: `(word, freq, code_len)` for every dict
136/// entry whose code strictly extends `prefix` (no exact-code matches).
137/// Rare-CJK filter applied uniformly with [`lookup`]. Wired into the
138/// composite dispatch so Wubi gets the same prefix-prediction shape as
139/// pinyin / JP (e.g. `jj` exact 是 stays at #0, predictions 日/时 follow).
140///
141/// v1.4.7 sub-phase A4 step 2: streamed through `wubi_idf_reader()
142/// .prefix_for_each_entry` instead of `WubiDict::prefix_predictions`.
143/// Strictly-extending filter (`code.len() > prefix.len()`) and the
144/// `freq_desc → word_asc` sort match the facade output byte-for-byte
145/// modulo the rare-CJK retain pass.
146pub fn prefix_predictions(prefix: &str) -> Vec<(String, u64, usize)> {
147    let reader = crate::wubi_idf_reader();
148    let prefix_lower = prefix.to_ascii_lowercase();
149    let prefix_len = prefix_lower.len();
150    let mut all: Vec<(String, u64, usize)> = Vec::new();
151    reader.prefix_for_each_entry(prefix_lower.as_bytes(), |e| {
152        if e.code.len() <= prefix_len {
153            return;
154        }
155        all.push((e.word.to_string(), e.raw_freq as u64, e.code.len()));
156    });
157    // freq desc, word asc — matches the facade ordering.
158    all.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
159    if !SHOW_RARE.load(Ordering::Relaxed) {
160        all.retain(|(w, _, _)| is_displayable(w));
161    }
162    all
163}
164
165/// Notify the dictionary that the user committed `word` for `code`. The
166/// internal pick counter advances; on threshold the word auto-pins. All
167/// learning logic lives in `wubi` — this is just a passthrough so the IME
168/// layer doesn't need to know about counters.
169pub fn record_pick(code: &str, word: &str) {
170    dict().record_pick(code, word);
171}
172
173/// Snapshot the current L0 state (pins + pending pick counts + layer
174/// prefs) for host-side persistence. Host stores it however it wants
175/// (UserDefaults on Apple platforms, IndexedDB in web, etc.) and feeds
176/// it back via [`import_l0`] on next launch.
177pub fn export_l0() -> L0Snapshot {
178    dict().export_l0()
179}
180
181/// Restore a previously-exported L0 snapshot. Entries whose `(code, word)`
182/// no longer exist in the lexicon (e.g., after a wubi data version bump
183/// removed an extension char) are silently dropped. Returns the count of
184/// accepted pins.
185pub fn import_l0(snap: L0Snapshot) -> usize {
186    dict().import_l0(snap)
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// `true` when `lookup(code)` lists `word` among its candidates.
    fn offers(code: &str, word: &str) -> bool {
        lookup(code).iter().any(|candidate| candidate == word)
    }

    #[test]
    fn one_letter_jianma1_resolves() {
        // Level-1 simple code (jianma): 'g' → 一 (canonical 86 standard)
        assert!(offers("g", "一"));
    }

    #[test]
    fn keyname_zigen_full_code() {
        // Key-name radical (zigen): 王 = gggg
        assert!(offers("gggg", "王"));
    }

    #[test]
    fn unknown_returns_empty() {
        // Neither an unknown code nor the empty code yields candidates.
        for code in ["xyzz123", ""] {
            assert!(lookup(code).is_empty());
        }
    }
}