inputx_wubi_data/table.rs
1//! Thin adapter over [`inputx_wubi::WubiDict`] (the embedded FST in the
2//! sibling [`inputx-wubi`](https://crates.io/crates/inputx-wubi) crate). The dict
3//! instance is process-global via `OnceLock`; L0 (per-user learning) state
4//! therefore persists across `Session` instances within one process.
5
6use std::sync::OnceLock;
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use inputx_wubi::{L0Snapshot, WubiDict};
10
11static DICT: OnceLock<WubiDict> = OnceLock::new();
12
13fn dict() -> &'static WubiDict {
14 DICT.get_or_init(WubiDict::embedded)
15}
16
/// First codepoint treated as "rare CJK": `U+20000`, the start of CJK
/// Extension B. Characters at or above this value sit in blocks for which
/// most consumer fonts on iOS / Android ship no glyphs, so committing them
/// into a typical app renders as `?`. [`lookup`] filters them out by
/// default; power users can opt back in through [`set_show_rare`].
const RARE_CODEPOINT_THRESHOLD: u32 = 0x2_0000;
23
/// Process-wide switch for rare-CJK candidates. While it is `false` (the
/// initial value) the lookup functions in this module drop every candidate
/// that contains a rare CJK character. Industry-standard CJK IMEs (Apple,
/// Sogou, Baidu) silently hide these from candidate lists for the same
/// reason — their dictionaries have them but the host UI can't render
/// them, so showing them is worse than not.
static SHOW_RARE: AtomicBool = AtomicBool::new(false);

/// Current value of the rare-CJK switch: `true` means rare candidates are
/// left in lookup results, `false` means they are filtered out.
pub fn show_rare() -> bool {
    SHOW_RARE.load(Ordering::Relaxed)
}

/// Sets the rare-CJK switch. Pass `true` to keep rare candidates in lookup
/// results, `false` to hide them again. The value is stored in a static,
/// so it applies to every later lookup in this process.
pub fn set_show_rare(show: bool) {
    SHOW_RARE.store(show, Ordering::Relaxed);
}
38
39/// Force-init the embedded `WubiDict` and exercise common lookup paths so
40/// the OS faults the FST's `.rodata` pages into RAM and any internal `fst::Map`
41/// streamer state is primed. Idempotent — relies on `OnceLock::get_or_init`
42/// for the dict, and `WubiDict::lookup` for the page-touch effect. Called
43/// from `Session::warmup` so a host can off-load the cold-path cost to a
44/// background thread at startup instead of paying it on the user's first
45/// keystroke. ~100-300ms on iPhone cold; <1ms idempotent.
46pub fn warmup() {
47 let d = dict();
48 // 13 wubi codes spanning all 5 key zones (横/竖/撇/捺/折) — the FST
49 // is laid out alphabetically, so this touches pages across the whole
50 // .rodata range, not just one bucket.
51 for code in &[
52 "g", "h", "j", "k", "l", "m", "a", "s", "d", "f", "p", "q", "wq",
53 ] {
54 let _ = d.lookup(code);
55 }
56}
57
58/// `true` iff every character in `word` is below the rare-CJK threshold
59/// (`U+20000` — start of CJK Extension B). Pinyin-side composer also calls
60/// this so the user-facing rare-char toggle applies uniformly to both
61/// engines (item 54).
62pub fn is_displayable(word: &str) -> bool {
63 word.chars().all(|c| (c as u32) < RARE_CODEPOINT_THRESHOLD)
64}
65
66/// Exact lookup for `code`. Returns the candidates ranked by L0/L1, with
67/// rare CJK candidates filtered unless `show_rare()` is `true`.
68pub fn lookup(code: &str) -> Vec<String> {
69 let mut all = dict().lookup(code);
70 if !SHOW_RARE.load(Ordering::Relaxed) {
71 all.retain(|w| is_displayable(w));
72 }
73 all
74}
75
76/// Scored variant of [`lookup`]. Returns `(word, score)` tuples for
77/// the composite cross-engine merge. Rare-CJK filter applied here too.
78pub fn lookup_with_scores(code: &str) -> Vec<(String, f64)> {
79 let mut all: Vec<(String, f64)> = Vec::new();
80 dict().lookup_with_scores_into(code, &mut all);
81 if !SHOW_RARE.load(Ordering::Relaxed) {
82 all.retain(|(w, _)| is_displayable(w));
83 }
84 all
85}
86
87/// Layer-aware variant: each candidate also carries its origin Layer
88/// (Jianma1/2/3, Zigen, Phrase, Auto). Composite dispatch uses the
89/// layer tag to make context-aware ranking decisions — e.g. demoting
90/// low-confidence Auto / Phrase wubi candidates when the buffer shape
91/// suggests pinyin intent, while keeping high-confidence Jianma simcodes
92/// untouched (the 伙-rule: wubi simcodes always lead at their code).
93pub fn lookup_with_layer(code: &str) -> Vec<(String, f64, inputx_wubi::Layer)> {
94 let mut all: Vec<(String, f64, inputx_wubi::Layer)> = Vec::new();
95 dict().lookup_with_layer_into(code, &mut all);
96 if !SHOW_RARE.load(Ordering::Relaxed) {
97 all.retain(|(w, _, _)| is_displayable(w));
98 }
99 all
100}
101
102/// Per-code lookup exposing raw frequency (separate from layer.base ·
103/// pref) — used by the v1.4.7 composite hot path for orthodox
104/// score decomposition into (log_prior_q4 = Q4·ln(1+freq),
105/// log_likelihood_q4 = Q4·ln(layer.base() · pref · demotes)). Rare-CJK
106/// filter applied uniformly with `lookup_with_layer`.
107///
108/// v1.4.7 sub-phase A4 step 2: data source is `wubi_idf_reader()`
109/// (cement-owned `IdfReader` over `EMBEDDED_WUBI_IDF`) instead of the
110/// facade `WubiDict::lookup_with_freq_layer_into`. The IDF entry
111/// carries `raw_freq` losslessly and `Layer` via the engine_tag bits
112/// of `EntryFlags`. Output is byte-equivalent to the previous facade
113/// fill modulo the rare-CJK retain pass.
114pub fn lookup_with_freq_layer(
115 code: &str,
116) -> Vec<(String, inputx_wubi::Layer, u64)> {
117 let reader = crate::wubi_idf_reader();
118 let entries = reader.lookup(code.as_bytes());
119 let mut all: Vec<(String, inputx_wubi::Layer, u64)> = entries
120 .into_iter()
121 .map(|e| {
122 (
123 e.word.to_string(),
124 crate::layer_from_idf_tag(e.flags.engine_tag()),
125 e.raw_freq as u64,
126 )
127 })
128 .collect();
129 if !SHOW_RARE.load(Ordering::Relaxed) {
130 all.retain(|(w, _, _)| is_displayable(w));
131 }
132 all
133}
134
135/// Prefix-prediction lookup: `(word, freq, code_len)` for every dict
136/// entry whose code strictly extends `prefix` (no exact-code matches).
137/// Rare-CJK filter applied uniformly with [`lookup`]. Wired into the
138/// composite dispatch so Wubi gets the same prefix-prediction shape as
139/// pinyin / JP (e.g. `jj` exact 是 stays at #0, predictions 日/时 follow).
140///
141/// v1.4.7 sub-phase A4 step 2: streamed through `wubi_idf_reader()
142/// .prefix_for_each_entry` instead of `WubiDict::prefix_predictions`.
143/// Strictly-extending filter (`code.len() > prefix.len()`) and the
144/// `freq_desc → word_asc` sort match the facade output byte-for-byte
145/// modulo the rare-CJK retain pass.
146pub fn prefix_predictions(prefix: &str) -> Vec<(String, u64, usize)> {
147 let reader = crate::wubi_idf_reader();
148 let prefix_lower = prefix.to_ascii_lowercase();
149 let prefix_len = prefix_lower.len();
150 let mut all: Vec<(String, u64, usize)> = Vec::new();
151 reader.prefix_for_each_entry(prefix_lower.as_bytes(), |e| {
152 if e.code.len() <= prefix_len {
153 return;
154 }
155 all.push((e.word.to_string(), e.raw_freq as u64, e.code.len()));
156 });
157 // freq desc, word asc — matches the facade ordering.
158 all.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
159 if !SHOW_RARE.load(Ordering::Relaxed) {
160 all.retain(|(w, _, _)| is_displayable(w));
161 }
162 all
163}
164
165/// Notify the dictionary that the user committed `word` for `code`. The
166/// internal pick counter advances; on threshold the word auto-pins. All
167/// learning logic lives in `wubi` — this is just a passthrough so the IME
168/// layer doesn't need to know about counters.
169pub fn record_pick(code: &str, word: &str) {
170 dict().record_pick(code, word);
171}
172
173/// Snapshot the current L0 state (pins + pending pick counts + layer
174/// prefs) for host-side persistence. Host stores it however it wants
175/// (UserDefaults on Apple platforms, IndexedDB in web, etc.) and feeds
176/// it back via [`import_l0`] on next launch.
177pub fn export_l0() -> L0Snapshot {
178 dict().export_l0()
179}
180
181/// Restore a previously-exported L0 snapshot. Entries whose `(code, word)`
182/// no longer exist in the lexicon (e.g., after a wubi data version bump
183/// removed an extension char) are silently dropped. Returns the count of
184/// accepted pins.
185pub fn import_l0(snap: L0Snapshot) -> usize {
186 dict().import_l0(snap)
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// `true` when `word` is among the candidates returned for `code`.
    fn has_candidate(code: &str, word: &str) -> bool {
        lookup(code).iter().any(|w| w == word)
    }

    #[test]
    fn one_letter_jianma1_resolves() {
        // Level-1 simple code (一级简码): 'g' → 一 (canonical 86 standard).
        assert!(has_candidate("g", "一"));
    }

    #[test]
    fn keyname_zigen_full_code() {
        // Key-name radical (键名字根): 王 = gggg.
        assert!(has_candidate("gggg", "王"));
    }

    #[test]
    fn unknown_returns_empty() {
        // Neither a code that cannot exist nor the empty code has candidates.
        for code in ["xyzz123", ""] {
            assert!(lookup(code).is_empty(), "code {code:?}");
        }
    }
}
210}