Skip to main content

inputx_wubi/
dict.rs

1//! FST-backed Wubi dictionary with a two-tier ranking model.
2//!
3//! # L0 / L1+
4//!
5//! - **L1+** is the immutable lexicon: the embedded FST built at compile
6//!   time, plus a per-entry [`Layer`] tag and a per-entry frequency score.
7//!   Every entry's nominal weight is `LAYER_BASE[layer] + freq_score`. Future
8//!   immutable layers (e.g., a per-app dictionary shipped by the host) can
9//!   stack on top with the same shape.
10//! - **L0** is a thin, per-user override layer:
11//!   - **Pinned candidates** — `code → preferred_word`. A pin moves that word
12//!     to position 0 in `lookup`'s output, regardless of L1+ weight.
//!   - **Pick counters** — `(code, word) → u32`. [`WubiDict::record_pick`] increments
//!     the counter; once it hits [`PROMOTE_THRESHOLD`], the word is auto-
//!     pinned and all counters for that code are reset (so a later, different
//!     pick has to earn its full [`PROMOTE_THRESHOLD`] picks from scratch —
//!     prevents thrashing).
17//!   - **Layer preferences** — `Layer → f64` multiplier (default 1.0, with
18//!     `Auto = 0.7` so extension characters don't dominate). Applied to the
19//!     L1 nominal weight at sort time. Settable via API; **not** auto-tuned.
20//!
21//! Layer prefs reorder *within* L1; pins override the resulting ordering at
22//! position 0. So in steady state most codes have empty L0 and the layer
23//! base ordering wins (hence "L0 default ≈ L1 default").
24
25use std::collections::HashMap;
26use std::sync::RwLock;
27
28use inputx_fsa::Dict;
29
30use crate::layer::{DEFAULT_LAYER_PREFS, LAYER_COUNT, Layer, unpack};
31
/// Raw bytes of the `wubi86.dict` file found in `OUT_DIR`, embedded into the
/// binary at compile time. Handed to `Dict::new` by [`WubiDict::embedded`].
const DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/wubi86.dict"));
33
/// Number of picks of the same `(code, word)` required before L0 auto-pins
/// it. Defaults to 3; can be overridden at build time via the
/// `WUBI_PROMOTE_THRESHOLD` env var (developer escape hatch — not exposed
/// to end users). The override must be a non-empty run of ASCII digits with
/// a value of at least 1; anything else panics during constant evaluation.
///
/// NOTE(review): this used to say "consecutive picks", but
/// [`WubiDict::record_pick`] keeps one counter per `(code, word)` and does
/// not reset it when a different word is picked for the same code in
/// between — confirm which behavior is intended.
pub const PROMOTE_THRESHOLD: u32 = parse_threshold_const();
39
40const fn parse_threshold_const() -> u32 {
41    match option_env!("WUBI_PROMOTE_THRESHOLD") {
42        Some(s) => parse_u32_const(s),
43        None => 3,
44    }
45}
46
/// Parses `s` as a strictly positive decimal `u32`, usable in `const`
/// context (which is why it is a hand-rolled loop rather than `str::parse`).
///
/// # Panics
///
/// Panics (a compile error when evaluated in a `const`) if `s` is empty,
/// contains anything other than ASCII digits, does not fit in a `u32`, or
/// evaluates to 0.
const fn parse_u32_const(s: &str) -> u32 {
    let digits = s.as_bytes();
    if digits.is_empty() {
        panic!("WUBI_PROMOTE_THRESHOLD must not be empty");
    }
    let mut value: u32 = 0;
    let mut idx = 0;
    while idx < digits.len() {
        let b = digits[idx];
        if !b.is_ascii_digit() {
            panic!("WUBI_PROMOTE_THRESHOLD must be ASCII digits");
        }
        let digit = (b - b'0') as u32;
        // Checked arithmetic: report an over-long value with a clear message
        // instead of an anonymous overflow panic (or a silent wrap when
        // overflow checks are disabled).
        value = match value.checked_mul(10) {
            Some(scaled) => match scaled.checked_add(digit) {
                Some(total) => total,
                None => panic!("WUBI_PROMOTE_THRESHOLD must fit in u32"),
            },
            None => panic!("WUBI_PROMOTE_THRESHOLD must fit in u32"),
        };
        idx += 1;
    }
    if value == 0 {
        panic!("WUBI_PROMOTE_THRESHOLD must be >= 1");
    }
    value
}
67
/// Persistent state of the L0 layer. Caller serializes / deserializes this
/// however it likes (TOML, MessagePack, sqlite, …) — the crate intentionally
/// has no `serde` dependency.
///
/// Produced by [`WubiDict::export_l0`] and consumed by
/// [`WubiDict::import_l0`].
#[derive(Debug, Clone)]
pub struct L0Snapshot {
    /// `(code, word)` pairs the user has pinned (manually or via `record_pick`
    /// reaching threshold).
    pub pins: Vec<(String, String)>,
    /// `(code, word, count)` — pending pick counts that haven't yet reached
    /// `PROMOTE_THRESHOLD`. Snapshot semantics are best-effort; a count of
    /// `threshold - 1` restored after restart needs only one more pick to
    /// promote.
    pub pick_counts: Vec<(String, String, u32)>,
    /// Layer multipliers, indexed by `Layer as usize`.
    pub layer_prefs: [f64; LAYER_COUNT],
}
84
85#[derive(Default)]
86struct L0Inner {
87    pins: HashMap<String, String>,
88    pick_counts: HashMap<(String, String), u32>,
89    layer_prefs: [f64; LAYER_COUNT],
90}
91
92impl L0Inner {
93    fn new() -> Self {
94        Self {
95            pins: HashMap::new(),
96            pick_counts: HashMap::new(),
97            layer_prefs: DEFAULT_LAYER_PREFS,
98        }
99    }
100}
101
/// The Wubi 86 dictionary: an embedded FST plus a mutable L0 layer for
/// per-user preference learning.
///
/// All read methods take `&self`. L0 mutations (`record_pick`, `pin`,
/// `forget`, `set_layer_pref`, `import_l0`) also take `&self` — interior
/// mutability via `RwLock` lets a single shared instance feed every
/// concurrent IME / WASM session without exposing the lock to the caller.
pub struct WubiDict {
    // Immutable lexicon (L1+) backed by the embedded `DICT_BYTES`.
    map: Dict<&'static [u8]>,
    // Per-user override layer (pins, pick counters, layer multipliers).
    l0: RwLock<L0Inner>,
}
113
114impl WubiDict {
115    /// Construct the dictionary from the embedded FST. Cheap (validates the
116    /// FST header and initializes an empty L0); callers should still cache
117    /// the instance and reuse it for the program lifetime.
118    pub fn embedded() -> Self {
119        Self {
120            map: Dict::new(DICT_BYTES).expect("invalid embedded wubi dict"),
121            l0: RwLock::new(L0Inner::new()),
122        }
123    }
124
125    /// Number of distinct codes in the dictionary. (The two-level `Dict`
126    /// counts codes, not total (code, word) pairs.)
127    pub fn len(&self) -> usize {
128        self.map.len() as usize
129    }
130
    /// `true` iff the dictionary has zero codes; delegates to the
    /// underlying `Dict::is_empty`.
    pub fn is_empty(&self) -> bool {
        self.map.is_empty()
    }
135
136    /// Number of L0 pinned codes.
137    pub fn l0_pin_count(&self) -> usize {
138        self.l0.read().map(|g| g.pins.len()).unwrap_or(0)
139    }
140
141    /// Number of distinct `(code, word)` pairs with pending pick counters.
142    pub fn l0_pending_count(&self) -> usize {
143        self.l0.read().map(|g| g.pick_counts.len()).unwrap_or(0)
144    }
145
146    // -------------------------------------------------------------------
147    // Lookups
148    // -------------------------------------------------------------------
149
150    /// Words for the exact code, ordered by:
151    ///   1. L0 pin (if any) at index 0,
152    ///   2. then `LAYER_BASE[layer] * layer_prefs[layer] + freq_score` desc,
153    ///   3. then FST byte order (stable tiebreaker).
154    ///
155    /// Allocates a fresh `Vec`. Hot-loop callers (the IME's per-keystroke
156    /// candidate refresh) should use [`Self::lookup_into`] to reuse a
157    /// caller-owned buffer.
158    pub fn lookup(&self, code: &str) -> Vec<String> {
159        let mut out = Vec::new();
160        self.lookup_into(code, &mut out);
161        out
162    }
163
164    /// Scored lookup: same ordering as `lookup_into` (layer / freq / promote
165    /// rules + L0 pin), but emits `(word, score)` tuples so the cross-engine
166    /// merge layer can do a single unified sort instead of hard-coding which
167    /// engine wins. Score reflects:
168    ///   * layer.base() × layer_prefs   (jianma1 = 1e6, …)
169    ///   * + freq                       (corpus weight)
170    ///   * × 100.0                      if full-code single-char promotion fires
171    ///                                  (see lookup_into doc for the rule)
172    ///   * × 1000.0                     if the candidate is L0-pinned
173    ///                                  (must dominate any natural score)
174    ///
175    /// The post-multipliers keep wubi simcodes and L0 pins on top across
176    /// the cross-engine merge.
177    pub fn lookup_with_scores_into(&self, code: &str, out: &mut Vec<(String, f64)>) {
178        let mut layered = Vec::with_capacity(out.capacity());
179        self.lookup_with_layer_into(code, &mut layered);
180        out.clear();
181        out.reserve(layered.len());
182        for (w, score, _layer) in layered.drain(..) {
183            out.push((w, score));
184        }
185    }
186
187    /// Layer-aware scored lookup: identical to `lookup_with_scores_into`
188    /// but each candidate also carries its origin `Layer`. The composite
189    /// engine uses the layer tag to make context-aware ranking decisions
190    /// — e.g. demoting low-confidence Auto / Phrase entries at short
191    /// pinyin-shaped input while keeping high-confidence Jianma1/2/3 +
192    /// Zigen simcodes at full strength (the 伙 vs 嶙 distinction —
193    /// 伙 is Jianma2 wubi-simcode and must lead at #0 for its code,
194    /// 嶙 is typically Auto-layer and should not displace pinyin top).
195    pub fn lookup_with_layer_into(
196        &self,
197        code: &str,
198        out: &mut Vec<(String, f64, Layer)>,
199    ) {
200        out.clear();
201        let lower = code.to_ascii_lowercase();
202
203        let prefs = self
204            .l0
205            .read()
206            .map(|g| g.layer_prefs)
207            .unwrap_or(DEFAULT_LAYER_PREFS);
208
209        let full_code = lower.len() == 4;
210        // Tuple: (word, score, is_single, freq, layer).
211        let mut scratch: Vec<(String, f64, bool, u64, Layer)> = Vec::with_capacity(8);
212        let mut max_phrase_freq: u64 = 0;
213        self.map.get_for_each(lower.as_bytes(), |word, value| {
214            if let Ok(s) = core::str::from_utf8(word) {
215                let (layer, freq) = unpack(value);
216                let base = layer.base() as f64;
217                let pref = prefs[layer.as_index()];
218                let is_single = s.chars().count() == 1;
219                if !is_single && freq > max_phrase_freq {
220                    max_phrase_freq = freq;
221                }
222                scratch.push((s.to_string(), base * pref + freq as f64, is_single, freq, layer));
223            }
224        });
225
226        // Apply full-code single-char promote (lifts qualifying single
227        // chars above the same-code phrases) and L0 pin (lifts the pinned
228        // word above natural sort).
229        let pinned: Option<String> = self.l0.read().ok().and_then(|g| g.pins.get(&lower).cloned());
230        for e in scratch.iter_mut() {
231            let promote = full_code && e.2 && e.3 > max_phrase_freq;
232            if promote {
233                e.1 *= 100.0;
234            }
235            if let Some(p) = &pinned
236                && &e.0 == p
237            {
238                e.1 *= 1000.0;
239            }
240        }
241        scratch.sort_by(|a, b| {
242            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
243        });
244
245        out.reserve(scratch.len());
246        for (w, score, _, _, layer) in scratch.drain(..) {
247            out.push((w, score, layer));
248        }
249    }
250
251    /// Same as [`Self::lookup`] but writes into a caller-owned buffer.
252    /// `out` is cleared (capacity preserved) on entry.
253    ///
254    /// Reuses the result buffer's allocation across calls — a measurable win
255    /// for the IME's per-keystroke candidate refresh, where the candidate
256    /// list is rebuilt thousands of times per typing session. Per-candidate
257    /// `String`s are still freshly allocated (the FST stream yields owned
258    /// bytes; the crate doesn't expose `&'static str` because the stream's
259    /// borrow doesn't outlive the call).
260    pub fn lookup_into(&self, code: &str, out: &mut Vec<String>) {
261        out.clear();
262
263        let lower = code.to_ascii_lowercase();
264        let prefix_len = lower.len();
265
266        let prefs = self
267            .l0
268            .read()
269            .map(|g| g.layer_prefs)
270            .unwrap_or(DEFAULT_LAYER_PREFS);
271
272        // Score during the FST scan; reuse a small scratch Vec.
273        // Tuple: (word, score, is_single_char, freq, is_phrase).
274        //
275        // The wubi-86 "full-code single-char wins" rule applied here:
276        // at a fully-typed 4-letter code, a single-char entry whose
277        // corpus frequency *exceeds the highest phrase frequency at
278        // the same code* gets promoted above all phrases. Otherwise
279        // the standard score ordering (layer_base × pref + freq) wins.
280        //
281        // Why this shape (relative freq comparison, not absolute):
282        //
283        //   - gmww 两 (Auto, freq 37372) vs 两败俱伤 (Phrase, freq 15272):
284        //     37372 > 15272 → 两 promoted. ✓
285        //
286        //   - wcng 鹟 (Auto, freq 5961) vs 公司 (Phrase, freq 42817):
287        //     5961 < 42817 → 鹟 stays at its natural Auto score (low),
288        //     公司 wins on layer_base alone. ✓
289        //
290        //   - khlg 䟧 (Auto, freq 0) vs 中国 (Phrase, freq 44985):
291        //     0 < 44985 → 䟧 stays low, 中国 wins. ✓
292        //
293        // The earlier absolute-`freq > 0` gate worked for gmww but
294        // wrongly promoted any uncommon-but-corpus-present single char
295        // over a popular phrase (the wcng case the user just flagged).
296        let full_code = prefix_len == 4;
297        let mut scratch: Vec<(String, f64, bool, u64)> = Vec::with_capacity(8);
298        // Track the highest phrase frequency at this code so the
299        // promote decision can be made after the scan.
300        let mut max_phrase_freq: u64 = 0;
301        self.map.get_for_each(lower.as_bytes(), |word, value| {
302            if let Ok(s) = core::str::from_utf8(word) {
303                let (layer, freq) = unpack(value);
304                let base = layer.base() as f64;
305                let pref = prefs[layer.as_index()];
306                let is_single = s.chars().count() == 1;
307                if !is_single && freq > max_phrase_freq {
308                    max_phrase_freq = freq;
309                }
310                scratch.push((
311                    s.to_string(),
312                    base * pref + freq as f64,
313                    is_single,
314                    freq,
315                ));
316            }
317        });
318        scratch.sort_by(|a, b| {
319            let a_promote = full_code && a.2 && a.3 > max_phrase_freq;
320            let b_promote = full_code && b.2 && b.3 > max_phrase_freq;
321            if a_promote != b_promote {
322                return if a_promote {
323                    std::cmp::Ordering::Less
324                } else {
325                    std::cmp::Ordering::Greater
326                };
327            }
328            b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
329        });
330
331        out.reserve(scratch.len());
332        for (w, _, _, _) in scratch.drain(..) {
333            out.push(w);
334        }
335
336        // L0 pin: pull to position 0.
337        if let Ok(l0) = self.l0.read() {
338            if let Some(pref) = l0.pins.get(code) {
339                if let Some(idx) = out.iter().position(|w| w == pref) {
340                    if idx > 0 {
341                        let p = out.remove(idx);
342                        out.insert(0, p);
343                    }
344                }
345            }
346        }
347    }
348
349    /// Same as [`Self::lookup`] but exposes the `(word, layer, freq_score)` triples
350    /// in the FST's natural byte order. Callers that want to apply their
351    /// own ranking can start from this.
352    pub fn lookup_with_meta(&self, code: &str) -> Vec<(String, Layer, u64)> {
353        let lower = code.to_ascii_lowercase();
354        let mut results = Vec::new();
355        self.map.get_for_each(lower.as_bytes(), |word, value| {
356            if let Ok(s) = core::str::from_utf8(word) {
357                let (layer, freq) = unpack(value);
358                results.push((s.to_string(), layer, freq));
359            }
360        });
361        results
362    }
363
364    /// Prefix-prediction lookups: all `(word, freq, code_len)` triples where
365    /// `code` strictly extends `prefix` (i.e., `code_len > prefix.len()`).
366    /// Exact-code matches are excluded — those are not predictions.
367    ///
368    /// Returned tuples are ordered by `freq` descending, then `word` ascending
369    /// (FST byte order tiebreaker). Pins are NOT applied (per-code; prefix
370    /// scan can't generalize). Used by the composite dispatch to attach Wubi
371    /// prediction candidates in Mixed mode (e.g., `jj` → 日, 时, 旧 as
372    /// predictions in addition to exact 是/我).
373    ///
374    /// Raw frequency is returned (not score) so the caller can compose the
375    /// final score via `scoring::predict_score(base, freq, freq_mult,
376    /// proximity)` where `proximity = typed_len / code_len`.
377    pub fn prefix_predictions(&self, prefix: &str) -> Vec<(String, u64, usize)> {
378        let lower = prefix.to_ascii_lowercase();
379        let prefix_len = lower.len();
380        let mut results: Vec<(String, u64, usize)> = Vec::new();
381        self.map.prefix_for_each(lower.as_bytes(), |code_bytes, word_bytes, value| {
382            if code_bytes.len() <= prefix_len {
383                return;
384            }
385            if let (Ok(_code), Ok(word)) = (
386                core::str::from_utf8(code_bytes),
387                core::str::from_utf8(word_bytes),
388            ) {
389                let (_layer, freq) = unpack(value);
390                results.push((word.to_string(), freq, code_bytes.len()));
391            }
392        });
393        results.sort_by(|a, b| {
394            b.1.cmp(&a.1).then(a.0.cmp(&b.0))
395        });
396        results
397    }
398
399    /// Per-code lookup exposing raw `freq` alongside [`Layer`] — used by
400    /// the v1.4.7 composite hot path for the orthodox score
401    /// decomposition (split log_prior_q4 = Q4·ln(1+freq) from
402    /// log_likelihood_q4 = Q4·ln(layer.base()·pref·demotes)). The
403    /// existing [`Self::lookup_with_layer_into`] returns the combined
404    /// `layer.base()·pref + freq` score; for Q4 log-space additive
405    /// sort key (PLAN.md L1 probability-native ranking) we need the
406    /// two terms unfused.
407    ///
408    /// Rare-CJK filter NOT applied here (caller decides; consistent
409    /// with `lookup_with_layer_into`).
410    pub fn lookup_with_freq_layer_into(
411        &self,
412        code: &str,
413        out: &mut Vec<(String, Layer, u64)>,
414    ) {
415        out.clear();
416        let lower = code.to_ascii_lowercase();
417        self.map.get_for_each(lower.as_bytes(), |word, value| {
418            if let Ok(s) = core::str::from_utf8(word) {
419                let (layer, freq) = unpack(value);
420                out.push((s.to_string(), layer, freq));
421            }
422        });
423    }
424
425    /// Iterate every entry in the embedded dict, in FST traversal order
426    /// (canonical lexicographic by `code` bytes; for a given code the
427    /// internal layout sees `(code, word, packed_value)` triples). Used
428    /// by tools / snapshot binaries (e.g. v1.4.3 `idf-from-wubi-tables`)
429    /// that need to re-emit the full dict in another format. NOT a
430    /// runtime hot-path API — allocates one `(String, String)` pair per
431    /// entry (~135k for the embedded dict, ~5 MB allocation total).
432    ///
433    /// `layer` is the layered confidence band ([`Layer`]), `freq` is the
434    /// per-entry frequency score (post-`pack` / pre-`unpack`).
435    pub fn all_entries(&self) -> Vec<(String, String, Layer, u64)> {
436        let mut results: Vec<(String, String, Layer, u64)> = Vec::new();
437        self.map.prefix_for_each(b"", |code_bytes, word_bytes, value| {
438            if let (Ok(code), Ok(word)) = (
439                core::str::from_utf8(code_bytes),
440                core::str::from_utf8(word_bytes),
441            ) {
442                let (layer, freq) = unpack(value);
443                results.push((code.to_string(), word.to_string(), layer, freq));
444            }
445        });
446        results
447    }
448
449    /// All `(code, word)` pairs with code starting with `prefix`, ordered by
450    /// (effective L1 weight desc, code, word). Pins are NOT applied here —
451    /// they're per-code and don't generalize across a prefix scan.
452    pub fn prefix(&self, prefix: &str) -> Vec<(String, String)> {
453        let lower = prefix.to_ascii_lowercase();
454
455        let prefs = self
456            .l0
457            .read()
458            .map(|g| g.layer_prefs)
459            .unwrap_or(DEFAULT_LAYER_PREFS);
460
461        let mut results: Vec<(String, String, f64)> = Vec::new();
462        self.map.prefix_for_each(lower.as_bytes(), |code_bytes, word_bytes, value| {
463            if let (Ok(code), Ok(word)) = (
464                core::str::from_utf8(code_bytes),
465                core::str::from_utf8(word_bytes),
466            ) {
467                let (layer, freq) = unpack(value);
468                let score = layer.base() as f64 * prefs[layer.as_index()] + freq as f64;
469                results.push((code.to_string(), word.to_string(), score));
470            }
471        });
472        results.sort_by(|a, b| {
473            b.2.partial_cmp(&a.2)
474                .unwrap_or(std::cmp::Ordering::Equal)
475                .then(a.0.cmp(&b.0))
476                .then(a.1.cmp(&b.1))
477        });
478        results.into_iter().map(|(c, w, _)| (c, w)).collect()
479    }
480
481    // -------------------------------------------------------------------
482    // L0 mutation
483    // -------------------------------------------------------------------
484
485    /// Record that the user picked `word` for `code`. If this is the
486    /// `PROMOTE_THRESHOLD`-th consecutive pick, the word is auto-pinned and
487    /// all counters for `code` are cleared. Returns `true` iff this call
488    /// caused a promotion.
489    ///
490    /// Silently no-ops if `(code, word)` isn't in L1 (defends against the
491    /// host accidentally feeding us things the user couldn't actually have
492    /// selected from candidates).
493    pub fn record_pick(&self, code: &str, word: &str) -> bool {
494        if !self.exists_in_l1(code, word) {
495            return false;
496        }
497        let Ok(mut l0) = self.l0.write() else {
498            return false;
499        };
500        let key = (code.to_string(), word.to_string());
501        let count = l0.pick_counts.entry(key).or_insert(0);
502        *count += 1;
503        if *count >= PROMOTE_THRESHOLD {
504            l0.pins.insert(code.to_string(), word.to_string());
505            l0.pick_counts.retain(|(c, _), _| c != code);
506            return true;
507        }
508        false
509    }
510
511    /// Force-pin a word without going through the pick counter. Validates
512    /// against L1; returns whether the pin was applied.
513    pub fn pin(&self, code: &str, word: &str) -> bool {
514        if !self.exists_in_l1(code, word) {
515            return false;
516        }
517        let Ok(mut l0) = self.l0.write() else {
518            return false;
519        };
520        l0.pins.insert(code.to_string(), word.to_string());
521        l0.pick_counts.retain(|(c, _), _| c != code);
522        true
523    }
524
525    /// Drop the pin for `code` (if any) AND any pick counters for it. Returns
526    /// whether any state was removed.
527    pub fn forget(&self, code: &str) -> bool {
528        let Ok(mut l0) = self.l0.write() else {
529            return false;
530        };
531        let had_pin = l0.pins.remove(code).is_some();
532        let len_before = l0.pick_counts.len();
533        l0.pick_counts.retain(|(c, _), _| c != code);
534        had_pin || l0.pick_counts.len() != len_before
535    }
536
537    /// Set the multiplier for `layer`. Negative or non-finite values are
538    /// clamped to 0.0 (silently — they're nonsensical for ranking).
539    pub fn set_layer_pref(&self, layer: Layer, multiplier: f64) {
540        let m = if multiplier.is_finite() && multiplier >= 0.0 {
541            multiplier
542        } else {
543            0.0
544        };
545        if let Ok(mut l0) = self.l0.write() {
546            l0.layer_prefs[layer.as_index()] = m;
547        }
548    }
549
550    /// Current multiplier for `layer`. Returns the default value if the
551    /// internal lock is poisoned (treat as best-effort).
552    pub fn layer_pref(&self, layer: Layer) -> f64 {
553        self.l0
554            .read()
555            .map(|g| g.layer_prefs[layer.as_index()])
556            .unwrap_or(DEFAULT_LAYER_PREFS[layer.as_index()])
557    }
558
559    /// Snapshot the entire L0 layer (pins + pick counts + layer prefs) for
560    /// host-side persistence. Pair with [`WubiDict::import_l0`] on app
561    /// startup.
562    pub fn export_l0(&self) -> L0Snapshot {
563        let Ok(l0) = self.l0.read() else {
564            return L0Snapshot {
565                pins: Vec::new(),
566                pick_counts: Vec::new(),
567                layer_prefs: DEFAULT_LAYER_PREFS,
568            };
569        };
570        L0Snapshot {
571            pins: l0
572                .pins
573                .iter()
574                .map(|(k, v)| (k.clone(), v.clone()))
575                .collect(),
576            pick_counts: l0
577                .pick_counts
578                .iter()
579                .map(|((c, w), n)| (c.clone(), w.clone(), *n))
580                .collect(),
581            layer_prefs: l0.layer_prefs,
582        }
583    }
584
585    /// Replace the entire L0 layer with `snap`. Pins / pick_counts whose
586    /// `(code, word)` isn't in L1 are silently dropped (lexicon may have
587    /// evolved between versions). Returns the count of *accepted* pins.
588    pub fn import_l0(&self, snap: L0Snapshot) -> usize {
589        // Validate everything against L1 before touching state, then commit.
590        let valid_pins: Vec<(String, String)> = snap
591            .pins
592            .into_iter()
593            .filter(|(c, w)| self.exists_in_l1(c, w))
594            .collect();
595        let valid_counts: Vec<((String, String), u32)> = snap
596            .pick_counts
597            .into_iter()
598            .filter_map(|(c, w, n)| {
599                if self.exists_in_l1(&c, &w) {
600                    Some(((c, w), n))
601                } else {
602                    None
603                }
604            })
605            .collect();
606        let accepted = valid_pins.len();
607
608        let Ok(mut l0) = self.l0.write() else {
609            return 0;
610        };
611        l0.pins = valid_pins.into_iter().collect();
612        l0.pick_counts = valid_counts.into_iter().collect();
613        l0.layer_prefs = snap.layer_prefs;
614        accepted
615    }
616
617    fn exists_in_l1(&self, code: &str, word: &str) -> bool {
618        self.lookup_with_meta(code)
619            .iter()
620            .any(|(w, _, _)| w == word)
621    }
622}
623
#[cfg(test)]
mod tests {
    //! Tests against the embedded dictionary. Promotion tests are written
    //! in terms of `PROMOTE_THRESHOLD` (not a literal 3) so they keep
    //! passing when the threshold is overridden at build time.

    use super::*;

    /// First candidate returned by `lookup(code)`, if any.
    fn top(d: &WubiDict, code: &str) -> Option<String> {
        d.lookup(code).into_iter().next()
    }

    #[test]
    fn embedded_loads() {
        let d = WubiDict::embedded();
        assert!(d.len() >= 50);
    }

    #[test]
    fn jianma1_g_returns_yi_first() {
        let d = WubiDict::embedded();
        let words = d.lookup("g");
        assert_eq!(words.first().map(String::as_str), Some("一"));
    }

    #[test]
    fn khlg_phrase_outranks_extension_char() {
        let d = WubiDict::embedded();
        let words = d.lookup("khlg");
        let zg = words.iter().position(|w| w == "中国");
        let ext = words.iter().position(|w| w == "䟧");
        // Only meaningful when both entries are present in the lexicon.
        if let (Some(zg), Some(ext)) = (zg, ext) {
            assert!(zg < ext, "中国 should rank above 䟧, got {words:?}");
        }
    }

    #[test]
    fn rrrr_keyname_outranks_phrase() {
        let d = WubiDict::embedded();
        let words = d.lookup("rrrr");
        let bai = words.iter().position(|w| w == "白");
        let zhua = words.iter().position(|w| w == "抓拍");
        // Only meaningful when both entries are present in the lexicon.
        if let (Some(bai), Some(zhua)) = (bai, zhua) {
            assert!(bai < zhua, "白 should rank above 抓拍, got {words:?}");
        }
    }

    #[test]
    fn record_pick_promotes_after_threshold() {
        let d = WubiDict::embedded();
        // Every pick before the threshold-th one must not promote…
        for _ in 1..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "跑车"));
        }
        // …and the threshold-th pick does.
        assert!(d.record_pick("khlg", "跑车"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        assert_eq!(d.l0_pin_count(), 1);
        // Counters reset on promotion.
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn record_pick_resets_on_promotion_so_others_must_earn_3_again() {
        let d = WubiDict::embedded();
        // Promote 跑车 first.
        for _ in 0..PROMOTE_THRESHOLD {
            d.record_pick("khlg", "跑车");
        }
        // Picks of 中国 below the threshold shouldn't auto-flip.
        for _ in 1..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "中国"));
            assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        }
        // But the threshold-th pick of 中国 will dethrone 跑车.
        assert!(d.record_pick("khlg", "中国"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("中国"));
    }

    #[test]
    fn record_pick_rejects_unknown_word() {
        let d = WubiDict::embedded();
        for _ in 0..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "this_is_not_a_real_word"));
        }
        assert_eq!(d.l0_pin_count(), 0);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn pin_force_pins_without_counters() {
        let d = WubiDict::embedded();
        assert!(d.pin("khlg", "跑车"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
    }

    #[test]
    fn forget_clears_pin_and_counters() {
        let d = WubiDict::embedded();
        d.pin("khlg", "跑车");
        d.record_pick("khlg", "中国");
        assert!(d.forget("khlg"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("中国"));
        assert_eq!(d.l0_pin_count(), 0);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn layer_pref_can_demote_a_layer() {
        let d = WubiDict::embedded();
        // khlg has both a Phrase candidate (中国) and an Auto candidate (䟧).
        // With default prefs (Auto = 0.7) the phrase wins; zeroing Phrase and
        // boosting Auto to 5.0 should flip the order.
        d.set_layer_pref(Layer::Phrase, 0.0);
        d.set_layer_pref(Layer::Auto, 5.0);
        let words = d.lookup("khlg");
        let ext = words.iter().position(|w| w == "䟧");
        let zg = words.iter().position(|w| w == "中国");
        if let (Some(ext), Some(zg)) = (ext, zg) {
            assert!(
                ext < zg,
                "with Phrase=0 and Auto=5, 䟧 should outrank 中国, got {words:?}"
            );
        }
    }

    #[test]
    fn export_import_roundtrip() {
        let d = WubiDict::embedded();
        d.pin("khlg", "跑车");
        // With a threshold of 1 this single pick promotes straight to a pin
        // instead of leaving a pending counter.
        let promoted = d.record_pick("wqvb", "您好");
        let expected_pins = 1 + usize::from(promoted);
        d.set_layer_pref(Layer::Phrase, 1.5);
        let snap = d.export_l0();
        assert_eq!(snap.pins.len(), expected_pins);
        assert_eq!(snap.pick_counts.len(), usize::from(!promoted));
        assert!((snap.layer_prefs[Layer::Phrase.as_index()] - 1.5).abs() < f64::EPSILON);

        d.forget("khlg");
        d.forget("wqvb");
        d.set_layer_pref(Layer::Phrase, 1.0);
        assert_eq!(d.l0_pin_count(), 0);

        let accepted = d.import_l0(snap);
        assert_eq!(accepted, expected_pins);
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        assert!((d.layer_pref(Layer::Phrase) - 1.5).abs() < f64::EPSILON);
    }

    #[test]
    fn import_drops_invalid_entries() {
        let d = WubiDict::embedded();
        let snap = L0Snapshot {
            pins: vec![
                ("khlg".into(), "中国".into()),
                ("khlg".into(), "bogus".into()),
            ],
            pick_counts: vec![("khlg".into(), "ghost".into(), 2)],
            layer_prefs: DEFAULT_LAYER_PREFS,
        };
        let accepted = d.import_l0(snap);
        assert_eq!(accepted, 1);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn set_layer_pref_clamps_negatives_and_nan() {
        let d = WubiDict::embedded();
        d.set_layer_pref(Layer::Phrase, -3.0);
        assert_eq!(d.layer_pref(Layer::Phrase), 0.0);
        d.set_layer_pref(Layer::Phrase, f64::NAN);
        assert_eq!(d.layer_pref(Layer::Phrase), 0.0);
    }

    // Compile-time check — `PROMOTE_THRESHOLD` is a `const`, so a runtime
    // assertion would be trivially true (and clippy flags it). A `const _`
    // assertion fails at compile time if anyone ever sets it to 0.
    const _: () = assert!(PROMOTE_THRESHOLD >= 1);
}