inputx_wubi/dict.rs
1//! FST-backed Wubi dictionary with a two-tier ranking model.
2//!
3//! # L0 / L1+
4//!
5//! - **L1+** is the immutable lexicon: the embedded FST built at compile
6//! time, plus a per-entry [`Layer`] tag and a per-entry frequency score.
7//! Every entry's nominal weight is `LAYER_BASE[layer] + freq_score`. Future
8//! immutable layers (e.g., a per-app dictionary shipped by the host) can
9//! stack on top with the same shape.
10//! - **L0** is a thin, per-user override layer:
11//! - **Pinned candidates** — `code → preferred_word`. A pin moves that word
12//! to position 0 in `lookup`'s output, regardless of L1+ weight.
13//! - **Pick counters** — `(code, word) → u32`. [`WubiDict::record_pick`] increments
14//! the counter; once it hits [`PROMOTE_THRESHOLD`], the word is auto-
//! pinned and all counters for that code are reset (so a later, different
//! pick has to earn its [`PROMOTE_THRESHOLD`] votes from scratch — prevents thrashing).
17//! - **Layer preferences** — `Layer → f64` multiplier (default 1.0, with
18//! `Auto = 0.7` so extension characters don't dominate). Applied to the
19//! L1 nominal weight at sort time. Settable via API; **not** auto-tuned.
20//!
21//! Layer prefs reorder *within* L1; pins override the resulting ordering at
22//! position 0. So in steady state most codes have empty L0 and the layer
23//! base ordering wins (hence "L0 default ≈ L1 default").
24
25use std::collections::HashMap;
26use std::sync::RwLock;
27
28use inputx_fsa::Dict;
29
30use crate::layer::{DEFAULT_LAYER_PREFS, LAYER_COUNT, Layer, unpack};
31
/// Raw bytes of the `wubi86.dict` file found in the build's `OUT_DIR`,
/// embedded into the binary at compile time via `include_bytes!`.
/// [`WubiDict::embedded`] hands these bytes to `Dict::new`.
const DICT_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/wubi86.dict"));
33
34/// Number of consecutive picks of the same `(code, word)` required before
35/// L0 auto-pins it. Defaults to 3; can be overridden at build time via the
36/// `WUBI_PROMOTE_THRESHOLD` env var (developer escape hatch — not exposed
37/// to end users).
38pub const PROMOTE_THRESHOLD: u32 = parse_threshold_const();
39
40const fn parse_threshold_const() -> u32 {
41 match option_env!("WUBI_PROMOTE_THRESHOLD") {
42 Some(s) => parse_u32_const(s),
43 None => 3,
44 }
45}
46
/// Parses a string of ASCII decimal digits into a `u32`, usable in `const`
/// context (which is why it is a hand-rolled loop rather than `str::parse`).
///
/// Leading zeros are accepted (`"007"` parses as 7).
///
/// # Panics
///
/// Panics (a compile error when evaluated in a `const`) if `s` is empty,
/// contains a byte that is not an ASCII digit, does not fit in a `u32`, or
/// parses to zero.
const fn parse_u32_const(s: &str) -> u32 {
    let bytes = s.as_bytes();
    if bytes.is_empty() {
        panic!("WUBI_PROMOTE_THRESHOLD must not be empty");
    }
    let mut value: u32 = 0;
    let mut i = 0;
    // `for` loops and iterators are not available in `const fn`, hence the
    // manual index loop.
    while i < bytes.len() {
        let b = bytes[i];
        if !b.is_ascii_digit() {
            panic!("WUBI_PROMOTE_THRESHOLD must be ASCII digits");
        }
        // Lossless widening of a single decimal digit (0..=9).
        let digit = (b - b'0') as u32;
        // Checked arithmetic so an oversized value is rejected with a clear
        // message in every build profile instead of wrapping when this
        // function is ever evaluated at runtime without overflow checks.
        value = match value.checked_mul(10) {
            Some(scaled) => match scaled.checked_add(digit) {
                Some(next) => next,
                None => panic!("WUBI_PROMOTE_THRESHOLD must fit in a u32"),
            },
            None => panic!("WUBI_PROMOTE_THRESHOLD must fit in a u32"),
        };
        i += 1;
    }
    if value == 0 {
        panic!("WUBI_PROMOTE_THRESHOLD must be >= 1");
    }
    value
}
67
68/// Persistent state of the L0 layer. Caller serializes / deserializes this
69/// however it likes (TOML, MessagePack, sqlite, …) — the crate intentionally
70/// has no `serde` dependency.
71#[derive(Debug, Clone)]
72pub struct L0Snapshot {
73 /// `(code, word)` pairs the user has pinned (manually or via `record_pick`
74 /// reaching threshold).
75 pub pins: Vec<(String, String)>,
76 /// `(code, word, count)` — pending pick counts that haven't yet reached
77 /// `PROMOTE_THRESHOLD`. Snapshot semantics are best-effort; a count of
78 /// `threshold - 1` restored after restart needs only one more pick to
79 /// promote.
80 pub pick_counts: Vec<(String, String, u32)>,
81 /// Layer multipliers, indexed by `Layer as usize`.
82 pub layer_prefs: [f64; LAYER_COUNT],
83}
84
85#[derive(Default)]
86struct L0Inner {
87 pins: HashMap<String, String>,
88 pick_counts: HashMap<(String, String), u32>,
89 layer_prefs: [f64; LAYER_COUNT],
90}
91
92impl L0Inner {
93 fn new() -> Self {
94 Self {
95 pins: HashMap::new(),
96 pick_counts: HashMap::new(),
97 layer_prefs: DEFAULT_LAYER_PREFS,
98 }
99 }
100}
101
102/// The Wubi 86 dictionary: an embedded FST plus a mutable L0 layer for
103/// per-user preference learning.
104///
105/// All read methods take `&self`. L0 mutations (`record_pick`, `pin`,
106/// `forget`, `set_layer_pref`, `import_l0`) also take `&self` — interior
107/// mutability via `RwLock` lets a single shared instance feed every
108/// concurrent IME / WASM session without exposing the lock to the caller.
109pub struct WubiDict {
110 map: Dict<&'static [u8]>,
111 l0: RwLock<L0Inner>,
112}
113
114impl WubiDict {
115 /// Construct the dictionary from the embedded FST. Cheap (validates the
116 /// FST header and initializes an empty L0); callers should still cache
117 /// the instance and reuse it for the program lifetime.
118 pub fn embedded() -> Self {
119 Self {
120 map: Dict::new(DICT_BYTES).expect("invalid embedded wubi dict"),
121 l0: RwLock::new(L0Inner::new()),
122 }
123 }
124
125 /// Number of distinct codes in the dictionary. (The two-level `Dict`
126 /// counts codes, not total (code, word) pairs.)
127 pub fn len(&self) -> usize {
128 self.map.len() as usize
129 }
130
131 /// `true` iff the dictionary has zero codes.
132 pub fn is_empty(&self) -> bool {
133 self.map.is_empty()
134 }
135
136 /// Number of L0 pinned codes.
137 pub fn l0_pin_count(&self) -> usize {
138 self.l0.read().map(|g| g.pins.len()).unwrap_or(0)
139 }
140
141 /// Number of distinct `(code, word)` pairs with pending pick counters.
142 pub fn l0_pending_count(&self) -> usize {
143 self.l0.read().map(|g| g.pick_counts.len()).unwrap_or(0)
144 }
145
146 // -------------------------------------------------------------------
147 // Lookups
148 // -------------------------------------------------------------------
149
150 /// Words for the exact code, ordered by:
151 /// 1. L0 pin (if any) at index 0,
152 /// 2. then `LAYER_BASE[layer] * layer_prefs[layer] + freq_score` desc,
153 /// 3. then FST byte order (stable tiebreaker).
154 ///
155 /// Allocates a fresh `Vec`. Hot-loop callers (the IME's per-keystroke
156 /// candidate refresh) should use [`Self::lookup_into`] to reuse a
157 /// caller-owned buffer.
158 pub fn lookup(&self, code: &str) -> Vec<String> {
159 let mut out = Vec::new();
160 self.lookup_into(code, &mut out);
161 out
162 }
163
164 /// Scored lookup: same ordering as `lookup_into` (layer / freq / promote
165 /// rules + L0 pin), but emits `(word, score)` tuples so the cross-engine
166 /// merge layer can do a single unified sort instead of hard-coding which
167 /// engine wins. Score reflects:
168 /// * layer.base() × layer_prefs (jianma1 = 1e6, …)
169 /// * + freq (corpus weight)
170 /// * × 100.0 if full-code single-char promotion fires
171 /// (see lookup_into doc for the rule)
172 /// * × 1000.0 if the candidate is L0-pinned
173 /// (must dominate any natural score)
174 ///
175 /// The post-multipliers keep wubi simcodes and L0 pins on top across
176 /// the cross-engine merge.
177 pub fn lookup_with_scores_into(&self, code: &str, out: &mut Vec<(String, f64)>) {
178 let mut layered = Vec::with_capacity(out.capacity());
179 self.lookup_with_layer_into(code, &mut layered);
180 out.clear();
181 out.reserve(layered.len());
182 for (w, score, _layer) in layered.drain(..) {
183 out.push((w, score));
184 }
185 }
186
187 /// Layer-aware scored lookup: identical to `lookup_with_scores_into`
188 /// but each candidate also carries its origin `Layer`. The composite
189 /// engine uses the layer tag to make context-aware ranking decisions
190 /// — e.g. demoting low-confidence Auto / Phrase entries at short
191 /// pinyin-shaped input while keeping high-confidence Jianma1/2/3 +
192 /// Zigen simcodes at full strength (the 伙 vs 嶙 distinction —
193 /// 伙 is Jianma2 wubi-simcode and must lead at #0 for its code,
194 /// 嶙 is typically Auto-layer and should not displace pinyin top).
195 pub fn lookup_with_layer_into(
196 &self,
197 code: &str,
198 out: &mut Vec<(String, f64, Layer)>,
199 ) {
200 out.clear();
201 let lower = code.to_ascii_lowercase();
202
203 let prefs = self
204 .l0
205 .read()
206 .map(|g| g.layer_prefs)
207 .unwrap_or(DEFAULT_LAYER_PREFS);
208
209 let full_code = lower.len() == 4;
210 // Tuple: (word, score, is_single, freq, layer).
211 let mut scratch: Vec<(String, f64, bool, u64, Layer)> = Vec::with_capacity(8);
212 let mut max_phrase_freq: u64 = 0;
213 self.map.get_for_each(lower.as_bytes(), |word, value| {
214 if let Ok(s) = core::str::from_utf8(word) {
215 let (layer, freq) = unpack(value);
216 let base = layer.base() as f64;
217 let pref = prefs[layer.as_index()];
218 let is_single = s.chars().count() == 1;
219 if !is_single && freq > max_phrase_freq {
220 max_phrase_freq = freq;
221 }
222 scratch.push((s.to_string(), base * pref + freq as f64, is_single, freq, layer));
223 }
224 });
225
226 // Apply full-code single-char promote (lifts qualifying single
227 // chars above the same-code phrases) and L0 pin (lifts the pinned
228 // word above natural sort).
229 let pinned: Option<String> = self.l0.read().ok().and_then(|g| g.pins.get(&lower).cloned());
230 for e in scratch.iter_mut() {
231 let promote = full_code && e.2 && e.3 > max_phrase_freq;
232 if promote {
233 e.1 *= 100.0;
234 }
235 if let Some(p) = &pinned
236 && &e.0 == p
237 {
238 e.1 *= 1000.0;
239 }
240 }
241 scratch.sort_by(|a, b| {
242 b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
243 });
244
245 out.reserve(scratch.len());
246 for (w, score, _, _, layer) in scratch.drain(..) {
247 out.push((w, score, layer));
248 }
249 }
250
251 /// Same as [`Self::lookup`] but writes into a caller-owned buffer.
252 /// `out` is cleared (capacity preserved) on entry.
253 ///
254 /// Reuses the result buffer's allocation across calls — a measurable win
255 /// for the IME's per-keystroke candidate refresh, where the candidate
256 /// list is rebuilt thousands of times per typing session. Per-candidate
257 /// `String`s are still freshly allocated (the FST stream yields owned
258 /// bytes; the crate doesn't expose `&'static str` because the stream's
259 /// borrow doesn't outlive the call).
260 pub fn lookup_into(&self, code: &str, out: &mut Vec<String>) {
261 out.clear();
262
263 let lower = code.to_ascii_lowercase();
264 let prefix_len = lower.len();
265
266 let prefs = self
267 .l0
268 .read()
269 .map(|g| g.layer_prefs)
270 .unwrap_or(DEFAULT_LAYER_PREFS);
271
272 // Score during the FST scan; reuse a small scratch Vec.
273 // Tuple: (word, score, is_single_char, freq, is_phrase).
274 //
275 // The wubi-86 "full-code single-char wins" rule applied here:
276 // at a fully-typed 4-letter code, a single-char entry whose
277 // corpus frequency *exceeds the highest phrase frequency at
278 // the same code* gets promoted above all phrases. Otherwise
279 // the standard score ordering (layer_base × pref + freq) wins.
280 //
281 // Why this shape (relative freq comparison, not absolute):
282 //
283 // - gmww 两 (Auto, freq 37372) vs 两败俱伤 (Phrase, freq 15272):
284 // 37372 > 15272 → 两 promoted. ✓
285 //
286 // - wcng 鹟 (Auto, freq 5961) vs 公司 (Phrase, freq 42817):
287 // 5961 < 42817 → 鹟 stays at its natural Auto score (low),
288 // 公司 wins on layer_base alone. ✓
289 //
290 // - khlg 䟧 (Auto, freq 0) vs 中国 (Phrase, freq 44985):
291 // 0 < 44985 → 䟧 stays low, 中国 wins. ✓
292 //
293 // The earlier absolute-`freq > 0` gate worked for gmww but
294 // wrongly promoted any uncommon-but-corpus-present single char
295 // over a popular phrase (the wcng case the user just flagged).
296 let full_code = prefix_len == 4;
297 let mut scratch: Vec<(String, f64, bool, u64)> = Vec::with_capacity(8);
298 // Track the highest phrase frequency at this code so the
299 // promote decision can be made after the scan.
300 let mut max_phrase_freq: u64 = 0;
301 self.map.get_for_each(lower.as_bytes(), |word, value| {
302 if let Ok(s) = core::str::from_utf8(word) {
303 let (layer, freq) = unpack(value);
304 let base = layer.base() as f64;
305 let pref = prefs[layer.as_index()];
306 let is_single = s.chars().count() == 1;
307 if !is_single && freq > max_phrase_freq {
308 max_phrase_freq = freq;
309 }
310 scratch.push((
311 s.to_string(),
312 base * pref + freq as f64,
313 is_single,
314 freq,
315 ));
316 }
317 });
318 scratch.sort_by(|a, b| {
319 let a_promote = full_code && a.2 && a.3 > max_phrase_freq;
320 let b_promote = full_code && b.2 && b.3 > max_phrase_freq;
321 if a_promote != b_promote {
322 return if a_promote {
323 std::cmp::Ordering::Less
324 } else {
325 std::cmp::Ordering::Greater
326 };
327 }
328 b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
329 });
330
331 out.reserve(scratch.len());
332 for (w, _, _, _) in scratch.drain(..) {
333 out.push(w);
334 }
335
336 // L0 pin: pull to position 0.
337 if let Ok(l0) = self.l0.read() {
338 if let Some(pref) = l0.pins.get(code) {
339 if let Some(idx) = out.iter().position(|w| w == pref) {
340 if idx > 0 {
341 let p = out.remove(idx);
342 out.insert(0, p);
343 }
344 }
345 }
346 }
347 }
348
349 /// Same as [`Self::lookup`] but exposes the `(word, layer, freq_score)` triples
350 /// in the FST's natural byte order. Callers that want to apply their
351 /// own ranking can start from this.
352 pub fn lookup_with_meta(&self, code: &str) -> Vec<(String, Layer, u64)> {
353 let lower = code.to_ascii_lowercase();
354 let mut results = Vec::new();
355 self.map.get_for_each(lower.as_bytes(), |word, value| {
356 if let Ok(s) = core::str::from_utf8(word) {
357 let (layer, freq) = unpack(value);
358 results.push((s.to_string(), layer, freq));
359 }
360 });
361 results
362 }
363
364 /// Prefix-prediction lookups: all `(word, freq, code_len)` triples where
365 /// `code` strictly extends `prefix` (i.e., `code_len > prefix.len()`).
366 /// Exact-code matches are excluded — those are not predictions.
367 ///
368 /// Returned tuples are ordered by `freq` descending, then `word` ascending
369 /// (FST byte order tiebreaker). Pins are NOT applied (per-code; prefix
370 /// scan can't generalize). Used by the composite dispatch to attach Wubi
371 /// prediction candidates in Mixed mode (e.g., `jj` → 日, 时, 旧 as
372 /// predictions in addition to exact 是/我).
373 ///
374 /// Raw frequency is returned (not score) so the caller can compose the
375 /// final score via `scoring::predict_score(base, freq, freq_mult,
376 /// proximity)` where `proximity = typed_len / code_len`.
377 pub fn prefix_predictions(&self, prefix: &str) -> Vec<(String, u64, usize)> {
378 let lower = prefix.to_ascii_lowercase();
379 let prefix_len = lower.len();
380 let mut results: Vec<(String, u64, usize)> = Vec::new();
381 self.map.prefix_for_each(lower.as_bytes(), |code_bytes, word_bytes, value| {
382 if code_bytes.len() <= prefix_len {
383 return;
384 }
385 if let (Ok(_code), Ok(word)) = (
386 core::str::from_utf8(code_bytes),
387 core::str::from_utf8(word_bytes),
388 ) {
389 let (_layer, freq) = unpack(value);
390 results.push((word.to_string(), freq, code_bytes.len()));
391 }
392 });
393 results.sort_by(|a, b| {
394 b.1.cmp(&a.1).then(a.0.cmp(&b.0))
395 });
396 results
397 }
398
399 /// Per-code lookup exposing raw `freq` alongside [`Layer`] — used by
400 /// the v1.4.7 composite hot path for the orthodox score
401 /// decomposition (split log_prior_q4 = Q4·ln(1+freq) from
402 /// log_likelihood_q4 = Q4·ln(layer.base()·pref·demotes)). The
403 /// existing [`Self::lookup_with_layer_into`] returns the combined
404 /// `layer.base()·pref + freq` score; for Q4 log-space additive
405 /// sort key (PLAN.md L1 probability-native ranking) we need the
406 /// two terms unfused.
407 ///
408 /// Rare-CJK filter NOT applied here (caller decides; consistent
409 /// with `lookup_with_layer_into`).
410 pub fn lookup_with_freq_layer_into(
411 &self,
412 code: &str,
413 out: &mut Vec<(String, Layer, u64)>,
414 ) {
415 out.clear();
416 let lower = code.to_ascii_lowercase();
417 self.map.get_for_each(lower.as_bytes(), |word, value| {
418 if let Ok(s) = core::str::from_utf8(word) {
419 let (layer, freq) = unpack(value);
420 out.push((s.to_string(), layer, freq));
421 }
422 });
423 }
424
425 /// Iterate every entry in the embedded dict, in FST traversal order
426 /// (canonical lexicographic by `code` bytes; for a given code the
427 /// internal layout sees `(code, word, packed_value)` triples). Used
428 /// by tools / snapshot binaries (e.g. v1.4.3 `idf-from-wubi-tables`)
429 /// that need to re-emit the full dict in another format. NOT a
430 /// runtime hot-path API — allocates one `(String, String)` pair per
431 /// entry (~135k for the embedded dict, ~5 MB allocation total).
432 ///
433 /// `layer` is the layered confidence band ([`Layer`]), `freq` is the
434 /// per-entry frequency score (post-`pack` / pre-`unpack`).
435 pub fn all_entries(&self) -> Vec<(String, String, Layer, u64)> {
436 let mut results: Vec<(String, String, Layer, u64)> = Vec::new();
437 self.map.prefix_for_each(b"", |code_bytes, word_bytes, value| {
438 if let (Ok(code), Ok(word)) = (
439 core::str::from_utf8(code_bytes),
440 core::str::from_utf8(word_bytes),
441 ) {
442 let (layer, freq) = unpack(value);
443 results.push((code.to_string(), word.to_string(), layer, freq));
444 }
445 });
446 results
447 }
448
449 /// All `(code, word)` pairs with code starting with `prefix`, ordered by
450 /// (effective L1 weight desc, code, word). Pins are NOT applied here —
451 /// they're per-code and don't generalize across a prefix scan.
452 pub fn prefix(&self, prefix: &str) -> Vec<(String, String)> {
453 let lower = prefix.to_ascii_lowercase();
454
455 let prefs = self
456 .l0
457 .read()
458 .map(|g| g.layer_prefs)
459 .unwrap_or(DEFAULT_LAYER_PREFS);
460
461 let mut results: Vec<(String, String, f64)> = Vec::new();
462 self.map.prefix_for_each(lower.as_bytes(), |code_bytes, word_bytes, value| {
463 if let (Ok(code), Ok(word)) = (
464 core::str::from_utf8(code_bytes),
465 core::str::from_utf8(word_bytes),
466 ) {
467 let (layer, freq) = unpack(value);
468 let score = layer.base() as f64 * prefs[layer.as_index()] + freq as f64;
469 results.push((code.to_string(), word.to_string(), score));
470 }
471 });
472 results.sort_by(|a, b| {
473 b.2.partial_cmp(&a.2)
474 .unwrap_or(std::cmp::Ordering::Equal)
475 .then(a.0.cmp(&b.0))
476 .then(a.1.cmp(&b.1))
477 });
478 results.into_iter().map(|(c, w, _)| (c, w)).collect()
479 }
480
481 // -------------------------------------------------------------------
482 // L0 mutation
483 // -------------------------------------------------------------------
484
485 /// Record that the user picked `word` for `code`. If this is the
486 /// `PROMOTE_THRESHOLD`-th consecutive pick, the word is auto-pinned and
487 /// all counters for `code` are cleared. Returns `true` iff this call
488 /// caused a promotion.
489 ///
490 /// Silently no-ops if `(code, word)` isn't in L1 (defends against the
491 /// host accidentally feeding us things the user couldn't actually have
492 /// selected from candidates).
493 pub fn record_pick(&self, code: &str, word: &str) -> bool {
494 if !self.exists_in_l1(code, word) {
495 return false;
496 }
497 let Ok(mut l0) = self.l0.write() else {
498 return false;
499 };
500 let key = (code.to_string(), word.to_string());
501 let count = l0.pick_counts.entry(key).or_insert(0);
502 *count += 1;
503 if *count >= PROMOTE_THRESHOLD {
504 l0.pins.insert(code.to_string(), word.to_string());
505 l0.pick_counts.retain(|(c, _), _| c != code);
506 return true;
507 }
508 false
509 }
510
511 /// Force-pin a word without going through the pick counter. Validates
512 /// against L1; returns whether the pin was applied.
513 pub fn pin(&self, code: &str, word: &str) -> bool {
514 if !self.exists_in_l1(code, word) {
515 return false;
516 }
517 let Ok(mut l0) = self.l0.write() else {
518 return false;
519 };
520 l0.pins.insert(code.to_string(), word.to_string());
521 l0.pick_counts.retain(|(c, _), _| c != code);
522 true
523 }
524
525 /// Drop the pin for `code` (if any) AND any pick counters for it. Returns
526 /// whether any state was removed.
527 pub fn forget(&self, code: &str) -> bool {
528 let Ok(mut l0) = self.l0.write() else {
529 return false;
530 };
531 let had_pin = l0.pins.remove(code).is_some();
532 let len_before = l0.pick_counts.len();
533 l0.pick_counts.retain(|(c, _), _| c != code);
534 had_pin || l0.pick_counts.len() != len_before
535 }
536
537 /// Set the multiplier for `layer`. Negative or non-finite values are
538 /// clamped to 0.0 (silently — they're nonsensical for ranking).
539 pub fn set_layer_pref(&self, layer: Layer, multiplier: f64) {
540 let m = if multiplier.is_finite() && multiplier >= 0.0 {
541 multiplier
542 } else {
543 0.0
544 };
545 if let Ok(mut l0) = self.l0.write() {
546 l0.layer_prefs[layer.as_index()] = m;
547 }
548 }
549
550 /// Current multiplier for `layer`. Returns the default value if the
551 /// internal lock is poisoned (treat as best-effort).
552 pub fn layer_pref(&self, layer: Layer) -> f64 {
553 self.l0
554 .read()
555 .map(|g| g.layer_prefs[layer.as_index()])
556 .unwrap_or(DEFAULT_LAYER_PREFS[layer.as_index()])
557 }
558
559 /// Snapshot the entire L0 layer (pins + pick counts + layer prefs) for
560 /// host-side persistence. Pair with [`WubiDict::import_l0`] on app
561 /// startup.
562 pub fn export_l0(&self) -> L0Snapshot {
563 let Ok(l0) = self.l0.read() else {
564 return L0Snapshot {
565 pins: Vec::new(),
566 pick_counts: Vec::new(),
567 layer_prefs: DEFAULT_LAYER_PREFS,
568 };
569 };
570 L0Snapshot {
571 pins: l0
572 .pins
573 .iter()
574 .map(|(k, v)| (k.clone(), v.clone()))
575 .collect(),
576 pick_counts: l0
577 .pick_counts
578 .iter()
579 .map(|((c, w), n)| (c.clone(), w.clone(), *n))
580 .collect(),
581 layer_prefs: l0.layer_prefs,
582 }
583 }
584
585 /// Replace the entire L0 layer with `snap`. Pins / pick_counts whose
586 /// `(code, word)` isn't in L1 are silently dropped (lexicon may have
587 /// evolved between versions). Returns the count of *accepted* pins.
588 pub fn import_l0(&self, snap: L0Snapshot) -> usize {
589 // Validate everything against L1 before touching state, then commit.
590 let valid_pins: Vec<(String, String)> = snap
591 .pins
592 .into_iter()
593 .filter(|(c, w)| self.exists_in_l1(c, w))
594 .collect();
595 let valid_counts: Vec<((String, String), u32)> = snap
596 .pick_counts
597 .into_iter()
598 .filter_map(|(c, w, n)| {
599 if self.exists_in_l1(&c, &w) {
600 Some(((c, w), n))
601 } else {
602 None
603 }
604 })
605 .collect();
606 let accepted = valid_pins.len();
607
608 let Ok(mut l0) = self.l0.write() else {
609 return 0;
610 };
611 l0.pins = valid_pins.into_iter().collect();
612 l0.pick_counts = valid_counts.into_iter().collect();
613 l0.layer_prefs = snap.layer_prefs;
614 accepted
615 }
616
617 fn exists_in_l1(&self, code: &str, word: &str) -> bool {
618 self.lookup_with_meta(code)
619 .iter()
620 .any(|(w, _, _)| w == word)
621 }
622}
623
#[cfg(test)]
mod tests {
    use super::*;

    /// First ranked candidate for `code`, if any.
    fn top(d: &WubiDict, code: &str) -> Option<String> {
        d.lookup(code).into_iter().next()
    }

    #[test]
    fn embedded_loads() {
        let d = WubiDict::embedded();
        assert!(d.len() >= 50);
    }

    #[test]
    fn jianma1_g_returns_yi_first() {
        let d = WubiDict::embedded();
        let words = d.lookup("g");
        assert_eq!(words.first().map(String::as_str), Some("一"));
    }

    // The ordering checks below only assert when both words are present at
    // the code, so they stay valid if the embedded lexicon drops one of them.

    #[test]
    fn khlg_phrase_outranks_extension_char() {
        let d = WubiDict::embedded();
        let words = d.lookup("khlg");
        let zg = words.iter().position(|w| w == "中国");
        let ext = words.iter().position(|w| w == "䟧");
        if let (Some(zg), Some(ext)) = (zg, ext) {
            assert!(zg < ext, "中国 should rank above 䟧, got {words:?}");
        }
    }

    #[test]
    fn rrrr_keyname_outranks_phrase() {
        let d = WubiDict::embedded();
        let words = d.lookup("rrrr");
        let bai = words.iter().position(|w| w == "白");
        let zhua = words.iter().position(|w| w == "抓拍");
        if let (Some(bai), Some(zhua)) = (bai, zhua) {
            assert!(bai < zhua, "白 should rank above 抓拍, got {words:?}");
        }
    }

    #[test]
    fn record_pick_promotes_after_threshold() {
        let d = WubiDict::embedded();
        // `PROMOTE_THRESHOLD` is overridable at build time, so count picks
        // against the constant rather than a literal 3.
        for _ in 1..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "跑车"));
        }
        assert!(d.record_pick("khlg", "跑车"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        assert_eq!(d.l0_pin_count(), 1);
        // Counters reset on promotion.
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn record_pick_resets_on_promotion_so_others_must_earn_threshold_again() {
        let d = WubiDict::embedded();
        // Promote 跑车 first.
        for _ in 0..PROMOTE_THRESHOLD {
            d.record_pick("khlg", "跑车");
        }
        // Picks of 中国 below the threshold must not flip the pin.
        for _ in 1..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "中国"));
            assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        }
        // The threshold-th pick of 中国 dethrones 跑车.
        assert!(d.record_pick("khlg", "中国"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("中国"));
    }

    #[test]
    fn record_pick_rejects_unknown_word() {
        let d = WubiDict::embedded();
        for _ in 0..PROMOTE_THRESHOLD {
            assert!(!d.record_pick("khlg", "this_is_not_a_real_word"));
        }
        assert_eq!(d.l0_pin_count(), 0);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn pin_force_pins_without_counters() {
        let d = WubiDict::embedded();
        assert!(d.pin("khlg", "跑车"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
    }

    #[test]
    fn forget_clears_pin_and_counters() {
        let d = WubiDict::embedded();
        d.pin("khlg", "跑车");
        d.record_pick("khlg", "中国");
        assert!(d.forget("khlg"));
        assert_eq!(top(&d, "khlg").as_deref(), Some("中国"));
        assert_eq!(d.l0_pin_count(), 0);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn layer_pref_can_demote_a_layer() {
        let d = WubiDict::embedded();
        // Phrase normally beats Auto. khlg has 中国 (Phrase) and 䟧 (Auto);
        // zero out Phrase and boost Auto to flip their order.
        d.set_layer_pref(Layer::Phrase, 0.0);
        d.set_layer_pref(Layer::Auto, 5.0);
        let words = d.lookup("khlg");
        let ext = words.iter().position(|w| w == "䟧");
        let zg = words.iter().position(|w| w == "中国");
        if let (Some(ext), Some(zg)) = (ext, zg) {
            assert!(
                ext < zg,
                "with Phrase=0 and Auto=5, 䟧 should outrank 中国, got {words:?}"
            );
        }
    }

    #[test]
    fn export_import_roundtrip() {
        let d = WubiDict::embedded();
        d.pin("khlg", "跑车");
        d.record_pick("wqvb", "您好");
        d.set_layer_pref(Layer::Phrase, 1.5);

        // With a threshold of 1 the single pick above is promoted straight to
        // a pin instead of being left as a pending counter.
        let single_pick_promotes = PROMOTE_THRESHOLD == 1;
        let expected_pins = if single_pick_promotes { 2 } else { 1 };
        let expected_pending = if single_pick_promotes { 0 } else { 1 };

        let snap = d.export_l0();
        assert_eq!(snap.pins.len(), expected_pins);
        assert_eq!(snap.pick_counts.len(), expected_pending);
        assert!((snap.layer_prefs[Layer::Phrase.as_index()] - 1.5).abs() < f64::EPSILON);

        d.forget("khlg");
        d.forget("wqvb");
        d.set_layer_pref(Layer::Phrase, 1.0);
        assert_eq!(d.l0_pin_count(), 0);

        let accepted = d.import_l0(snap);
        assert_eq!(accepted, expected_pins);
        assert_eq!(top(&d, "khlg").as_deref(), Some("跑车"));
        assert!((d.layer_pref(Layer::Phrase) - 1.5).abs() < f64::EPSILON);
    }

    #[test]
    fn import_drops_invalid_entries() {
        let d = WubiDict::embedded();
        let snap = L0Snapshot {
            pins: vec![
                ("khlg".into(), "中国".into()),
                ("khlg".into(), "bogus".into()),
            ],
            pick_counts: vec![("khlg".into(), "ghost".into(), 2)],
            layer_prefs: DEFAULT_LAYER_PREFS,
        };
        let accepted = d.import_l0(snap);
        assert_eq!(accepted, 1);
        assert_eq!(d.l0_pending_count(), 0);
    }

    #[test]
    fn set_layer_pref_clamps_negatives_and_nan() {
        let d = WubiDict::embedded();
        d.set_layer_pref(Layer::Phrase, -3.0);
        assert_eq!(d.layer_pref(Layer::Phrase), 0.0);
        d.set_layer_pref(Layer::Phrase, f64::NAN);
        assert_eq!(d.layer_pref(Layer::Phrase), 0.0);
    }

    // Compile-time check — `PROMOTE_THRESHOLD` is a `const`, so a runtime
    // assertion would be trivially true (and clippy flags it). A `const _`
    // assertion fails at compile time if anyone ever sets it to 0.
    const _: () = assert!(PROMOTE_THRESHOLD >= 1);
}