Skip to main content

inputx_wubi_data/
lib.rs

//! `inputx-wubi-data` — embedded Wubi 86 IDFv1 dict + lookup helpers
//! for the [`inputx-wubi`](https://crates.io/crates/inputx-wubi)
//! engine, packaged as a publishable stone.
//!
//! Successor to [`inputx-wubi-cement`](https://crates.io/crates/inputx-wubi-cement)
//! under the v1.5 D11 taxonomy correction (2026-05): cement = an
//! application's source code (your own `wubi.rs` / `engine.rs`),
//! NOT a published crate. The historical `-cement`-suffix crate is
//! deprecated and re-exports from this crate for backward compatibility.
10//!
//! ## What's in the box
//!
//! - [`EMBEDDED_WUBI_IDF`] — IDFv1 binary dict blob with the wubi
//!   `Layer` enum index encoded in `EntryFlags::engine_tag()`
//!   (v1.4.7 sub-phase A4 step 2).
//! - [`wubi_idf_reader`] — process-global `OnceLock<IdfReader>` over
//!   the embedded blob; amortizes the 4 MB parse + sha256 verify
//!   across the process lifetime.
//! - [`layer_from_idf_tag`] — reverse of `Layer::as_u8`; decodes an
//!   IDF entry's engine_tag back into the originating wubi `Layer`.
//! - `table` module — process-global stateful `WubiDict` cache,
//!   per-code lookup helpers (`lookup`, `lookup_with_scores`,
//!   `lookup_with_layer`, `lookup_with_freq_layer`,
//!   `prefix_predictions`, `record_pick`, `export_l0`, `import_l0`),
//!   the rare-CJK toggle (`set_show_rare` / `show_rare`), and the
//!   `warmup` helper; `is_displayable` is re-exported from it as well.
27//!
//! ## What's NOT here
//!
//! - **Stateful `WubiEngine`** (buffer / `handle_letter` /
//!   auto-commit / commit_index / L0 pin state machine) — that
//!   classifies as application cement per the v1.5 D11 correction
//!   and now lives in the Inputx monorepo's
//!   [`inputx-core/src/wubi/engine.rs`](https://github.com/goliajp/inputx/blob/develop/core/crates/inputx-core/src/wubi/engine.rs).
//!   IME implementers copying this stone are expected to bring their
//!   own state machine matching their UI ergonomics.
37
38mod table;
39
40use std::sync::OnceLock;
41
42use inputx_dict_format::IdfReader;
43
44pub use table::{
45    export_l0, import_l0, is_displayable, lookup, lookup_with_freq_layer,
46    lookup_with_layer, lookup_with_scores, prefix_predictions, record_pick,
47    set_show_rare, show_rare, warmup,
48};
49/// Re-export of the wubi L0 snapshot type so hosts can build /
50/// destructure it without depending on the `inputx-wubi` crate
51/// directly.
52pub use inputx_wubi::L0Snapshot;
53
54/// Embedded IDFv1 wubi dict blob, sourced from
55/// `inputx-wubi-data/data/words.idf` at compile time. Each entry's
56/// `EntryFlags::engine_tag()` carries the wubi `Layer` enum index
57/// (v1.4.7 sub-phase A4 step 2 schema bump), so cement-side fills
58/// can reconstruct `(word, layer, raw_freq)` without re-reading the
59/// `inputx_wubi::WubiDict` table.
60pub const EMBEDDED_WUBI_IDF: &[u8] =
61    include_bytes!("../data/words.idf");
62
63/// Process-global [`IdfReader`] over [`EMBEDDED_WUBI_IDF`]. Parses
64/// the 4 MB header / FST / entry-table sections once and amortizes
65/// the ~few-ms cost over the whole process lifetime; subsequent
66/// `wubi_idf_reader().lookup(code)` calls are O(|code|) FST walks
67/// with zero allocation per query.
68pub fn wubi_idf_reader() -> &'static IdfReader<&'static [u8]> {
69    static READER: OnceLock<IdfReader<&'static [u8]>> = OnceLock::new();
70    READER.get_or_init(|| {
71        IdfReader::from_bytes(EMBEDDED_WUBI_IDF)
72            .expect("inputx-wubi-data EMBEDDED_WUBI_IDF must be a valid IDFv1 blob")
73    })
74}
75
76/// Decode an IDF wubi entry's `EntryFlags::engine_tag()` back into
77/// the originating `inputx_wubi::Layer` variant. Falls back to
78/// `Layer::Auto` on out-of-range bytes (defensive — the writer only
79/// emits 0..=5).
80pub fn layer_from_idf_tag(tag: u8) -> inputx_wubi::Layer {
81    inputx_wubi::Layer::from_u8(tag).unwrap_or(inputx_wubi::Layer::Auto)
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test over the embedded dict: the reader parses, reports more
    /// than 100 000 entries, and an exact lookup of code `g` returns `一`
    /// with an engine tag that decodes to `Layer::Jianma1`.
    #[test]
    fn wubi_idf_reader_parses_and_supports_exact_lookup_with_layer() {
        let r = wubi_idf_reader();
        assert!(r.entry_count() > 100_000);

        let hits = r.lookup(b"g");
        assert!(!hits.is_empty(), "g must have at least one Jianma1 entry");

        // Bind the entry and report a miss in one step instead of
        // `assert!(is_some())` followed by `unwrap()`; the failure message
        // is unchanged.
        let Some(yi) = hits.iter().find(|e| e.word == "一") else {
            panic!(
                "g → 一 expected; got readings {:?}",
                hits.iter().map(|e| e.word).collect::<Vec<_>>()
            );
        };

        assert_eq!(
            layer_from_idf_tag(yi.flags.engine_tag()),
            inputx_wubi::Layer::Jianma1,
        );
    }
}