Skip to main content

inputx_wubi/
layer.rs

//! Layer taxonomy for dictionary entries.
//!
//! Every `(code, word)` pair belongs to exactly one [`Layer`], determined at
//! build time. The layer serves two purposes:
//!
//! 1. **Coarse ordering** — [`LAYER_BASE`] gives each layer a numeric base
//!    weight so that a level-1 short code (一级简码) always outranks any
//!    level-2 short code (二级简码), and so on, regardless of per-entry
//!    frequency.
//! 2. **User-tunable preference** — `WubiDict::set_layer_pref` lets the host
//!    multiply a layer's contribution at lookup time without touching data.
//!
//! Index values pack `(layer << FREQ_BITS) | freq_score` so the runtime can
//! read both in one stream pass. `freq_score` is the corpus-derived frequency
//! from `data/weights/weights.tsv` (capped at [`MAX_FREQ_SCORE`]; real data
//! tops out around 50k), normalized within each layer.
16
/// Priority class of a dictionary entry.
///
/// The numeric discriminants grow with priority: `Auto = 0` ranks lowest and
/// `Jianma1 = 5` ranks highest. Because of that, the packed value
/// `(layer << FREQ_BITS) | freq` sorts correctly as a plain `u64` (a larger
/// number means a higher priority), and the build-time merge step can settle
/// a collision by keeping the larger value, with no special casing.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Layer {
    /// Automatically decomposed CJK extension character. The code is
    /// algorithmically valid but may not follow the canonical radical
    /// sequence.
    Auto = 0,
    /// Phrase made of several characters.
    Phrase = 1,
    /// Hand-curated radical/seed entry whose decomposition is the
    /// pedagogically canonical one.
    Zigen = 2,
    /// Level-3 short code: a three-letter shortcut.
    Jianma3 = 3,
    /// Level-2 short code: a two-letter shortcut.
    Jianma2 = 4,
    /// Level-1 short code: a single-letter shortcut (e.g., `g` → `一`).
    Jianma1 = 5,
}
39
/// Number of [`Layer`] variants. Used as the length of `LAYER_BASE`,
/// `DEFAULT_LAYER_PREFS`, and any other per-layer table a host may keep.
pub const LAYER_COUNT: usize = 6;

/// Coarse base weight of each layer, **looked up with `Layer as usize`**, so
/// the table runs from lowest to highest priority. The gaps between
/// neighbouring entries are chosen so that an in-layer frequency score
/// (capped well below the gap) can never move an entry across layers, while
/// a large enough `layer_pref` multiplier still can.
pub const LAYER_BASE: [u64; LAYER_COUNT] = [
    100_000,   // index 0 — Auto
    400_000,   // index 1 — Phrase
    500_000,   // index 2 — Zigen
    600_000,   // index 3 — Jianma3
    800_000,   // index 4 — Jianma2
    1_000_000, // index 5 — Jianma1
];
55
56/// Default `layer_prefs`, indexed by `Layer as usize`. `Auto` is dampened to
57/// 0.7 so extension characters don't pollute the top of common 4-letter
58/// codes; everything else is 1.0.
59#[allow(dead_code)] // runtime-only; build.rs doesn't use prefs
60pub const DEFAULT_LAYER_PREFS: [f64; LAYER_COUNT] = [
61    0.7, // [0] Auto
62    1.0, // [1] Phrase
63    1.0, // [2] Zigen
64    1.0, // [3] Jianma3
65    1.0, // [4] Jianma2
66    1.0, // [5] Jianma1
67];
68
69#[allow(dead_code)] // some methods are runtime-only; build.rs sees them as unused
70impl Layer {
71    /// Decode from the discriminant byte. `None` for any value outside
72    /// `0..=5` — used by [`unpack`] to recover the layer from a packed FST
73    /// value, falling back to [`Layer::Auto`] on corruption.
74    pub const fn from_u8(b: u8) -> Option<Self> {
75        match b {
76            0 => Some(Self::Auto),
77            1 => Some(Self::Phrase),
78            2 => Some(Self::Zigen),
79            3 => Some(Self::Jianma3),
80            4 => Some(Self::Jianma2),
81            5 => Some(Self::Jianma1),
82            _ => None,
83        }
84    }
85
86    /// The discriminant byte (0..=5).
87    pub const fn as_u8(self) -> u8 {
88        self as u8
89    }
90
91    /// Convert to a `usize` index suitable for `LAYER_BASE` /
92    /// `DEFAULT_LAYER_PREFS` array access.
93    pub const fn as_index(self) -> usize {
94        self as usize
95    }
96
97    /// Layer base weight. Equivalent to `LAYER_BASE[self.as_index()]`.
98    pub const fn base(self) -> u64 {
99        LAYER_BASE[self as usize]
100    }
101}
102
/// Width, in bits, of the `freq_score` field inside a packed value.
///
/// The corpus pipeline caps freq at `max_freq_score` (65535, i.e. 16 bits),
/// so 20 bits leaves headroom. The layer is stored ABOVE the freq field,
/// which means a larger packed `u64` still reads as "higher priority"
/// (layer descending, then freq descending). Both the build-time merge and
/// the value-descending item order of the inputx-fsa Dict depend on that
/// invariant. Keeping the packed value small (at most about 2^23, versus
/// roughly 2^58 before) is what lets the LEB128 value encoding shrink from
/// about 9 bytes to about 4 (zerodep E1).
const FREQ_BITS: u32 = 20;
/// Mask with exactly the low `FREQ_BITS` bits set; selects the freq field.
const FREQ_MASK: u64 = !(u64::MAX << FREQ_BITS);

/// Highest `freq_score` a packed value can hold (`2^FREQ_BITS - 1`).
///
/// This is the **single source of truth** for the freq domain: tests and any
/// caller that clamps or validates should import it instead of hardcoding a
/// copy. (A stale copy in the proptest outlived the E1 `FREQ_BITS` 56→20
/// change and silently broke the invariants until proptest caught it.)
pub const MAX_FREQ_SCORE: u64 = FREQ_MASK;
119
120/// Pack `(layer, freq_score)` into a single u64 index value. `freq_score` is
121/// **saturated** to [`MAX_FREQ_SCORE`] if it exceeds the field, never wrapped:
122/// for an order-by-value structure a wraparound would invert priority (a very
123/// high freq would pack to a tiny value and rank last), whereas clamping keeps
124/// "higher freq → higher-or-equal priority". Real freqs (≤ ~50k) are far
125/// inside the field, so this only matters as a defensive guarantee.
126#[allow(dead_code)] // used by build.rs and at runtime; build_weights.rs doesn't pack
127pub const fn pack(layer: Layer, freq_score: u64) -> u64 {
128    let freq = if freq_score > FREQ_MASK { FREQ_MASK } else { freq_score };
129    ((layer as u64) << FREQ_BITS) | freq
130}
131
132/// Reverse of [`pack`]. Unknown layer bytes fall back to [`Layer::Auto`]
133/// (lowest priority) — preferable to panicking on a corrupt FST.
134#[allow(dead_code)] // runtime-only; build.rs only uses pack
135pub const fn unpack(packed: u64) -> (Layer, u64) {
136    let layer_byte = (packed >> FREQ_BITS) as u8;
137    let freq = packed & FREQ_MASK;
138    let layer = match Layer::from_u8(layer_byte) {
139        Some(l) => l,
140        None => Layer::Auto,
141    };
142    (layer, freq)
143}
144
#[cfg(test)]
mod tests {
    use super::*;

    /// Every layer, so the table-driven tests cover all variants.
    const ALL_LAYERS: [Layer; 6] = [
        Layer::Jianma1,
        Layer::Jianma2,
        Layer::Jianma3,
        Layer::Zigen,
        Layer::Phrase,
        Layer::Auto,
    ];

    #[test]
    fn pack_unpack_roundtrip() {
        // In-range inputs must survive pack → unpack unchanged.
        for layer in ALL_LAYERS {
            for freq in [0u64, 1, 1234, FREQ_MASK] {
                assert_eq!(unpack(pack(layer, freq)), (layer, freq));
            }
        }
    }

    #[test]
    fn freq_overflow_is_saturated() {
        // An over-range freq is clamped to MAX_FREQ_SCORE rather than wrapped
        // to 0, and the layer is left alone, so such an entry ranks at the
        // top of its layer instead of the bottom.
        for over in [FREQ_MASK + 1, u64::MAX, 1 << 40] {
            let (layer, freq) = unpack(pack(Layer::Phrase, over));
            assert_eq!(layer, Layer::Phrase);
            assert_eq!(freq, FREQ_MASK);
        }
        // Saturating must give exactly the value of packing the maximum, with
        // nothing spilling into the layer bits.
        let exact_max = pack(Layer::Phrase, FREQ_MASK);
        assert_eq!(pack(Layer::Phrase, u64::MAX), exact_max);
    }

    #[test]
    fn layer_base_strict_ascending() {
        let ascending = LAYER_BASE.windows(2).all(|pair| pair[0] < pair[1]);
        assert!(ascending, "LAYER_BASE must be strictly ascending (Auto = lowest priority)");
    }

    #[test]
    fn packed_u64_orders_by_priority() {
        // A higher-priority layer has to yield a larger u64 even at zero
        // freq, so that the build-time merge step (`if *w < weight`) keeps
        // the winner.
        let phrase_min = pack(Layer::Phrase, 0);
        let auto_max = pack(Layer::Auto, FREQ_MASK);
        assert!(phrase_min > auto_max, "Phrase + 0 freq must beat Auto + max freq");

        let jianma1_min = pack(Layer::Jianma1, 0);
        let jianma2_max = pack(Layer::Jianma2, FREQ_MASK);
        assert!(jianma1_min > jianma2_max);
    }
}