inputx_wubi/layer.rs
1//! Layer taxonomy for dictionary entries.
2//!
3//! Every `(code, word)` belongs to exactly one [`Layer`] determined at build
4//! time. The layer feeds two purposes:
5//!
6//! 1. **Coarse ordering** — [`LAYER_BASE`] gives each layer a numeric base
7//! weight so 一级简码 always outranks any 二级简码, etc., regardless of
8//! per-entry frequency.
9//! 2. **User-tunable preference** — `WubiDict::set_layer_pref` lets the host
10//! multiply a layer's contribution at lookup time without touching data.
11//!
12//! Index values pack `(layer << FREQ_BITS) | freq_score` so the runtime can
13//! read both in one stream pass. `freq_score` is the corpus-derived frequency
14//! from `data/weights/weights.tsv` (capped at [`MAX_FREQ_SCORE`]; real data
15//! tops out around 50k), normalized within layer.
16
/// Priority class of a dictionary entry.
///
/// The explicit discriminants run from lowest priority (`Auto = 0`) up to
/// highest (`Jianma1 = 5`). Thanks to that, a packed
/// `(layer << FREQ_BITS) | freq` value sorts by priority under plain `u64`
/// comparison (bigger number = higher priority), so the build-time merge can
/// resolve a collision by keeping the larger value, with no special cases.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub enum Layer {
    /// CJK extension character whose code came from automatic decomposition:
    /// algorithmically valid, but the 字根 sequence chosen may not be the
    /// canonical one.
    Auto = 0,
    /// 词组 — a phrase made of several characters.
    Phrase = 1,
    /// Manually curated 字根/seed entry carrying the pedagogically canonical
    /// decomposition.
    Zigen = 2,
    /// 三级简码 — shortcut typed with three letters.
    Jianma3 = 3,
    /// 二级简码 — shortcut typed with two letters.
    Jianma2 = 4,
    /// 一级简码 — shortcut typed with a single letter (e.g., `g → 一`).
    Jianma1 = 5,
}
39
/// How many layers exist. Every per-layer table — `LAYER_BASE`,
/// `DEFAULT_LAYER_PREFS`, and whatever table the host keeps — uses this as
/// its array length.
pub const LAYER_COUNT: usize = 6;

/// Base weight of each layer. Slot `i` belongs to the layer whose
/// `Layer as usize` equals `i`, so the table ascends with priority.
///
/// The gaps between neighbouring entries are meant to be wide enough that an
/// in-layer frequency score (capped well below the gap) can never move an
/// entry across a layer boundary, while a large enough `layer_pref`
/// multiplier still can.
///
/// NOTE(review): the smallest gap here is 100_000, whereas `MAX_FREQ_SCORE`
/// is `2^20 - 1`; the guarantee above therefore appears to rely on a tighter
/// frequency cap applied upstream — confirm against the scoring code.
pub const LAYER_BASE: [u64; LAYER_COUNT] = [
    /* [0] Auto    */ 100_000,
    /* [1] Phrase  */ 400_000,
    /* [2] Zigen   */ 500_000,
    /* [3] Jianma3 */ 600_000,
    /* [4] Jianma2 */ 800_000,
    /* [5] Jianma1 */ 1_000_000,
];

/// Out-of-the-box `layer_prefs` multipliers, one slot per `Layer as usize`.
///
/// Only `Auto` deviates from the neutral 1.0: it is damped to 0.7 so that
/// extension characters do not crowd the top of common 4-letter codes.
#[allow(dead_code)] // prefs are consumed at runtime only; build.rs never reads them
pub const DEFAULT_LAYER_PREFS: [f64; LAYER_COUNT] = [
    /* [0] Auto    */ 0.7,
    /* [1] Phrase  */ 1.0,
    /* [2] Zigen   */ 1.0,
    /* [3] Jianma3 */ 1.0,
    /* [4] Jianma2 */ 1.0,
    /* [5] Jianma1 */ 1.0,
];
68
69#[allow(dead_code)] // some methods are runtime-only; build.rs sees them as unused
70impl Layer {
71 /// Decode from the discriminant byte. `None` for any value outside
72 /// `0..=5` — used by [`unpack`] to recover the layer from a packed FST
73 /// value, falling back to [`Layer::Auto`] on corruption.
74 pub const fn from_u8(b: u8) -> Option<Self> {
75 match b {
76 0 => Some(Self::Auto),
77 1 => Some(Self::Phrase),
78 2 => Some(Self::Zigen),
79 3 => Some(Self::Jianma3),
80 4 => Some(Self::Jianma2),
81 5 => Some(Self::Jianma1),
82 _ => None,
83 }
84 }
85
86 /// The discriminant byte (0..=5).
87 pub const fn as_u8(self) -> u8 {
88 self as u8
89 }
90
91 /// Convert to a `usize` index suitable for `LAYER_BASE` /
92 /// `DEFAULT_LAYER_PREFS` array access.
93 pub const fn as_index(self) -> usize {
94 self as usize
95 }
96
97 /// Layer base weight. Equivalent to `LAYER_BASE[self.as_index()]`.
98 pub const fn base(self) -> u64 {
99 LAYER_BASE[self as usize]
100 }
101}
102
/// Width, in bits, of the `freq_score` field inside a packed value.
///
/// The corpus pipeline caps freq at `max_freq_score` (65535 = 16 bits), so
/// 20 bits leave headroom. The layer is stored ABOVE this field, which keeps
/// "larger packed u64 = higher priority" (layer descending, then freq
/// descending) — the invariant both the build-time merge and the inputx-fsa
/// Dict's value-desc item order depend on. Keeping packed values small
/// (≤ ~2^23 instead of the former ~2^58) is what lets the LEB128 value
/// encoding shrink from ~9 bytes to ~4 (zerodep E1).
const FREQ_BITS: u32 = 20;

/// Mask with exactly the low `FREQ_BITS` bits set; selects the `freq_score`
/// field of a packed value.
const FREQ_MASK: u64 = !(u64::MAX << FREQ_BITS);

/// The greatest `freq_score` a packed value can hold (`2^FREQ_BITS - 1`).
///
/// This is the **single source of truth** for the freq domain: tests and any
/// caller that has to clamp or validate should import it instead of keeping
/// a hardcoded copy (a stale copy in the proptest outlived the E1
/// `FREQ_BITS` 56→20 change and silently broke the invariants until proptest
/// caught it).
pub const MAX_FREQ_SCORE: u64 = FREQ_MASK;
119
120/// Pack `(layer, freq_score)` into a single u64 index value. `freq_score` is
121/// **saturated** to [`MAX_FREQ_SCORE`] if it exceeds the field, never wrapped:
122/// for an order-by-value structure a wraparound would invert priority (a very
123/// high freq would pack to a tiny value and rank last), whereas clamping keeps
124/// "higher freq → higher-or-equal priority". Real freqs (≤ ~50k) are far
125/// inside the field, so this only matters as a defensive guarantee.
126#[allow(dead_code)] // used by build.rs and at runtime; build_weights.rs doesn't pack
127pub const fn pack(layer: Layer, freq_score: u64) -> u64 {
128 let freq = if freq_score > FREQ_MASK { FREQ_MASK } else { freq_score };
129 ((layer as u64) << FREQ_BITS) | freq
130}
131
132/// Reverse of [`pack`]. Unknown layer bytes fall back to [`Layer::Auto`]
133/// (lowest priority) — preferable to panicking on a corrupt FST.
134#[allow(dead_code)] // runtime-only; build.rs only uses pack
135pub const fn unpack(packed: u64) -> (Layer, u64) {
136 let layer_byte = (packed >> FREQ_BITS) as u8;
137 let freq = packed & FREQ_MASK;
138 let layer = match Layer::from_u8(layer_byte) {
139 Some(l) => l,
140 None => Layer::Auto,
141 };
142 (layer, freq)
143}
144
#[cfg(test)]
mod tests {
    use super::*;

    /// Frequencies exercised by the round-trip test: both ends of the field
    /// plus two interior values.
    const FREQ_SAMPLES: [u64; 4] = [0, 1, 1234, FREQ_MASK];

    /// Every layer in ascending priority order, decoded from the bytes
    /// `0..LAYER_COUNT`. Deriving the list (instead of hand-listing variants)
    /// means a newly added layer is covered automatically, and a
    /// `LAYER_COUNT` that disagrees with `from_u8` fails loudly here.
    fn all_layers() -> Vec<Layer> {
        (0u8..=u8::MAX)
            .take(LAYER_COUNT)
            .map(|byte| {
                Layer::from_u8(byte).expect("every byte below LAYER_COUNT must decode to a layer")
            })
            .collect()
    }

    #[test]
    fn layer_count_matches_discriminants() {
        let layers = all_layers();
        assert_eq!(layers.len(), LAYER_COUNT);
        // Decoding byte `i` must give the layer whose table index is `i`.
        for (index, layer) in layers.iter().enumerate() {
            assert_eq!(layer.as_index(), index);
        }
        // Nothing at or beyond LAYER_COUNT may decode.
        for byte in (0u8..=u8::MAX).skip(LAYER_COUNT) {
            assert_eq!(Layer::from_u8(byte), None, "byte {} must not decode", byte);
        }
    }

    #[test]
    fn pack_unpack_roundtrip() {
        for layer in all_layers() {
            for &freq in &FREQ_SAMPLES {
                assert_eq!(unpack(pack(layer, freq)), (layer, freq));
            }
        }
    }

    #[test]
    fn freq_overflow_is_saturated() {
        // Over-range freq clamps to MAX_FREQ_SCORE (not wrapped to 0) and the
        // layer is untouched — so an out-of-range freq still ranks at the top
        // of its layer rather than the bottom.
        for &over in &[FREQ_MASK + 1, u64::MAX, 1 << 40] {
            assert_eq!(unpack(pack(Layer::Phrase, over)), (Layer::Phrase, FREQ_MASK));
        }
        // Saturated value must equal packing the exact max, and never spill
        // into the layer bits.
        assert_eq!(pack(Layer::Phrase, u64::MAX), pack(Layer::Phrase, FREQ_MASK));
    }

    #[test]
    fn layer_base_strict_ascending() {
        for pair in LAYER_BASE.windows(2) {
            assert!(
                pair[0] < pair[1],
                "LAYER_BASE must be strictly ascending (Auto = lowest priority)"
            );
        }
    }

    #[test]
    fn packed_u64_orders_by_priority() {
        // For EVERY pair of adjacent layers, the higher-priority one must
        // produce a larger u64 even at zero freq against the lower one at max
        // freq, so the build-time merge step (`if *w < weight`) keeps the
        // winner.
        let layers = all_layers();
        for pair in layers.windows(2) {
            let (lower, higher) = (pair[0], pair[1]);
            assert!(
                pack(higher, 0) > pack(lower, FREQ_MASK),
                "{:?} + 0 freq must beat {:?} + max freq",
                higher,
                lower
            );
        }
    }
}
203}