wubi/layer.rs
1//! Layer taxonomy for dictionary entries.
2//!
3//! Every `(code, word)` belongs to exactly one [`Layer`] determined at build
4//! time. The layer feeds two purposes:
5//!
6//! 1. **Coarse ordering** — [`LAYER_BASE`] gives each layer a numeric base
7//! weight so 一级简码 always outranks any 二级简码, etc., regardless of
8//! per-entry frequency.
9//! 2. **User-tunable preference** — `WubiDict::set_layer_pref` lets the host
10//! multiply a layer's contribution at lookup time without touching data.
11//!
12//! FST values pack `(layer << 56) | freq_score` so the runtime can read
13//! both in one stream pass. `freq_score` is currently always 0 (placeholder
14//! until the corpus pipeline lands); when populated it'll be the
15//! corpus-derived frequency normalized within layer.
16
/// Priority tier assigned to every dictionary entry.
///
/// Variants are numbered in **ascending priority**: [`Layer::Auto`] (`0`) is
/// the weakest tier and [`Layer::Jianma1`] (`5`) the strongest. The
/// discriminant occupies the top byte of the packed FST value
/// (`(layer << 56) | freq`), so comparing two packed values as raw `u64`s
/// already agrees with layer priority — larger `u64` = higher priority — and
/// the build-time merge step can keep the larger value on collision without
/// special casing.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Layer {
    /// CJK extension character whose code was decomposed automatically. The
    /// result is algorithmically valid but may not follow the canonical 字根
    /// sequence.
    Auto = 0,
    /// 词组 — a phrase made of several characters.
    Phrase = 1,
    /// 字根/seed entry curated by hand; its decomposition is the
    /// pedagogically canonical one.
    Zigen = 2,
    /// 三级简码 — shortcut typed with three letters.
    Jianma3 = 3,
    /// 二级简码 — shortcut typed with two letters.
    Jianma2 = 4,
    /// 一级简码 — shortcut typed with a single letter (e.g., `g → 一`).
    Jianma1 = 5,
}
39
/// Total number of [`Layer`] variants (discriminants `0..=5`). Acts as the
/// array length for `LAYER_BASE`, `DEFAULT_LAYER_PREFS`, and any per-layer
/// table the host might keep.
///
/// This is a hand-maintained literal: tables indexed by `Layer as usize`
/// rely on every discriminant being `< LAYER_COUNT`, so it must be bumped
/// whenever a variant is added to the enum.
pub const LAYER_COUNT: usize = 6;
43
/// Per-layer base weight, **indexed by `Layer as usize`** (ascending
/// priority, so the weights ascend as well).
///
/// The smallest gap between neighbouring layers is 100_000 (`Phrase` →
/// `Zigen` and `Zigen` → `Jianma3`). The intent is that an in-layer frequency
/// score stays well below that gap and therefore cannot reorder layers,
/// while a sufficient `layer_pref` multiplier can.
///
/// NOTE(review): nothing in this file enforces that cap — `pack` accepts any
/// 56-bit `freq_score` — so the code that combines base weight and frequency
/// is presumably responsible for clamping or scaling; confirm at the lookup
/// site.
pub const LAYER_BASE: [u64; LAYER_COUNT] = [
    100_000, // [0] Auto
    400_000, // [1] Phrase
    500_000, // [2] Zigen
    600_000, // [3] Jianma3
    800_000, // [4] Jianma2
    1_000_000, // [5] Jianma1
];
55
/// Default per-layer preference multipliers (`layer_prefs`), indexed by
/// `Layer as usize`.
///
/// `Auto` is dampened to 0.7 so extension characters don't pollute the top of
/// common 4-letter codes; every other layer keeps the neutral 1.0.
#[allow(dead_code)] // runtime-only; build.rs doesn't use prefs
pub const DEFAULT_LAYER_PREFS: [f64; LAYER_COUNT] = [
    0.7, // [0] Auto
    1.0, // [1] Phrase
    1.0, // [2] Zigen
    1.0, // [3] Jianma3
    1.0, // [4] Jianma2
    1.0, // [5] Jianma1
];
68
69#[allow(dead_code)] // some methods are runtime-only; build.rs sees them as unused
70impl Layer {
71 /// Decode from the discriminant byte. `None` for any value outside
72 /// `0..=5` — used by [`unpack`] to recover the layer from a packed FST
73 /// value, falling back to [`Layer::Auto`] on corruption.
74 pub const fn from_u8(b: u8) -> Option<Self> {
75 match b {
76 0 => Some(Self::Auto),
77 1 => Some(Self::Phrase),
78 2 => Some(Self::Zigen),
79 3 => Some(Self::Jianma3),
80 4 => Some(Self::Jianma2),
81 5 => Some(Self::Jianma1),
82 _ => None,
83 }
84 }
85
86 /// The discriminant byte (0..=5).
87 pub const fn as_u8(self) -> u8 {
88 self as u8
89 }
90
91 /// Convert to a `usize` index suitable for `LAYER_BASE` /
92 /// `DEFAULT_LAYER_PREFS` array access.
93 pub const fn as_index(self) -> usize {
94 self as usize
95 }
96
97 /// Layer base weight. Equivalent to `LAYER_BASE[self.as_index()]`.
98 pub const fn base(self) -> u64 {
99 LAYER_BASE[self as usize]
100 }
101}
102
/// Mask selecting the low 56 bits of a packed FST value — the `freq_score`
/// payload. The remaining top byte carries the layer discriminant.
const FREQ_MASK: u64 = (1u64 << 56) - 1;
104
105/// Pack `(layer, freq_score)` into a single u64 FST value. `freq_score`
106/// must fit in 56 bits; higher bits are silently truncated.
107#[allow(dead_code)] // used by build.rs and at runtime; build_weights.rs doesn't pack
108pub const fn pack(layer: Layer, freq_score: u64) -> u64 {
109 ((layer as u64) << 56) | (freq_score & FREQ_MASK)
110}
111
112/// Reverse of [`pack`]. Unknown layer bytes fall back to [`Layer::Auto`]
113/// (lowest priority) — preferable to panicking on a corrupt FST.
114#[allow(dead_code)] // runtime-only; build.rs only uses pack
115pub const fn unpack(packed: u64) -> (Layer, u64) {
116 let layer_byte = (packed >> 56) as u8;
117 let freq = packed & FREQ_MASK;
118 let layer = match Layer::from_u8(layer_byte) {
119 Some(l) => l,
120 None => Layer::Auto,
121 };
122 (layer, freq)
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Every layer in ascending discriminant (= priority) order.
    const ALL_LAYERS: [Layer; LAYER_COUNT] = [
        Layer::Auto,
        Layer::Phrase,
        Layer::Zigen,
        Layer::Jianma3,
        Layer::Jianma2,
        Layer::Jianma1,
    ];

    #[test]
    fn pack_unpack_roundtrip() {
        for &l in &ALL_LAYERS {
            for &f in &[0u64, 1, 1234, FREQ_MASK] {
                let p = pack(l, f);
                let (lu, fu) = unpack(p);
                assert_eq!(lu, l);
                assert_eq!(fu, f);
            }
        }
    }

    #[test]
    fn freq_overflow_is_truncated() {
        let p = pack(Layer::Phrase, FREQ_MASK + 1);
        let (l, f) = unpack(p);
        assert_eq!(l, Layer::Phrase);
        assert_eq!(f, 0);
    }

    #[test]
    fn unknown_layer_byte_falls_back_to_auto() {
        // `pack` never emits these layer bytes; they stand for a corrupt FST
        // value, which `unpack` must survive without panicking.
        for &byte in &[LAYER_COUNT as u64, 0x7F, 0xFF] {
            let (l, f) = unpack((byte << 56) | 42);
            assert_eq!(l, Layer::Auto);
            assert_eq!(f, 42, "freq payload must survive the fallback");
        }
    }

    #[test]
    fn from_u8_inverts_as_u8() {
        for (i, &l) in ALL_LAYERS.iter().enumerate() {
            assert_eq!(l.as_index(), i);
            assert_eq!(Layer::from_u8(l.as_u8()), Some(l));
        }
        // First byte past the last discriminant, and the far end of the range.
        assert_eq!(Layer::from_u8(LAYER_COUNT as u8), None);
        assert_eq!(Layer::from_u8(u8::MAX), None);
    }

    #[test]
    fn base_reads_layer_base_table() {
        for &l in &ALL_LAYERS {
            assert_eq!(l.base(), LAYER_BASE[l.as_index()]);
        }
    }

    #[test]
    fn layer_base_strict_ascending() {
        for w in LAYER_BASE.windows(2) {
            assert!(
                w[0] < w[1],
                "LAYER_BASE must be strictly ascending (Auto = lowest priority)"
            );
        }
    }

    #[test]
    fn packed_u64_orders_by_priority() {
        // Higher-priority layer must produce a larger u64 even with zero
        // freq, so the build-time merge step (`if *w < weight`) keeps the
        // winner.
        let auto = pack(Layer::Auto, FREQ_MASK);
        let phrase = pack(Layer::Phrase, 0);
        assert!(phrase > auto, "Phrase + 0 freq must beat Auto + max freq");
        let jm1 = pack(Layer::Jianma1, 0);
        let jm2 = pack(Layer::Jianma2, FREQ_MASK);
        assert!(jm1 > jm2);
    }
}
175}