Skip to main content

wubi/
layer.rs

//! Layer taxonomy for dictionary entries.
//!
//! Every `(code, word)` pair belongs to exactly one [`Layer`], determined at
//! build time. The layer serves two purposes:
//!
//! 1. **Coarse ordering** — [`LAYER_BASE`] gives each layer a numeric base
//!    weight so 一级简码 always outranks any 二级简码, etc., regardless of
//!    per-entry frequency.
//! 2. **User-tunable preference** — `WubiDict::set_layer_pref` lets the host
//!    multiply a layer's contribution at lookup time without touching data.
//!
//! FST values pack `(layer << 56) | freq_score` so the runtime can read
//! both in one stream pass. `freq_score` is currently always 0 (a placeholder
//! until the corpus pipeline lands); once populated it will be the
//! corpus-derived frequency, normalized within each layer.
16
/// Dictionary layer of a `(code, word)` entry.
///
/// Discriminants are in **ascending priority**: `Auto = 0` is lowest, `Jianma1`
/// is highest. This makes the FST's packed `(layer << 56) | freq` compare
/// correctly with raw `u64` ordering — higher u64 = higher priority — so the
/// build-time merge step can keep the larger value on collision without
/// special casing.
///
/// The derived `PartialOrd`/`Ord` order variants by discriminant, so layers
/// can also be compared directly: `Layer::Jianma1 > Layer::Auto`.
#[repr(u8)]
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// Auto-decomposed CJK extension character — algorithmically valid but
    /// may pick a non-canonical 字根 sequence.
    Auto = 0,
    /// Multi-character phrase (词组).
    Phrase = 1,
    /// Hand-curated 字根/seed entry — pedagogically canonical decomposition.
    Zigen = 2,
    /// 三级简码 — three-letter shortcut.
    Jianma3 = 3,
    /// 二级简码 — two-letter shortcut.
    Jianma2 = 4,
    /// 一级简码 — single-letter shortcut (e.g., `g → 一`).
    Jianma1 = 5,
}
39
/// How many layers exist. Every per-layer table (`LAYER_BASE`,
/// `DEFAULT_LAYER_PREFS`, and any table the host keeps) has this length.
pub const LAYER_COUNT: usize = 6;

/// Base weight for each layer; look up with `Layer as usize` (ascending
/// priority). The gaps between entries are meant to be far larger than any
/// in-layer frequency score, so frequency alone cannot reorder layers, while
/// a large enough `layer_pref` multiplier still can.
pub const LAYER_BASE: [u64; LAYER_COUNT] = [
    100_000,   // 0 => Layer::Auto
    400_000,   // 1 => Layer::Phrase
    500_000,   // 2 => Layer::Zigen
    600_000,   // 3 => Layer::Jianma3
    800_000,   // 4 => Layer::Jianma2
    1_000_000, // 5 => Layer::Jianma1
];

/// Out-of-the-box `layer_prefs`; look up with `Layer as usize`. Only `Auto`
/// is scaled down (to 0.7) so extension characters don't pollute the top of
/// common 4-letter codes; every other layer keeps a neutral 1.0.
#[allow(dead_code)] // runtime-only; build.rs doesn't use prefs
pub const DEFAULT_LAYER_PREFS: [f64; LAYER_COUNT] = [
    0.7, // 0 => Layer::Auto
    1.0, // 1 => Layer::Phrase
    1.0, // 2 => Layer::Zigen
    1.0, // 3 => Layer::Jianma3
    1.0, // 4 => Layer::Jianma2
    1.0, // 5 => Layer::Jianma1
];
68
69#[allow(dead_code)] // some methods are runtime-only; build.rs sees them as unused
70impl Layer {
71    /// Decode from the discriminant byte. `None` for any value outside
72    /// `0..=5` — used by [`unpack`] to recover the layer from a packed FST
73    /// value, falling back to [`Layer::Auto`] on corruption.
74    pub const fn from_u8(b: u8) -> Option<Self> {
75        match b {
76            0 => Some(Self::Auto),
77            1 => Some(Self::Phrase),
78            2 => Some(Self::Zigen),
79            3 => Some(Self::Jianma3),
80            4 => Some(Self::Jianma2),
81            5 => Some(Self::Jianma1),
82            _ => None,
83        }
84    }
85
86    /// The discriminant byte (0..=5).
87    pub const fn as_u8(self) -> u8 {
88        self as u8
89    }
90
91    /// Convert to a `usize` index suitable for `LAYER_BASE` /
92    /// `DEFAULT_LAYER_PREFS` array access.
93    pub const fn as_index(self) -> usize {
94        self as usize
95    }
96
97    /// Layer base weight. Equivalent to `LAYER_BASE[self.as_index()]`.
98    pub const fn base(self) -> u64 {
99        LAYER_BASE[self as usize]
100    }
101}
102
103const FREQ_MASK: u64 = 0x00FF_FFFF_FFFF_FFFF;
104
105/// Pack `(layer, freq_score)` into a single u64 FST value. `freq_score`
106/// must fit in 56 bits; higher bits are silently truncated.
107#[allow(dead_code)] // used by build.rs and at runtime; build_weights.rs doesn't pack
108pub const fn pack(layer: Layer, freq_score: u64) -> u64 {
109    ((layer as u64) << 56) | (freq_score & FREQ_MASK)
110}
111
112/// Reverse of [`pack`]. Unknown layer bytes fall back to [`Layer::Auto`]
113/// (lowest priority) — preferable to panicking on a corrupt FST.
114#[allow(dead_code)] // runtime-only; build.rs only uses pack
115pub const fn unpack(packed: u64) -> (Layer, u64) {
116    let layer_byte = (packed >> 56) as u8;
117    let freq = packed & FREQ_MASK;
118    let layer = match Layer::from_u8(layer_byte) {
119        Some(l) => l,
120        None => Layer::Auto,
121    };
122    (layer, freq)
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Every layer, lowest priority first (i.e. in discriminant order).
    const ALL_LAYERS: [Layer; LAYER_COUNT] = [
        Layer::Auto,
        Layer::Phrase,
        Layer::Zigen,
        Layer::Jianma3,
        Layer::Jianma2,
        Layer::Jianma1,
    ];

    #[test]
    fn pack_unpack_roundtrip() {
        for &layer in &ALL_LAYERS {
            for &freq in &[0u64, 1, 1234, FREQ_MASK] {
                assert_eq!(unpack(pack(layer, freq)), (layer, freq));
            }
        }
    }

    #[test]
    fn freq_overflow_is_truncated() {
        // Bit 56 of the frequency must be dropped, not bleed into the layer byte.
        let (layer, freq) = unpack(pack(Layer::Phrase, FREQ_MASK + 1));
        assert_eq!(layer, Layer::Phrase);
        assert_eq!(freq, 0);
    }

    #[test]
    fn layer_base_strict_ascending() {
        for pair in LAYER_BASE.windows(2) {
            assert!(
                pair[0] < pair[1],
                "LAYER_BASE must be strictly ascending (Auto = lowest priority)"
            );
        }
    }

    #[test]
    fn packed_u64_orders_by_priority() {
        // A higher-priority layer must produce a larger u64 even when it has
        // zero freq and the lower layer has the maximum freq, so the
        // build-time merge step (`if *w < weight`) keeps the winner.
        for pair in ALL_LAYERS.windows(2) {
            let lower_at_max = pack(pair[0], FREQ_MASK);
            let higher_at_zero = pack(pair[1], 0);
            assert!(
                higher_at_zero > lower_at_max,
                "{:?} + 0 freq must beat {:?} + max freq",
                pair[1],
                pair[0]
            );
        }
    }

    #[test]
    fn from_u8_roundtrips_and_rejects_unknown_bytes() {
        for (index, &layer) in ALL_LAYERS.iter().enumerate() {
            assert_eq!(layer.as_index(), index);
            assert_eq!(layer.as_u8() as usize, index);
            assert_eq!(Layer::from_u8(layer.as_u8()), Some(layer));
        }
        for byte in (LAYER_COUNT as u8)..=u8::MAX {
            assert_eq!(Layer::from_u8(byte), None);
        }
    }

    #[test]
    fn unpack_falls_back_to_auto_on_unknown_layer_byte() {
        // A corrupt top byte must degrade to the lowest-priority layer while
        // leaving the frequency bits intact.
        let corrupt = (0xFFu64 << 56) | 42;
        assert_eq!(unpack(corrupt), (Layer::Auto, 42));
        let just_past_last = ((LAYER_COUNT as u64) << 56) | 7;
        assert_eq!(unpack(just_past_last), (Layer::Auto, 7));
    }

    #[test]
    fn base_matches_layer_base_table() {
        for &layer in &ALL_LAYERS {
            assert_eq!(layer.base(), LAYER_BASE[layer.as_index()]);
        }
    }
}