Skip to main content

wubi/
codec.rs

1//! Self-contained Wubi 86 codec — pure algorithm + types, zero external imports.
2//!
3//! This module is intentionally usable from `build.rs` (via `#[path]`) as
4//! well as the runtime crate. **Do not** add `use crate::...` lines here, or
5//! `extern crate alloc;`; the only allowed imports are `core::*`.
6//!
7//! Everything is `#[inline]` and zero-allocation. The encoder writes into a
8//! caller-provided `[u8; 4]` buffer and returns the populated length.
9
10use core::fmt;
11
/// Stroke category of the Wubi scheme; exactly five exist.
///
/// The `u8` discriminants (`1..=5`) are the same numbers that appear in the
/// seed data files, so they must never be reordered or renumbered.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stroke {
    /// 横 — horizontal stroke.
    Heng = 1,
    /// 竖 — vertical stroke.
    Shu = 2,
    /// 撇 — left-falling stroke.
    Pie = 3,
    /// 捺 — right-falling stroke; 点 (dot) is counted here as well.
    Na = 4,
    /// 折 — turning stroke.
    Zhe = 5,
}

impl Stroke {
    /// Map a raw seed-file number back to its [`Stroke`].
    ///
    /// Returns `None` when `v` is not in `1..=5`. Seed-file parsers are
    /// expected to skip such a row rather than treat it as a hard error.
    #[inline]
    pub const fn from_u8(v: u8) -> Option<Self> {
        let stroke = match v {
            1 => Self::Heng,
            2 => Self::Shu,
            3 => Self::Pie,
            4 => Self::Na,
            5 => Self::Zhe,
            // Anything else is not a valid stroke number.
            _ => return None,
        };
        Some(stroke)
    }
}
45
/// Wubi 字形 (character shape): the three-way classification consulted by
/// the 末笔识别码 rule.
///
/// The `u8` discriminants (`1..=3`) match the numbering used in seed files.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Shape {
    /// 左右 — left-right structure.
    LeftRight = 1,
    /// 上下 — top-bottom structure.
    TopBottom = 2,
    /// 杂合 — single-component / unsplit.
    Whole = 3,
}

impl Shape {
    /// Map a raw seed-file number back to its [`Shape`].
    ///
    /// Returns `None` when `v` is not in `1..=3`.
    #[inline]
    pub const fn from_u8(v: u8) -> Option<Self> {
        let shape = match v {
            1 => Self::LeftRight,
            2 => Self::TopBottom,
            3 => Self::Whole,
            // Anything else is not a valid shape number.
            _ => return None,
        };
        Some(shape)
    }
}
72
73/// Wubi 86 末笔识别码: (last stroke, shape) → key letter.
74#[inline]
75pub const fn shibie_ma(stroke: Stroke, shape: Shape) -> u8 {
76    let table: [[u8; 3]; 5] = [
77        [b'g', b'f', b'd'],
78        [b'h', b'j', b'k'],
79        [b't', b'r', b'e'],
80        [b'y', b'u', b'i'],
81        [b'n', b'b', b'v'],
82    ];
83    let s = stroke as usize - 1;
84    let p = shape as usize - 1;
85    table[s][p]
86}
87
88/// Region letter — first key of each stroke region.
89/// Used by 成字字根 rule.
90#[inline]
91pub const fn region_letter(stroke: Stroke) -> u8 {
92    match stroke {
93        Stroke::Heng => b'g',
94        Stroke::Shu => b'h',
95        Stroke::Pie => b't',
96        Stroke::Na => b'y',
97        Stroke::Zhe => b'n',
98    }
99}
100
/// The 25 键名字根 of Wubi 86, stored back to back in a single string.
///
/// A lone 字根 found in this set is encoded as its key letter repeated four
/// times (letter × 4).
pub const JIANMING_ZIGEN: &str = "王土大木工目日口田山禾白月人金言立水火之已子女又纟";

/// The five 单笔画 字根.
///
/// A lone 字根 found in this list is encoded as its key letter twice followed
/// by `l` twice (`letter letter L L`).
pub const DAN_BI_HUA: &[char] = &['一', '丨', '丿', '丶', '乙'];
106
107/// Borrowed character decomposition. Cheap to construct from `&[char]` /
108/// `&[Stroke]`, suitable for both stack-only build pipelines and
109/// the runtime hot path.
110#[derive(Debug, Clone, Copy)]
111pub struct DecompRef<'a> {
112    /// Ordered 字根 sequence (1..=N items). Encoder consumes positions
113    /// `[0, 1, 2, last]`.
114    pub zigen: &'a [char],
115    /// Stroke sequence for 成字字根 / 末笔识别码 rules. May be longer than
116    /// `zigen`; only the first/second/last positions are read.
117    pub strokes: &'a [Stroke],
118    /// Whole-character shape — drives the 末笔识别码 lookup.
119    pub shape: Shape,
120}
121
122impl<'a> DecompRef<'a> {
123    /// First stroke, or `None` for an empty stroke list.
124    #[inline]
125    pub fn first_stroke(&self) -> Option<Stroke> {
126        self.strokes.first().copied()
127    }
128    /// Second stroke (index 1), or `None` if fewer than 2 strokes.
129    #[inline]
130    pub fn second_stroke(&self) -> Option<Stroke> {
131        self.strokes.get(1).copied()
132    }
133    /// Last stroke, or `None` for an empty stroke list. Used by the 识别码
134    /// rule for 2- and 3-字根 codes.
135    #[inline]
136    pub fn last_stroke(&self) -> Option<Stroke> {
137        self.strokes.last().copied()
138    }
139}
140
/// Reason an `encode_with_lookup` call was rejected. Every variant is a
/// recoverable condition reported through `Result`; the encoder does not
/// panic to signal any of them.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum EncodeError {
    /// `decomp.zigen` contained no entries.
    EmptyZigen,
    /// The supplied lookup had no key for one of the 字根. The offending
    /// character is carried along so the caller can log it and move on to
    /// the next row.
    UnknownZigen(char),
    /// The 识别码 / 成字字根 rule needed a stroke that `decomp.strokes` did
    /// not provide.
    MissingStroke,
}

impl fmt::Display for EncodeError {
    /// Writes a short lowercase description of the failure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only one variant carries data; the others are fixed strings.
        let fixed = match self {
            Self::UnknownZigen(c) => return write!(f, "unknown zigen: {c}"),
            Self::EmptyZigen => "empty zigen sequence",
            Self::MissingStroke => "decomp has no strokes",
        };
        f.write_str(fixed)
    }
}
164
165#[inline]
166fn is_jianming(c: char) -> bool {
167    // O(n) on 25-char string; ~50ns. Phase 2 swaps to PHF set for ~5ns.
168    JIANMING_ZIGEN.contains(c)
169}
170
171#[inline]
172fn is_dan_bi_hua(c: char) -> bool {
173    let mut i = 0;
174    while i < DAN_BI_HUA.len() {
175        if DAN_BI_HUA[i] == c {
176            return true;
177        }
178        i += 1;
179    }
180    false
181}
182
183/// Encode a decomposition using the supplied 字根 lookup. Writes the code
184/// bytes into `out` and returns the number of bytes written (3 or 4).
185///
186/// The encoder is the ONLY entry point that runs the rules; all platform
187/// glue (PHF runtime tables, HashMap build-time tables) goes through here
188/// via the lookup closure.
189pub fn encode_with_lookup<F>(
190    decomp: &DecompRef,
191    lookup: F,
192    out: &mut [u8; 4],
193) -> Result<usize, EncodeError>
194where
195    F: Fn(char) -> Option<u8>,
196{
197    if decomp.zigen.is_empty() {
198        return Err(EncodeError::EmptyZigen);
199    }
200
201    let n = decomp.zigen.len();
202
203    match n {
204        1 => encode_single_zigen(decomp, lookup, out),
205        2 => {
206            let l1 = lookup(decomp.zigen[0]).ok_or(EncodeError::UnknownZigen(decomp.zigen[0]))?;
207            let l2 = lookup(decomp.zigen[1]).ok_or(EncodeError::UnknownZigen(decomp.zigen[1]))?;
208            let last = decomp.last_stroke().ok_or(EncodeError::MissingStroke)?;
209            let im = shibie_ma(last, decomp.shape);
210            out[0] = l1;
211            out[1] = l2;
212            out[2] = im;
213            Ok(3)
214        }
215        3 => {
216            let l1 = lookup(decomp.zigen[0]).ok_or(EncodeError::UnknownZigen(decomp.zigen[0]))?;
217            let l2 = lookup(decomp.zigen[1]).ok_or(EncodeError::UnknownZigen(decomp.zigen[1]))?;
218            let l3 = lookup(decomp.zigen[2]).ok_or(EncodeError::UnknownZigen(decomp.zigen[2]))?;
219            let last = decomp.last_stroke().ok_or(EncodeError::MissingStroke)?;
220            let im = shibie_ma(last, decomp.shape);
221            out[0] = l1;
222            out[1] = l2;
223            out[2] = l3;
224            out[3] = im;
225            Ok(4)
226        }
227        _ => {
228            // 4+ 字根: 1st + 2nd + 3rd + last
229            let l1 = lookup(decomp.zigen[0]).ok_or(EncodeError::UnknownZigen(decomp.zigen[0]))?;
230            let l2 = lookup(decomp.zigen[1]).ok_or(EncodeError::UnknownZigen(decomp.zigen[1]))?;
231            let l3 = lookup(decomp.zigen[2]).ok_or(EncodeError::UnknownZigen(decomp.zigen[2]))?;
232            let ll = lookup(decomp.zigen[n - 1])
233                .ok_or(EncodeError::UnknownZigen(decomp.zigen[n - 1]))?;
234            out[0] = l1;
235            out[1] = l2;
236            out[2] = l3;
237            out[3] = ll;
238            Ok(4)
239        }
240    }
241}
242
243#[inline]
244fn encode_single_zigen<F>(
245    decomp: &DecompRef,
246    lookup: F,
247    out: &mut [u8; 4],
248) -> Result<usize, EncodeError>
249where
250    F: Fn(char) -> Option<u8>,
251{
252    let z = decomp.zigen[0];
253    let l = lookup(z).ok_or(EncodeError::UnknownZigen(z))?;
254
255    if is_dan_bi_hua(z) {
256        out[0] = l;
257        out[1] = l;
258        out[2] = b'l';
259        out[3] = b'l';
260        return Ok(4);
261    }
262    if is_jianming(z) {
263        out[0] = l;
264        out[1] = l;
265        out[2] = l;
266        out[3] = l;
267        return Ok(4);
268    }
269    // 成字字根 rules depend on stroke count:
270    //   - 1 stroke → handled by 单笔画 above (this branch unreachable here)
271    //   - 2 strokes → 3-letter code: letter + first + last
272    //   - ≥ 3 strokes → 4-letter code: letter + first + second + last
273    let first = decomp.first_stroke().ok_or(EncodeError::MissingStroke)?;
274    let last = decomp.last_stroke().ok_or(EncodeError::MissingStroke)?;
275    let stroke_count = decomp.strokes.len();
276    if stroke_count == 2 {
277        out[0] = l;
278        out[1] = region_letter(first);
279        out[2] = region_letter(last);
280        return Ok(3);
281    }
282    let second = decomp.second_stroke().ok_or(EncodeError::MissingStroke)?;
283    out[0] = l;
284    out[1] = region_letter(first);
285    out[2] = region_letter(second);
286    out[3] = region_letter(last);
287    Ok(4)
288}
289
#[cfg(test)]
mod tests {
    use super::*;

    /// Tiny 字根 → key table that covers only what these tests need.
    fn dummy(c: char) -> Option<u8> {
        const TABLE: [(char, u8); 9] = [
            ('王', b'g'),
            ('土', b'f'),
            ('大', b'd'),
            ('人', b'w'),
            ('一', b'g'),
            ('丨', b'h'),
            ('丿', b't'),
            ('丶', b'y'),
            ('乙', b'n'),
        ];
        TABLE.iter().find(|entry| entry.0 == c).map(|entry| entry.1)
    }

    /// Run the encoder with `dummy` on a fresh zeroed buffer and hand back
    /// the buffer together with the reported length.
    fn encode(
        zigen: &[char],
        strokes: &[Stroke],
        shape: Shape,
    ) -> Result<([u8; 4], usize), EncodeError> {
        let mut buf = [0u8; 4];
        let decomp = DecompRef {
            zigen,
            strokes,
            shape,
        };
        let len = encode_with_lookup(&decomp, dummy, &mut buf)?;
        Ok((buf, len))
    }

    /// First / second / last stroke of a decomposition built from `strokes`.
    fn stroke_ends(strokes: &[Stroke]) -> (Option<Stroke>, Option<Stroke>, Option<Stroke>) {
        let decomp = DecompRef {
            zigen: &[],
            strokes,
            shape: Shape::Whole,
        };
        (
            decomp.first_stroke(),
            decomp.second_stroke(),
            decomp.last_stroke(),
        )
    }

    #[test]
    fn shibie_grid() {
        let cases = [
            (Stroke::Heng, Shape::LeftRight, b'g'),
            (Stroke::Heng, Shape::Whole, b'd'),
            (Stroke::Zhe, Shape::Whole, b'v'),
        ];
        for (stroke, shape, expected) in cases {
            assert_eq!(shibie_ma(stroke, shape), expected);
        }
    }

    #[test]
    fn region_letters() {
        for (stroke, expected) in [(Stroke::Heng, b'g'), (Stroke::Zhe, b'n')] {
            assert_eq!(region_letter(stroke), expected);
        }
    }

    #[test]
    fn jianming_letter_x4() {
        let (buf, len) = encode(&['王'], &[Stroke::Heng], Shape::Whole).unwrap();
        assert_eq!(&buf[..len], b"gggg");
    }

    #[test]
    fn dan_bi_hua_rule() {
        let cases: [(char, Stroke, &[u8; 4]); 5] = [
            ('一', Stroke::Heng, b"ggll"),
            ('丨', Stroke::Shu, b"hhll"),
            ('丿', Stroke::Pie, b"ttll"),
            ('丶', Stroke::Na, b"yyll"),
            ('乙', Stroke::Zhe, b"nnll"),
        ];
        for (c, stroke, expected) in cases {
            let (buf, len) = encode(&[c], &[stroke], Shape::Whole).unwrap();
            assert_eq!(&buf[..len], expected, "{c} mismatch");
        }
    }

    #[test]
    fn unknown_zigen_errors_out() {
        let result = encode(&['🦀'], &[Stroke::Heng], Shape::Whole);
        assert_eq!(result, Err(EncodeError::UnknownZigen('🦀')));
    }

    #[test]
    fn empty_zigen_errors_out() {
        let result = encode(&[], &[Stroke::Heng], Shape::Whole);
        assert_eq!(result, Err(EncodeError::EmptyZigen));
    }

    // ---- Coverage fill: the helper accessors, the error formatting and the
    // 2/3-字根 branches get direct unit tests here, so a regression in one of
    // these surfaces shows up under its own test name.

    #[test]
    fn stroke_shape_from_u8_round_trip_and_reject() {
        let strokes = [
            Stroke::Heng,
            Stroke::Shu,
            Stroke::Pie,
            Stroke::Na,
            Stroke::Zhe,
        ];
        for (raw, stroke) in (1u8..).zip(strokes) {
            assert_eq!(Stroke::from_u8(raw), Some(stroke));
        }
        for bad in [0u8, 6] {
            assert_eq!(Stroke::from_u8(bad), None);
        }

        let shapes = [Shape::LeftRight, Shape::TopBottom, Shape::Whole];
        for (raw, shape) in (1u8..).zip(shapes) {
            assert_eq!(Shape::from_u8(raw), Some(shape));
        }
        for bad in [0u8, 4] {
            assert_eq!(Shape::from_u8(bad), None);
        }
    }

    #[test]
    fn decomp_ref_stroke_accessors() {
        // Three strokes: every accessor sees a different element.
        assert_eq!(
            stroke_ends(&[Stroke::Heng, Stroke::Shu, Stroke::Pie]),
            (Some(Stroke::Heng), Some(Stroke::Shu), Some(Stroke::Pie))
        );
        // One stroke: first and last coincide, there is no second.
        assert_eq!(
            stroke_ends(&[Stroke::Heng]),
            (Some(Stroke::Heng), None, Some(Stroke::Heng))
        );
        // No strokes: every accessor reports `None`.
        assert_eq!(stroke_ends(&[]), (None, None, None));
    }

    #[test]
    fn encode_error_display_messages() {
        let cases = [
            (EncodeError::EmptyZigen, "empty zigen sequence"),
            (EncodeError::UnknownZigen('🦀'), "unknown zigen: 🦀"),
            (EncodeError::MissingStroke, "decomp has no strokes"),
        ];
        for (error, message) in cases {
            assert_eq!(error.to_string(), message);
        }
    }

    #[test]
    fn two_zigen_emits_three_codes_with_shibie() {
        // 2-字根: key + key + 识别码(last stroke, shape). 一+一 maps both 字根
        // to 'g' in dummy(); last stroke Heng with shape Whole gives 'd'.
        let (buf, len) =
            encode(&['一', '一'], &[Stroke::Heng, Stroke::Heng], Shape::Whole).unwrap();
        assert_eq!(&buf[..len], b"ggd");
    }

    #[test]
    fn three_zigen_emits_four_codes_with_shibie() {
        // 3-字根: three keys + 识别码. Three 一 give "ggg"; last stroke Heng
        // with shape Whole appends 'd'.
        let (buf, len) = encode(
            &['一', '一', '一'],
            &[Stroke::Heng, Stroke::Heng, Stroke::Heng],
            Shape::Whole,
        )
        .unwrap();
        assert_eq!(&buf[..len], b"gggd");
    }

    #[test]
    fn two_zigen_missing_stroke_errors_out() {
        let result = encode(&['一', '一'], &[], Shape::Whole);
        assert_eq!(result, Err(EncodeError::MissingStroke));
    }
}