// wubi/decomp.rs

//! Owned character-decomposition data (`char → Decomp`) and a parser for
//! `data/seed.txt`.
//!
//! The runtime encoder works against the borrowed [`crate::codec::DecompRef`].
//! `Decomp` is the convenient owned counterpart used by tools and tests; it
//! exposes `as_ref()` to feed the codec.

8use crate::codec::{DecompRef, Shape, Stroke};
9
10const SEED_TXT: &str = include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/data/seed.txt"));
11
12/// Owned counterpart of [`DecompRef`]. Use [`Decomp::as_ref`] to feed the
13/// runtime encoder; the encoder itself only ever sees the borrowed form.
14#[derive(Debug, Clone)]
15pub struct Decomp {
16    /// Ordered 字根 sequence (1..=N).
17    pub zigen: Vec<char>,
18    /// Stroke sequence, used for 成字字根 and 末笔识别码 rules.
19    pub strokes: Vec<Stroke>,
20    /// 字形 (left-right / top-bottom / whole).
21    pub shape: Shape,
22}
23
24impl Decomp {
25    /// Borrow this `Decomp` as a [`DecompRef`] for the encoder.
26    #[inline]
27    pub fn as_ref(&self) -> DecompRef<'_> {
28        DecompRef {
29            zigen: &self.zigen,
30            strokes: &self.strokes,
31            shape: self.shape,
32        }
33    }
34
35    /// First stroke or `None` if the stroke list is empty.
36    pub fn first_stroke(&self) -> Option<Stroke> {
37        self.strokes.first().copied()
38    }
39    /// Last stroke or `None` if the stroke list is empty.
40    pub fn last_stroke(&self) -> Option<Stroke> {
41        self.strokes.last().copied()
42    }
43}
44
45/// Parse a seed-format TSV string into `(char, Decomp)` pairs.
46///
47/// Format: each non-blank, non-`#` line is `<char>\t<zigen>\t<strokes>\t<shape>`,
48/// where `zigen` is whitespace-separated 字根, `strokes` is whitespace-
49/// separated digits 1..=5, and `shape` is one of 1..=3. Malformed lines are
50/// silently skipped (consistent with the build-time parser in `build.rs`).
51pub fn parse_seed(src: &str) -> Vec<(char, Decomp)> {
52    let mut out = Vec::new();
53    for raw in src.lines() {
54        let line = raw.trim();
55        if line.is_empty() || line.starts_with('#') {
56            continue;
57        }
58        let mut parts = line.splitn(4, '\t');
59        let (Some(ch), Some(zg), Some(strokes_field), Some(shape)) =
60            (parts.next(), parts.next(), parts.next(), parts.next())
61        else {
62            continue;
63        };
64        let ch = ch.trim();
65        if ch.chars().count() != 1 {
66            continue;
67        }
68        let ch = ch.chars().next().unwrap();
69        let zigen: Vec<char> = zg.split_whitespace().flat_map(|s| s.chars()).collect();
70        if zigen.is_empty() {
71            continue;
72        }
73        let strokes: Vec<Stroke> = strokes_field
74            .split_whitespace()
75            .filter_map(|s| s.parse::<u8>().ok().and_then(Stroke::from_u8))
76            .collect();
77        if strokes.is_empty() {
78            continue;
79        }
80        let Ok(p) = shape.trim().parse::<u8>() else {
81            continue;
82        };
83        let Some(shape) = Shape::from_u8(p) else {
84            continue;
85        };
86        out.push((
87            ch,
88            Decomp {
89                zigen,
90                strokes,
91                shape,
92            },
93        ));
94    }
95    out
96}
97
98/// Parse the seed bundled into the binary at compile time.
99/// Equivalent to `parse_seed(include_str!("data/seed.txt"))`.
100pub fn embedded_seed() -> Vec<(char, Decomp)> {
101    parse_seed(SEED_TXT)
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test over the embedded seed: it must hold at least 25 entries and
    /// start with '王', whose only root is itself and whose first stroke is
    /// `Stroke::Heng`.
    #[test]
    fn parses_seed_entries() {
        let entries = embedded_seed();
        assert!(
            entries.len() >= 25,
            "expected at least 25 seed entries, got {}",
            entries.len()
        );
        let (ch, d) = entries.first().expect("seed must not be empty");
        assert_eq!(*ch, '王');
        assert_eq!(d.zigen, vec!['王']);
        assert_eq!(d.first_stroke(), Some(Stroke::Heng));
    }
}