//! rust_canto: Cantonese text segmentation and romanization (Jyutping / Yale),
//! compiled to WASM.
mod token;
mod trie;
mod utils;
mod yale;

use std::sync::LazyLock;

use wasm_minimal_protocol::*;

use token::Token;
use trie::Trie;
use yale::{jyutping_to_yale, jyutping_to_yale_vec};
12
13initiate_protocol!();
14
15const TRIE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trie.dat"));
16static TRIE: LazyLock<Trie> = LazyLock::new(build_trie);
17
18fn build_trie() -> Trie {
19    let mut data_ptr = TRIE_DATA;
20    let decomp = zstd::decode_all(&mut data_ptr).expect("Failed to decompress trie data");
21    postcard::from_bytes(&decomp).expect("Failed to deserialize trie data")
22}
23
24#[wasm_func]
25pub fn annotate(input: &[u8]) -> Vec<u8> {
26    let text = std::str::from_utf8(input).unwrap_or("");
27    let tokens = TRIE.segment(text);
28
29    let output: Vec<Token> = tokens
30        .into_iter()
31        .map(|t| Token {
32            word: t.word,
33            yale: t.reading.as_deref().and_then(jyutping_to_yale_vec),
34            reading: t.reading,
35        })
36        .collect();
37
38    serde_json::to_string(&output)
39        .unwrap_or_else(|_| "[]".to_string())
40        .into_bytes()
41}
42
43/// Input: jyutping bytes, e.g. b"gwong2 dung1 waa2"
44/// Output: Yale with tone numbers, e.g. b"gwong2 dung1 waa2"
45#[wasm_func]
46pub fn to_yale_numeric(input: &[u8]) -> Vec<u8> {
47    let jp = std::str::from_utf8(input).unwrap_or("");
48    jyutping_to_yale(jp, false).unwrap_or_default().into_bytes()
49}
50
51/// Input: jyutping bytes
52/// Output: Yale with diacritics, e.g. b"gwóngdūngwá"
53#[wasm_func]
54pub fn to_yale_diacritics(input: &[u8]) -> Vec<u8> {
55    let jp = std::str::from_utf8(input).unwrap_or("");
56    jyutping_to_yale(jp, true).unwrap_or_default().into_bytes()
57}
58
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end segmentation cases: plain CJK, lettered dictionary entries,
    /// mixed scripts, accented Latin runs, and whitespace/newline tokens.
    #[test]
    fn test_segmentation() {
        let trie = build_trie();

        let cases: &'static [(&str, &'static [(&str, Option<&str>)])] = &[
            // --- basic CJK ---
            (
                "佢係好學生",
                &[
                    ("佢", Some("keoi5")),
                    ("係", Some("hai6")),
                    ("好", Some("hou2")),
                    ("學生", Some("hok6 saang1")),
                ],
            ),
            // --- CJK + special chars + lettered dict (no space before AB膠) ---
            (
                "都會大學入面3%人識用AB膠",
                &[
                    ("都會大學", Some("dou1 wui6 daai6 hok6")),
                    ("入面", Some("jap6 min6")),
                    ("3", None),              // digit: alpha run, no dict entry
                    ("%", Some("pat6 sen1")), // single-char lettered entry
                    ("人", Some("jan4")),
                    ("識", Some("sik1")),
                    ("用", Some("jung6")),
                    ("AB膠", Some("ei1 bi1 gaau1")), // mixed lettered dict entry
                ],
            ),
            // --- pure alpha non-lettered-word run at start ---
            ("abc", &[("abc", None)]),
            // --- pure alpha lettered-word run at start ---
            ("ge", &[("ge", Some("ge3"))]),
            // --- alpha run beside CJK, with space ---
            (
                "ABCD 一二",
                &[
                    ("ABCD", None),
                    (" ", None),
                    ("一", Some("jat1")),
                    ("二", Some("ji6")),
                ],
            ),
            // --- accented letter in alpha run ---
            (
                "café好",
                &[("café", Some("kat6 fei1")), ("好", Some("hou2"))],
            ),
            // --- hyphenated lettered dict entry ---
            (
                "我做part-time",
                &[
                    ("我", Some("ngo5")),
                    ("做part-time", Some("zou6 paat1 taai1")),
                ],
            ),
            // --- mixed CJK+Latin lettered entry ---
            (
                "Hap唔Happy呀",
                &[("Hap唔Happy呀", Some("hep1 m4 hep1 pi2 aa3"))],
            ),
            // --- newline becomes its own token ---
            (
                "你好\n世界",
                &[
                    ("你", Some("nei5")),
                    ("好", Some("hou2")),
                    ("\n", None),
                    ("世界", Some("sai3 gaai3")),
                ],
            ),
        ];

        for (input, expected) in cases {
            println!("Testing: {}", input);
            let got = trie.segment(input);
            // Precompute the token word list so the count-mismatch message can
            // show exactly what was produced.
            let words: Vec<String> = got.iter().map(|t| format!("{:?}", t.word)).collect();
            assert_eq!(
                got.len(),
                expected.len(),
                "token count mismatch for {:?}: got [{}]",
                input,
                words.join(", ")
            );
            // Lengths match, so a pairwise zip covers every expected token.
            for (i, (token, want)) in got.iter().zip(expected.iter()).enumerate() {
                assert_eq!(
                    token.word, want.0,
                    "word mismatch at index {} for {:?}",
                    i, input
                );
                assert_eq!(
                    token.reading.as_deref(),
                    want.1,
                    "reading mismatch at index {} for {:?} (word={:?})",
                    i,
                    input,
                    token.word
                );
            }
        }
    }
}