1mod token;
2mod trie;
3mod utils;
4mod yale;
5use std::sync::LazyLock;
6
7use yale::{jyutping_to_yale, jyutping_to_yale_vec};
8
9use token::Token;
10use trie::Trie;
11use wasm_minimal_protocol::*;
12
13initiate_protocol!();
14
15const TRIE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trie.dat"));
16static TRIE: LazyLock<Trie> = LazyLock::new(build_trie);
17
18fn build_trie() -> Trie {
19 let mut data_ptr = TRIE_DATA;
20 let decomp = zstd::decode_all(&mut data_ptr).expect("Failed to decompress trie data");
21 postcard::from_bytes(&decomp).expect("Failed to deserialize trie data")
22}
23
24#[wasm_func]
25pub fn annotate(input: &[u8]) -> Vec<u8> {
26 let text = std::str::from_utf8(input).unwrap_or("");
27 let tokens = TRIE.segment(text);
28
29 let output: Vec<Token> = tokens
30 .into_iter()
31 .map(|t| Token {
32 word: t.word,
33 yale: t.reading.as_deref().and_then(jyutping_to_yale_vec),
34 reading: t.reading,
35 })
36 .collect();
37
38 serde_json::to_string(&output)
39 .unwrap_or_else(|_| "[]".to_string())
40 .into_bytes()
41}
42
43#[wasm_func]
46pub fn to_yale_numeric(input: &[u8]) -> Vec<u8> {
47 let jp = std::str::from_utf8(input).unwrap_or("");
48 jyutping_to_yale(jp, false).unwrap_or_default().into_bytes()
49}
50
51#[wasm_func]
54pub fn to_yale_diacritics(input: &[u8]) -> Vec<u8> {
55 let jp = std::str::from_utf8(input).unwrap_or("");
56 jyutping_to_yale(jp, true).unwrap_or_default().into_bytes()
57}
58
#[cfg(test)]
mod tests {
    use super::*;

    /// One expected token: the segmented word and its reading, if any.
    type ExpectedToken = (&'static str, Option<&'static str>);

    /// Segmentation fixtures: an input string paired with the tokens the trie
    /// is expected to produce for it, in order.
    const CASES: &[(&str, &[ExpectedToken])] = &[
        // Plain Chinese text with a multi-character word at the end.
        (
            "佢係好學生",
            &[
                ("佢", Some("keoi5")),
                ("係", Some("hai6")),
                ("好", Some("hou2")),
                ("學生", Some("hok6 saang1")),
            ],
        ),
        // Digits, a symbol and a Latin/Chinese mixed word.
        (
            "都會大學入面3%人識用AB膠",
            &[
                ("都會大學", Some("dou1 wui6 daai6 hok6")),
                ("入面", Some("jap6 min6")),
                ("3", None),
                ("%", Some("pat6 sen1")),
                ("人", Some("jan4")),
                ("識", Some("sik1")),
                ("用", Some("jung6")),
                ("AB膠", Some("ei1 bi1 gaau1")),
            ],
        ),
        // A Latin run with no reading stays one token.
        ("abc", &[("abc", None)]),
        // A Latin string that does have a reading.
        ("ge", &[("ge", Some("ge3"))]),
        // Latin run, whitespace, then Chinese characters.
        (
            "ABCD 一二",
            &[
                ("ABCD", None),
                (" ", None),
                ("一", Some("jat1")),
                ("二", Some("ji6")),
            ],
        ),
        // Accented Latin word followed by a Chinese character.
        (
            "café好",
            &[("café", Some("kat6 fei1")), ("好", Some("hou2"))],
        ),
        // Word containing a hyphenated Latin part.
        (
            "我做part-time",
            &[
                ("我", Some("ngo5")),
                ("做part-time", Some("zou6 paat1 taai1")),
            ],
        ),
        // Whole mixed-script phrase expected as a single token.
        (
            "Hap唔Happy呀",
            &[("Hap唔Happy呀", Some("hep1 m4 hep1 pi2 aa3"))],
        ),
        // A newline is expected as its own token without a reading.
        (
            "你好\n世界",
            &[
                ("你", Some("nei5")),
                ("好", Some("hou2")),
                ("\n", None),
                ("世界", Some("sai3 gaai3")),
            ],
        ),
    ];

    /// Checks token count, words and readings for every fixture in `CASES`.
    #[test]
    fn test_segmentation() {
        let trie = build_trie();

        for &(input, expected) in CASES {
            println!("Testing: {}", input);
            let result = trie.segment(input);

            // Only invoked when the count assertion fails, to list what was produced.
            let produced_words = || {
                let quoted: Vec<String> =
                    result.iter().map(|t| format!("{:?}", t.word)).collect();
                quoted.join(", ")
            };
            assert_eq!(
                result.len(),
                expected.len(),
                "token count mismatch for {:?}: got [{}]",
                input,
                produced_words()
            );

            // Lengths are equal past this point, so zipping visits every token.
            let pairs = result.iter().zip(expected.iter());
            for (i, (token, &(want_word, want_reading))) in pairs.enumerate() {
                assert_eq!(
                    token.word, want_word,
                    "word mismatch at index {} for {:?}",
                    i, input
                );
                assert_eq!(
                    token.reading.as_deref(),
                    want_reading,
                    "reading mismatch at index {} for {:?} (word={:?})",
                    i,
                    input,
                    token.word
                );
            }
        }
    }
}