1mod trie;
2mod token;
3mod utils;
4mod yale;
5use yale::{jyutping_to_yale, jyutping_to_yale_vec};
6
7use trie::Trie;
8use token::Token;
9use once_cell::sync::Lazy;
10use wasm_minimal_protocol::*;
11
12const CHAR_DATA: &str = include_str!("../data/chars.tsv");
13const WORD_DATA: &str = include_str!("../data/words.tsv");
14const FREQ_DATA: &str = include_str!("../data/freq.txt");
15const LETTERED_DATA: &str = include_str!("../data/lettered.tsv");
16
17initiate_protocol!();
18
19static TRIE: Lazy<Trie> = Lazy::new(|| build_trie());
20
21fn build_trie() -> Trie {
22 let mut trie = Trie::new();
23
24 for line in CHAR_DATA.lines() {
25 let parts: Vec<&str> = line.split('\t').collect();
26 if parts.len() >= 2 {
27 if let Some(ch) = parts[0].chars().next() {
28 let weight = parts.get(2)
30 .map(|s| s.replace('%', "").trim().parse::<u32>().unwrap_or(0))
31 .unwrap_or(100);
32 trie.insert_char(ch, parts[1], weight);
33 }
34 }
35 }
36
37 for line in WORD_DATA.lines() {
38 let Some((left, right)) = line.split_once('\t') else {
39 continue;
40 };
41 trie.insert_word(left, right);
42 }
43
44 for line in FREQ_DATA.lines() {
45 let parts: Vec<&str> = line.split('\t').collect();
46 if parts.len() >= 2 {
47 if let Ok(freq) = parts[1].parse::<i64>() {
48 trie.insert_freq(parts[0], freq);
49 }
50 }
51 }
52
53 for line in LETTERED_DATA.lines() {
54 let Some((left, right)) = line.split_once('\t') else {
55 continue;
56 };
57 trie.insert_lettered(left, right);
58 }
59
60 trie
61}
62
63#[wasm_func]
64pub fn annotate(input: &[u8]) -> Vec<u8> {
65 let text = std::str::from_utf8(input).unwrap_or("");
66 let tokens = TRIE.segment(text);
67
68 let output: Vec<Token> = tokens
69 .into_iter()
70 .map(|t| Token {
71 word: t.word,
72 yale: t.reading.as_deref().and_then(jyutping_to_yale_vec),
73 reading: t.reading,
74 })
75 .collect();
76
77 serde_json::to_string(&output)
78 .unwrap_or_else(|_| "[]".to_string())
79 .into_bytes()
80}
81
82#[wasm_func]
85pub fn to_yale_numeric(input: &[u8]) -> Vec<u8> {
86 let jp = std::str::from_utf8(input).unwrap_or("");
87 jyutping_to_yale(jp, false)
88 .unwrap_or_default()
89 .into_bytes()
90}
91
92#[wasm_func]
95pub fn to_yale_diacritics(input: &[u8]) -> Vec<u8> {
96 let jp = std::str::from_utf8(input).unwrap_or("");
97 jyutping_to_yale(jp, true)
98 .unwrap_or_default()
99 .into_bytes()
100}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// One expected token: the surface word and its reading, if any.
    type Expected<'a> = (&'a str, Option<&'a str>);

    /// Segments `input` with `trie` and checks the result against `expected`,
    /// first by token count and then token by token (word, then reading).
    fn assert_segments(trie: &Trie, input: &str, expected: &[Expected<'_>]) {
        println!("Testing: {}", input);
        let tokens = trie.segment(input);

        // Compare lengths first so the pairwise walk below cannot silently
        // ignore surplus or missing tokens.
        assert_eq!(
            tokens.len(),
            expected.len(),
            "token count mismatch for {:?}: got [{}]",
            input,
            tokens
                .iter()
                .map(|token| format!("{:?}", token.word))
                .collect::<Vec<_>>()
                .join(", ")
        );

        for (index, (token, &(word, reading))) in tokens.iter().zip(expected).enumerate() {
            assert_eq!(
                token.word, word,
                "word mismatch at index {} for {:?}",
                index, input
            );
            assert_eq!(
                token.reading.as_deref(),
                reading,
                "reading mismatch at index {} for {:?} (word={:?})",
                index,
                input,
                token.word
            );
        }
    }

    /// Runs the segmenter over a fixed set of inputs built from the embedded
    /// data and checks every produced token and reading.
    #[test]
    fn test_segmentation() {
        let trie = build_trie();

        let cases: Vec<(&str, Vec<Expected<'_>>)> = vec![
            (
                "佢係好學生",
                vec![
                    ("佢", Some("keoi5")),
                    ("係", Some("hai6")),
                    ("好", Some("hou2")),
                    ("學生", Some("hok6 saang1")),
                ],
            ),
            (
                "都會大學入面3%人識用AB膠",
                vec![
                    ("都會大學", Some("dou1 wui6 daai6 hok6")),
                    ("入面", Some("jap6 min6")),
                    ("3", None),
                    ("%", Some("pat6 sen1")),
                    ("人", Some("jan4")),
                    ("識", Some("sik1")),
                    ("用", Some("jung6")),
                    ("AB膠", Some("ei1 bi1 gaau1")),
                ],
            ),
            ("abc", vec![("abc", None)]),
            ("ge", vec![("ge", Some("ge3"))]),
            (
                "ABCD 一二",
                vec![
                    ("ABCD", None),
                    (" ", None),
                    ("一", Some("jat1")),
                    ("二", Some("ji6")),
                ],
            ),
            (
                "café好",
                vec![
                    ("café", Some("kat6 fei1")),
                    ("好", Some("hou2")),
                ],
            ),
            (
                "我做part-time",
                vec![
                    ("我", Some("ngo5")),
                    ("做part-time", Some("zou6 paat1 taai1")),
                ],
            ),
            (
                "Hap唔Happy呀",
                vec![("Hap唔Happy呀", Some("hep1 m4 hep1 pi2 aa3"))],
            ),
            (
                "你好\n世界",
                vec![
                    ("你", Some("nei5")),
                    ("好", Some("hou2")),
                    ("\n", None),
                    ("世界", Some("sai3 gaai3")),
                ],
            ),
        ];

        for (input, expected) in &cases {
            assert_segments(&trie, input, expected);
        }
    }
}