//! `rust_canto` — lib.rs
//!
//! Cantonese text segmentation and jyutping→Yale conversion, exposed as
//! WASM entry points via `wasm_minimal_protocol`.
1mod trie;
2mod token;
3mod utils;
4mod yale;
5use yale::{jyutping_to_yale, jyutping_to_yale_vec};
6
7use trie::Trie;
8use token::Token;
9use once_cell::sync::Lazy;
10use wasm_minimal_protocol::*;
11
12const CHAR_DATA: &str = include_str!("../data/chars.tsv");
13const WORD_DATA: &str = include_str!("../data/words.tsv");
14const FREQ_DATA: &str = include_str!("../data/freq.txt");
15const LETTERED_DATA: &str = include_str!("../data/lettered.tsv");
16
17initiate_protocol!();
18
19static TRIE: Lazy<Trie> = Lazy::new(|| build_trie());
20
21fn build_trie() -> Trie {
22    let mut trie = Trie::new();
23
24    for line in CHAR_DATA.lines() {
25        let parts: Vec<&str> = line.split('\t').collect();
26        if parts.len() >= 2 {
27            if let Some(ch) = parts[0].chars().next() {
28                // parse "5%" → 5, missing → 100 (highest priority)
29                let weight = parts.get(2)
30                    .map(|s| s.replace('%', "").trim().parse::<u32>().unwrap_or(0))
31                    .unwrap_or(100);
32                trie.insert_char(ch, parts[1], weight);
33            }
34        }
35    }
36
37    for line in WORD_DATA.lines() {
38        let Some((left, right)) = line.split_once('\t') else {
39            continue;
40        };
41        trie.insert_word(left, right);
42    }
43
44    for line in FREQ_DATA.lines() {
45        let parts: Vec<&str> = line.split('\t').collect();
46        if parts.len() >= 2 {
47            if let Ok(freq) = parts[1].parse::<i64>() {
48                trie.insert_freq(parts[0], freq);
49            }
50        }
51    }
52
53    for line in LETTERED_DATA.lines() {
54        let Some((left, right)) = line.split_once('\t') else {
55            continue;
56        };
57        trie.insert_lettered(left, right);
58    }
59
60    trie
61}
62
63#[wasm_func]
64pub fn annotate(input: &[u8]) -> Vec<u8> {
65    let text = std::str::from_utf8(input).unwrap_or("");
66    let tokens = TRIE.segment(text);
67
68    let output: Vec<Token> = tokens
69        .into_iter()
70        .map(|t| Token {
71            word: t.word,
72            yale: t.reading.as_deref().and_then(jyutping_to_yale_vec),
73            reading: t.reading,
74        })
75        .collect();
76
77    serde_json::to_string(&output)
78        .unwrap_or_else(|_| "[]".to_string())
79        .into_bytes()
80}
81
82/// Input: jyutping bytes, e.g. b"gwong2 dung1 waa2"
83/// Output: Yale with tone numbers, e.g. b"gwong2 dung1 waa2"
84#[wasm_func]
85pub fn to_yale_numeric(input: &[u8]) -> Vec<u8> {
86    let jp = std::str::from_utf8(input).unwrap_or("");
87    jyutping_to_yale(jp, false)
88        .unwrap_or_default()
89        .into_bytes()
90}
91
92/// Input: jyutping bytes
93/// Output: Yale with diacritics, e.g. b"gwóngdūngwá"
94#[wasm_func]
95pub fn to_yale_diacritics(input: &[u8]) -> Vec<u8> {
96    let jp = std::str::from_utf8(input).unwrap_or("");
97    jyutping_to_yale(jp, true)
98        .unwrap_or_default()
99        .into_bytes()
100}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven segmentation test: each case pairs an input string with
    /// the exact token sequence expected from `Trie::segment`, as
    /// `(word, Option<jyutping reading>)`. Covers plain CJK, mixed
    /// CJK/Latin lettered-dictionary entries, digits, punctuation,
    /// accented Latin, hyphenated entries, whitespace, and newlines.
    #[test]
    fn test_segmentation() {
        // Build a fresh trie rather than touching the lazy static, so the
        // test doesn't depend on initialization order.
        let trie = build_trie();

        let cases: Vec<(&str, Vec<(&str, Option<&str>)>)> = vec![
            // --- basic CJK ---
            (
                "佢係好學生",
                vec![
                    ("佢",   Some("keoi5")),
                    ("係",   Some("hai6")),
                    ("好",   Some("hou2")),
                    ("學生", Some("hok6 saang1")),
                ],
            ),
            // --- CJK + special chars + lettered dict (no space before AB膠) ---
            (
                "都會大學入面3%人識用AB膠",
                vec![
                    ("都會大學", Some("dou1 wui6 daai6 hok6")),
                    ("入面",     Some("jap6 min6")),
                    ("3",        None),               // digit: alpha run, no dict entry
                    ("%",        Some("pat6 sen1")),   // single-char lettered entry
                    ("人",       Some("jan4")),
                    ("識",       Some("sik1")),
                    ("用",       Some("jung6")),
                    ("AB膠",     Some("ei1 bi1 gaau1")), // mixed lettered dict entry
                ],
            ),
            // --- pure alpha non-lettered-word run at start ---
            (
                "abc",
                vec![
                    ("abc", None),
                ],
            ),
            // --- pure alpha lettered-word run at start ---
            (
                "ge",
                vec![
                    ("ge", Some("ge3")),
                ],
            ),
            // --- alpha run beside CJK, with space ---
            (
                "ABCD 一二",
                vec![
                    ("ABCD", None),
                    (" ",    None),
                    ("一",   Some("jat1")),
                    ("二",   Some("ji6")),
                ],
            ),
            // --- accented letter in alpha run ---
            (
                "café好",
                vec![
                    ("café", Some("kat6 fei1")),
                    ("好",   Some("hou2")),
                ],
            ),
            // --- hyphenated lettered dict entry ---
            (
                "我做part-time",
                vec![
                    ("我",        Some("ngo5")),
                    ("做part-time", Some("zou6 paat1 taai1")),
                ],
            ),
            // --- mixed CJK+Latin lettered entry ---
            (
                "Hap唔Happy呀",
                vec![
                    ("Hap唔Happy呀", Some("hep1 m4 hep1 pi2 aa3")),
                ],
            ),
            // --- newline becomes its own token ---
            (
                "你好\n世界",
                vec![
                    ("你", Some("nei5")),
                    ("好", Some("hou2")),
                    ("\n",   None),
                    ("世界", Some("sai3 gaai3")),
                ],
            ),
        ];

        for (input, expected) in &cases {
            println!("Testing: {}", input);
            let result = trie.segment(input);
            // Check the token count first so a length mismatch produces a
            // readable dump of the actual segmentation.
            assert_eq!(
                result.len(), expected.len(),
                "token count mismatch for {:?}: got [{}]",
                input,
                result.iter().map(|t| format!("{:?}", t.word)).collect::<Vec<_>>().join(", ")
            );
            // Then compare each token's surface form and reading in order.
            for (i, token) in result.iter().enumerate() {
                assert_eq!(
                    token.word, expected[i].0,
                    "word mismatch at index {} for {:?}", i, input
                );
                assert_eq!(
                    token.reading.as_deref(), expected[i].1,
                    "reading mismatch at index {} for {:?} (word={:?})", i, input, token.word
                );
            }
        }
    }
}
215}