1use alloc::collections::BTreeMap;
9use alloc::string::String;
10
/// Raw bytes of `tnc_freq.bin`, embedded at compile time from the directory
/// named by the `OUT_DIR` environment variable.
///
/// [`FreqMap::builtin`] hands these bytes to `crate::decompress_builtin` and
/// parses the result as tab-separated `word<TAB>count` lines.
// NOTE(review): the file is presumably generated by the crate's build script — confirm.
static BUILTIN_FREQ_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/tnc_freq.bin"));
12
/// Word-frequency lookup table mapping a word to a `u32` count.
///
/// Backed by a `BTreeMap<String, u32>`. Build one with [`FreqMap::from_tsv`]
/// or [`FreqMap::builtin`]; query it with [`FreqMap::get`] and
/// [`FreqMap::max_freq`].
pub struct FreqMap(BTreeMap<String, u32>);
21
22impl FreqMap {
23 pub fn from_tsv(data: &str) -> Self {
25 let mut map = BTreeMap::new();
26 for line in data.lines() {
27 if let Some((word, freq_str)) = line.split_once('\t') {
28 if let Ok(freq) = freq_str.trim().parse::<u32>() {
29 map.insert(String::from(word), freq);
30 }
31 }
32 }
33 FreqMap(map)
34 }
35
36 pub fn builtin() -> Self {
44 Self::from_tsv(&crate::decompress_builtin(BUILTIN_FREQ_DATA))
45 }
46
47 #[inline]
49 pub fn get(&self, word: &str) -> u32 {
50 self.0.get(word).copied().unwrap_or(0)
51 }
52
53 pub fn max_freq(&self) -> u32 {
70 self.0.values().copied().max().unwrap_or(0)
71 }
72}
73
#[cfg(test)]
mod tests {
    //! Unit tests for [`FreqMap`]: TSV parsing rules, lookup defaults,
    //! `max_freq`, and sanity checks against the bundled frequency data.

    use super::*;

    // ---- from_tsv: parsing rules ----

    /// Well-formed `word<TAB>count` lines are all stored.
    #[test]
    fn parses_tab_separated_entries() {
        let m = FreqMap::from_tsv("กิน\t1234\nข้าว\t5678\n");
        assert_eq!(m.get("กิน"), 1234);
        assert_eq!(m.get("ข้าว"), 5678);
    }

    /// Empty lines around an entry do not prevent it from being parsed.
    #[test]
    fn blank_lines_are_skipped() {
        let m = FreqMap::from_tsv("\n\nกิน\t10\n\n");
        assert_eq!(m.get("กิน"), 10);
    }

    /// A line with no tab contributes nothing to the table.
    #[test]
    fn line_without_tab_is_skipped() {
        let m = FreqMap::from_tsv("noop\nกิน\t42\n");
        assert_eq!(m.get("noop"), 0);
        assert_eq!(m.get("กิน"), 42);
    }

    /// A count that is not a valid `u32` drops the line without affecting others.
    #[test]
    fn non_numeric_count_is_skipped() {
        let m = FreqMap::from_tsv("กิน\tabc\nข้าว\t99\n");
        assert_eq!(m.get("กิน"), 0);
        assert_eq!(m.get("ข้าว"), 99);
    }

    /// When a word is listed twice, the later count replaces the earlier one.
    #[test]
    fn later_duplicate_overwrites_earlier() {
        let m = FreqMap::from_tsv("กิน\t10\nกิน\t99\n");
        assert_eq!(m.get("กิน"), 99);
    }

    /// Whitespace around the count field is ignored.
    #[test]
    fn whitespace_trimmed_from_count() {
        let m = FreqMap::from_tsv("กิน\t 42 \n");
        assert_eq!(m.get("กิน"), 42);
    }

    // ---- get: defaults ----

    /// Looking up a word that was never inserted yields zero.
    #[test]
    fn unknown_word_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get("xyz"), 0);
    }

    /// Looking up the empty string yields zero when no such entry exists.
    #[test]
    fn empty_lookup_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get(""), 0);
    }

    /// Empty input yields a table with no entries at all.
    #[test]
    fn empty_input_produces_empty_map() {
        let m = FreqMap::from_tsv("");
        // Check the map itself, not just one lookup, so a stray entry is caught.
        assert!(m.0.is_empty());
        assert_eq!(m.get("กิน"), 0);
    }

    // ---- max_freq ----

    /// `max_freq` reports the largest count regardless of entry order.
    #[test]
    fn max_freq_returns_largest_count() {
        let m = FreqMap::from_tsv("กิน\t10\nข้าว\t99\nไป\t5\n");
        assert_eq!(m.max_freq(), 99);
    }

    /// An empty table has a maximum of zero rather than panicking.
    #[test]
    fn max_freq_of_empty_map_is_zero() {
        let m = FreqMap::from_tsv("");
        assert_eq!(m.max_freq(), 0);
    }

    /// A count that was overwritten by a later duplicate no longer influences the maximum.
    #[test]
    fn max_freq_ignores_overwritten_duplicate() {
        let m = FreqMap::from_tsv("กิน\t500\nกิน\t3\nข้าว\t7\n");
        assert_eq!(m.max_freq(), 7);
    }

    // ---- builtin data ----

    /// Loading the bundled data completes without panicking.
    #[test]
    fn builtin_loads_without_panic() {
        let _ = FreqMap::builtin();
    }

    /// The bundled table is large enough to be the full data set.
    #[test]
    fn builtin_has_expected_entry_count() {
        let m = FreqMap::builtin();
        let count = m.0.len();
        assert!(count > 100_000, "expected >100k TNC entries, got {count}");
    }

    /// A handful of everyday words all have a recorded frequency.
    #[test]
    fn builtin_common_words_have_nonzero_freq() {
        let m = FreqMap::builtin();
        for word in &["กิน", "ข้าว", "ไป", "มา", "คน", "ที่", "นี้"] {
            assert!(
                m.get(word) > 0,
                "expected '{word}' to have non-zero TNC freq"
            );
        }
    }

    /// A nonsense letter sequence is absent from the bundled table.
    #[test]
    fn builtin_unknown_word_returns_zero() {
        let m = FreqMap::builtin();
        assert_eq!(m.get("กขคงจฉชซ"), 0);
    }

    /// A very common word has a higher count than a rare one.
    #[test]
    fn builtin_high_freq_words_outrank_rare_words() {
        let m = FreqMap::builtin();
        assert!(
            m.get("ที่") > m.get("มะม่วงหิมพานต์"),
            "expected 'ที่' to have higher TNC freq than 'มะม่วงหิมพานต์'"
        );
    }

    /// The bundled maximum is positive and bounds every individual lookup.
    #[test]
    fn builtin_max_freq_bounds_every_lookup() {
        let m = FreqMap::builtin();
        let top = m.max_freq();
        assert!(top > 0);
        for word in &["กิน", "ที่", "กขคงจฉชซ"] {
            assert!(m.get(word) <= top);
        }
    }

    // ---- tokenizer integration ----

    /// Segmenting a compound yields the single compound token, not its parts.
    // NOTE(review): this exercises `crate::Tokenizer` rather than `FreqMap`
    // directly; presumably the tokenizer consults the bundled frequencies — confirm.
    #[test]
    fn fewer_tokens_preferred_over_split_components() {
        use crate::Tokenizer;
        use alloc::vec::Vec;
        let tok = Tokenizer::new();
        let tokens = tok.segment("ตากลม");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        assert_eq!(
            words,
            alloc::vec!["ตากลม"],
            "compound word should be preferred over split — got {words:?}"
        );
    }
}