1use alloc::collections::BTreeMap;
9use alloc::string::String;
10
11static BUILTIN_FREQ_DATA: &str = include_str!("../data/tnc_freq.txt");
12
/// Word-to-frequency lookup table.
///
/// Wraps an ordered map from a word to its `u32` count. Instances are built
/// with [`FreqMap::from_tsv`] or [`FreqMap::builtin`] and queried with
/// [`FreqMap::get`].
#[derive(Debug, Clone)]
pub struct FreqMap(BTreeMap<String, u32>);
21
22impl FreqMap {
23 pub fn from_tsv(data: &str) -> Self {
25 let mut map = BTreeMap::new();
26 for line in data.lines() {
27 if let Some((word, freq_str)) = line.split_once('\t') {
28 if let Ok(freq) = freq_str.trim().parse::<u32>() {
29 map.insert(String::from(word), freq);
30 }
31 }
32 }
33 FreqMap(map)
34 }
35
36 pub fn builtin() -> Self {
44 Self::from_tsv(BUILTIN_FREQ_DATA)
45 }
46
47 #[inline]
49 pub fn get(&self, word: &str) -> u32 {
50 self.0.get(word).copied().unwrap_or(0)
51 }
52}
53
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed `word<TAB>count` lines are stored under their word.
    #[test]
    fn parses_tab_separated_entries() {
        let m = FreqMap::from_tsv("กิน\t1234\nข้าว\t5678\n");
        assert_eq!(m.get("กิน"), 1234);
        assert_eq!(m.get("ข้าว"), 5678);
        assert_eq!(m.0.len(), 2);
    }

    /// Blank lines contribute no entries and do not disturb their neighbours.
    #[test]
    fn blank_lines_are_skipped() {
        let m = FreqMap::from_tsv("\n\nกิน\t10\n\n");
        assert_eq!(m.get("กิน"), 10);
        assert_eq!(m.0.len(), 1);
    }

    /// A line with no tab separator is dropped entirely.
    #[test]
    fn line_without_tab_is_skipped() {
        let m = FreqMap::from_tsv("noop\nกิน\t42\n");
        assert_eq!(m.get("noop"), 0);
        assert_eq!(m.get("กิน"), 42);
        assert_eq!(m.0.len(), 1);
    }

    /// A count that is not a number drops the line instead of storing zero.
    #[test]
    fn non_numeric_count_is_skipped() {
        let m = FreqMap::from_tsv("กิน\tabc\nข้าว\t99\n");
        assert_eq!(m.get("กิน"), 0);
        assert_eq!(m.get("ข้าว"), 99);
        assert_eq!(m.0.len(), 1);
    }

    /// When a word repeats, the last line wins and only one entry is kept.
    #[test]
    fn later_duplicate_overwrites_earlier() {
        let m = FreqMap::from_tsv("กิน\t10\nกิน\t99\n");
        assert_eq!(m.get("กิน"), 99);
        assert_eq!(m.0.len(), 1);
    }

    /// Whitespace around the count is ignored when parsing it.
    #[test]
    fn whitespace_trimmed_from_count() {
        let m = FreqMap::from_tsv("กิน\t 42 \n");
        assert_eq!(m.get("กิน"), 42);
    }

    /// Looking up a word that was never loaded yields zero.
    #[test]
    fn unknown_word_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get("xyz"), 0);
    }

    /// Looking up the empty string yields zero.
    #[test]
    fn empty_lookup_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get(""), 0);
    }

    /// Empty input yields a map with no entries at all.
    #[test]
    fn empty_input_produces_empty_map() {
        let m = FreqMap::from_tsv("");
        assert!(m.0.is_empty());
        assert_eq!(m.get("กิน"), 0);
    }

    /// Parsing the embedded table must not panic.
    #[test]
    fn builtin_loads_without_panic() {
        let _ = FreqMap::builtin();
    }

    /// The embedded table is expected to hold more than 100k entries.
    #[test]
    fn builtin_has_expected_entry_count() {
        let m = FreqMap::builtin();
        let count = m.0.len();
        assert!(count > 100_000, "expected >100k TNC entries, got {count}");
    }

    /// A handful of everyday words must be present with a non-zero count.
    #[test]
    fn builtin_common_words_have_nonzero_freq() {
        let m = FreqMap::builtin();
        for word in &["กิน", "ข้าว", "ไป", "มา", "คน", "ที่", "นี้"] {
            assert!(
                m.get(word) > 0,
                "expected '{word}' to have non-zero TNC freq"
            );
        }
    }

    /// A string that is not a word is absent from the embedded table.
    #[test]
    fn builtin_unknown_word_returns_zero() {
        let m = FreqMap::builtin();
        assert_eq!(m.get("กขคงจฉชซ"), 0);
    }

    /// A very common word must score strictly higher than a rare one.
    #[test]
    fn builtin_high_freq_words_outrank_rare_words() {
        let m = FreqMap::builtin();
        assert!(
            m.get("ที่") > m.get("มะม่วงหิมพานต์"),
            "expected 'ที่' to have higher TNC freq than 'มะม่วงหิมพานต์'"
        );
    }

    /// End-to-end check through the crate's tokenizer: of the two candidate
    /// splits named in the failure message, the one expected here is
    /// `ตา` + `กลม`.
    #[test]
    fn freq_breaks_tie_toward_common_segmentation() {
        use crate::Tokenizer;
        use alloc::vec::Vec;
        let tok = Tokenizer::new();
        let tokens = tok.segment("ตากลม");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        assert_eq!(
            words,
            alloc::vec!["ตา", "กลม"],
            "freq scoring should prefer 'ตา|กลม' over 'ตาก|ลม' — got {words:?}"
        );
    }
}