Skip to main content

kham_core/
freq.rs

//! Word frequency table built from the Thai National Corpus (TNC).
//!
//! The built-in data originates from `tnc_freq.txt` (CC0), which maps Thai
//! words to their raw occurrence counts in the TNC. It is embedded in the
//! binary as `tnc_freq.bin` and decoded when [`FreqMap::builtin`] is called.
//! Frequencies are used by the newmm DAG scorer to prefer more common
//! segmentations when multiple paths have the same number of dictionary
//! matches.
7
8use alloc::collections::BTreeMap;
9use alloc::string::String;
10
// Raw bytes of the built-in frequency table, embedded at compile time from
// `$OUT_DIR/tnc_freq.bin`. `FreqMap::builtin` hands this slice to
// `crate::decompress_builtin` and parses the result as `word\tcount` text.
// NOTE(review): presumably generated by the build script from `tnc_freq.txt` — confirm.
11static BUILTIN_FREQ_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/tnc_freq.bin"));
12
/// A word→frequency lookup table backed by the Thai National Corpus (TNC).
///
/// Frequencies are raw TNC occurrence counts, stored per word in a
/// [`BTreeMap`]. Words absent from the table are reported as 0 by
/// [`FreqMap::get`], which is a safe default (the DP scorer simply ignores
/// them).
///
/// The built-in table is loaded with [`FreqMap::builtin`]. Custom tables can
/// be constructed from any tab-separated `word\tcount` text via [`FreqMap::from_tsv`].
///
/// The type derives the common standard traits so it can be debug-printed,
/// cloned, compared, and default-constructed (the default is an empty table).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FreqMap(BTreeMap<String, u32>);
21
22impl FreqMap {
23    /// Parse a tab-separated `word\tcount` text (one entry per line).
24    pub fn from_tsv(data: &str) -> Self {
25        let mut map = BTreeMap::new();
26        for line in data.lines() {
27            if let Some((word, freq_str)) = line.split_once('\t') {
28                if let Ok(freq) = freq_str.trim().parse::<u32>() {
29                    map.insert(String::from(word), freq);
30                }
31            }
32        }
33        FreqMap(map)
34    }
35
36    /// Load the built-in TNC frequency table.
37    ///
38    /// Parses the embedded `tnc_freq.txt` (106k entries) into a [`BTreeMap`].
39    /// This is a pay-once startup cost; the returned [`FreqMap`] should be
40    /// reused across segmentation calls (e.g. stored in [`Tokenizer`]).
41    ///
42    /// [`Tokenizer`]: crate::Tokenizer
43    pub fn builtin() -> Self {
44        Self::from_tsv(&crate::decompress_builtin(BUILTIN_FREQ_DATA))
45    }
46
47    /// Look up a word's frequency; returns 0 if not found.
48    #[inline]
49    pub fn get(&self, word: &str) -> u32 {
50        self.0.get(word).copied().unwrap_or(0)
51    }
52
53    /// Return the maximum frequency value in the table, or 0 if the table is empty.
54    ///
55    /// Used by [`KeyExtractor`](crate::keyword::KeyExtractor) to compute the
56    /// IDF numerator for TF-IDF scoring.
57    ///
58    /// # Examples
59    ///
60    /// ```rust
61    /// use kham_core::freq::FreqMap;
62    ///
63    /// let m = FreqMap::from_tsv("กิน\t100\nข้าว\t500\nที่\t9999\n");
64    /// assert_eq!(m.max_freq(), 9999);
65    ///
66    /// let empty = FreqMap::from_tsv("");
67    /// assert_eq!(empty.max_freq(), 0);
68    /// ```
69    pub fn max_freq(&self) -> u32 {
70        self.0.values().copied().max().unwrap_or(0)
71    }
72}
73
74// ---------------------------------------------------------------------------
75// Tests
76// ---------------------------------------------------------------------------
77
#[cfg(test)]
mod tests {
    use super::*;

    // ── from_tsv parsing ──────────────────────────────────────────────────────

    /// Well-formed `word\tcount` rows are all retrievable afterwards.
    #[test]
    fn parses_tab_separated_entries() {
        let table = FreqMap::from_tsv("กิน\t1234\nข้าว\t5678\n");
        for (word, expected) in [("กิน", 1234), ("ข้าว", 5678)] {
            assert_eq!(table.get(word), expected);
        }
    }

    /// Empty lines around an entry do not disturb parsing.
    #[test]
    fn blank_lines_are_skipped() {
        let table = FreqMap::from_tsv("\n\nกิน\t10\n\n");
        assert_eq!(table.get("กิน"), 10);
    }

    /// A row with no tab contributes nothing; following rows still load.
    #[test]
    fn line_without_tab_is_skipped() {
        let table = FreqMap::from_tsv("noop\nกิน\t42\n");
        for (word, expected) in [("noop", 0), ("กิน", 42)] {
            assert_eq!(table.get(word), expected);
        }
    }

    /// A row whose count is not a number contributes nothing.
    #[test]
    fn non_numeric_count_is_skipped() {
        let table = FreqMap::from_tsv("กิน\tabc\nข้าว\t99\n");
        for (word, expected) in [("กิน", 0), ("ข้าว", 99)] {
            assert_eq!(table.get(word), expected);
        }
    }

    /// When a word is listed twice, the second count is the one kept.
    #[test]
    fn later_duplicate_overwrites_earlier() {
        let table = FreqMap::from_tsv("กิน\t10\nกิน\t99\n");
        assert_eq!(table.get("กิน"), 99);
    }

    /// Spaces around the count are ignored.
    #[test]
    fn whitespace_trimmed_from_count() {
        let table = FreqMap::from_tsv("กิน\t  42  \n");
        assert_eq!(table.get("กิน"), 42);
    }

    // ── get edge cases ────────────────────────────────────────────────────────

    /// Looking up a word that was never inserted yields 0.
    #[test]
    fn unknown_word_returns_zero() {
        let table = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(table.get("xyz"), 0);
    }

    /// Looking up the empty string yields 0 when no such key was inserted.
    #[test]
    fn empty_lookup_returns_zero() {
        let table = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(table.get(""), 0);
    }

    /// Empty input gives a table in which every lookup yields 0.
    #[test]
    fn empty_input_produces_empty_map() {
        let table = FreqMap::from_tsv("");
        assert_eq!(table.get("กิน"), 0);
    }

    // ── built-in data ─────────────────────────────────────────────────────────

    /// Constructing the built-in table must not panic.
    #[test]
    fn builtin_loads_without_panic() {
        drop(FreqMap::builtin());
    }

    /// The built-in table holds more than 100k entries.
    #[test]
    fn builtin_has_expected_entry_count() {
        let table = FreqMap::builtin();
        let count = table.0.len();
        assert!(count > 100_000, "expected >100k TNC entries, got {count}");
    }

    /// A handful of everyday words all carry a positive count.
    #[test]
    fn builtin_common_words_have_nonzero_freq() {
        let table = FreqMap::builtin();
        let everyday = ["กิน", "ข้าว", "ไป", "มา", "คน", "ที่", "นี้"];
        for word in everyday.iter().copied() {
            assert!(
                table.get(word) > 0,
                "expected '{word}' to have non-zero TNC freq"
            );
        }
    }

    /// A nonsense string is absent from the built-in table.
    #[test]
    fn builtin_unknown_word_returns_zero() {
        let table = FreqMap::builtin();
        assert_eq!(table.get("กขคงจฉชซ"), 0);
    }

    /// "ที่" (relativiser, extremely common) must score above "มะม่วงหิมพานต์".
    #[test]
    fn builtin_high_freq_words_outrank_rare_words() {
        let table = FreqMap::builtin();
        let (common, rare) = (table.get("ที่"), table.get("มะม่วงหิมพานต์"));
        assert!(
            common > rare,
            "expected 'ที่' to have higher TNC freq than 'มะม่วงหิมพานต์'"
        );
    }

    // ── freq influences segmentation (integration) ────────────────────────────

    /// "ตากลม" is in the dictionary as a compound word (1 token), so the
    /// fewer-tokens priority makes it win over ตา|กลม or ตาก|ลม (2 tokens
    /// each). This matches PyThaiNLP newmm behaviour.
    #[test]
    fn fewer_tokens_preferred_over_split_components() {
        use crate::Tokenizer;
        use alloc::vec::Vec;

        let segmenter = Tokenizer::new();
        let tokens = segmenter.segment("ตากลม");
        let words: Vec<&str> = tokens.iter().map(|token| token.text).collect();
        let expected = alloc::vec!["ตากลม"];
        assert_eq!(
            words, expected,
            "compound word should be preferred over split — got {words:?}"
        );
    }
}