Skip to main content

kham_core/
freq.rs

//! Word frequency table built from the Thai National Corpus (TNC).
//!
//! The built-in data comes from `tnc_freq.txt` (CC0), which maps Thai words to
//! their raw occurrence counts in the TNC. Frequencies are used by the newmm
//! DAG scorer to prefer more common segmentations when multiple paths have the
//! same number of dictionary matches.
7
8use alloc::collections::BTreeMap;
9use alloc::string::String;
10
11static BUILTIN_FREQ_DATA: &str = include_str!("../data/tnc_freq.txt");
12
/// A word→frequency lookup table backed by the Thai National Corpus (TNC).
///
/// Frequencies are raw TNC occurrence counts. Words absent from the table
/// return 0, which is a safe default (the DP scorer simply ignores them).
///
/// The built-in table is loaded with [`FreqMap::builtin`]. Custom tables can
/// be constructed from any tab-separated `word\tcount` text via [`FreqMap::from_tsv`].
///
/// The table owns its keys, so `Clone` copies every entry; share a reference
/// to an existing table instead of cloning it where possible. `Debug` prints
/// the complete word→count mapping.
#[derive(Debug, Clone)]
pub struct FreqMap(BTreeMap<String, u32>);
21
22impl FreqMap {
23    /// Parse a tab-separated `word\tcount` text (one entry per line).
24    pub fn from_tsv(data: &str) -> Self {
25        let mut map = BTreeMap::new();
26        for line in data.lines() {
27            if let Some((word, freq_str)) = line.split_once('\t') {
28                if let Ok(freq) = freq_str.trim().parse::<u32>() {
29                    map.insert(String::from(word), freq);
30                }
31            }
32        }
33        FreqMap(map)
34    }
35
36    /// Load the built-in TNC frequency table.
37    ///
38    /// Parses the embedded `tnc_freq.txt` (106k entries) into a [`BTreeMap`].
39    /// This is a pay-once startup cost; the returned [`FreqMap`] should be
40    /// reused across segmentation calls (e.g. stored in [`Tokenizer`]).
41    ///
42    /// [`Tokenizer`]: crate::Tokenizer
43    pub fn builtin() -> Self {
44        Self::from_tsv(BUILTIN_FREQ_DATA)
45    }
46
47    /// Look up a word's frequency; returns 0 if not found.
48    #[inline]
49    pub fn get(&self, word: &str) -> u32 {
50        self.0.get(word).copied().unwrap_or(0)
51    }
52}
53
54// ---------------------------------------------------------------------------
55// Tests
56// ---------------------------------------------------------------------------
57
#[cfg(test)]
mod tests {
    // Unit tests for `FreqMap`: parsing rules of `from_tsv`, lookup defaults
    // of `get`, sanity checks on the bundled TNC table, and one integration
    // test showing that frequencies steer the tokenizer.
    use super::*;

    // ── from_tsv parsing ──────────────────────────────────────────────────────

    #[test]
    fn parses_tab_separated_entries() {
        let m = FreqMap::from_tsv("กิน\t1234\nข้าว\t5678\n");
        assert_eq!(m.get("กิน"), 1234);
        assert_eq!(m.get("ข้าว"), 5678);
    }

    #[test]
    fn blank_lines_are_skipped() {
        let m = FreqMap::from_tsv("\n\nกิน\t10\n\n");
        assert_eq!(m.get("กิน"), 10);
    }

    #[test]
    fn line_without_tab_is_skipped() {
        let m = FreqMap::from_tsv("noop\nกิน\t42\n");
        assert_eq!(m.get("noop"), 0);
        assert_eq!(m.get("กิน"), 42);
    }

    #[test]
    fn non_numeric_count_is_skipped() {
        let m = FreqMap::from_tsv("กิน\tabc\nข้าว\t99\n");
        assert_eq!(m.get("กิน"), 0);
        assert_eq!(m.get("ข้าว"), 99);
    }

    #[test]
    fn later_duplicate_overwrites_earlier() {
        let m = FreqMap::from_tsv("กิน\t10\nกิน\t99\n");
        assert_eq!(m.get("กิน"), 99);
    }

    #[test]
    fn whitespace_trimmed_from_count() {
        let m = FreqMap::from_tsv("กิน\t  42  \n");
        assert_eq!(m.get("กิน"), 42);
    }

    #[test]
    fn empty_input_produces_empty_map() {
        let m = FreqMap::from_tsv("");
        // Check the table itself, not just one probe key, so a stray entry
        // under any other key would also fail the test.
        assert!(m.0.is_empty(), "empty input should yield no entries");
        assert_eq!(m.get("กิน"), 0);
    }

    // ── get edge cases ────────────────────────────────────────────────────────

    #[test]
    fn unknown_word_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get("xyz"), 0);
    }

    #[test]
    fn empty_lookup_returns_zero() {
        let m = FreqMap::from_tsv("กิน\t100\n");
        assert_eq!(m.get(""), 0);
    }

    // ── built-in data ─────────────────────────────────────────────────────────

    #[test]
    fn builtin_loads_without_panic() {
        let _ = FreqMap::builtin();
    }

    #[test]
    fn builtin_has_expected_entry_count() {
        let m = FreqMap::builtin();
        let count = m.0.len();
        assert!(count > 100_000, "expected >100k TNC entries, got {count}");
    }

    #[test]
    fn builtin_common_words_have_nonzero_freq() {
        let m = FreqMap::builtin();
        for word in &["กิน", "ข้าว", "ไป", "มา", "คน", "ที่", "นี้"]
        {
            assert!(
                m.get(word) > 0,
                "expected '{word}' to have non-zero TNC freq"
            );
        }
    }

    #[test]
    fn builtin_unknown_word_returns_zero() {
        let m = FreqMap::builtin();
        assert_eq!(m.get("กขคงจฉชซ"), 0);
    }

    #[test]
    fn builtin_high_freq_words_outrank_rare_words() {
        let m = FreqMap::builtin();
        // "ที่" (relativiser, extremely common) should rank above "มะม่วงหิมพานต์"
        assert!(
            m.get("ที่") > m.get("มะม่วงหิมพานต์"),
            "expected 'ที่' to have higher TNC freq than 'มะม่วงหิมพานต์'"
        );
    }

    // ── freq influences segmentation (integration) ────────────────────────────

    #[test]
    fn freq_breaks_tie_toward_common_segmentation() {
        use crate::Tokenizer;
        use alloc::vec::Vec;
        // "ตากลม" can be read as "ตา|กลม" (eye + round, both dict words)
        // or "ตาก|ลม" (to dry + wind, also both dict words).
        // TNC freq for "ตา" >> "ตาก", so freq scoring should prefer "ตา|กลม".
        let tok = Tokenizer::new();
        let tokens = tok.segment("ตากลม");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        assert_eq!(
            words,
            alloc::vec!["ตา", "กลม"],
            "freq scoring should prefer 'ตา|กลม' over 'ตาก|ลม' — got {words:?}"
        );
    }
}