kham_core/keyword.rs
1//! Thai keyword extraction using TF × inverse-corpus-frequency (TF-IDF proxy).
2//!
3//! [`KeyExtractor`] segments text with the built-in tokenizer, discards
4//! stopwords and single-character tokens, then ranks content words by how
5//! often they appear in the document relative to their frequency in the Thai
6//! National Corpus (TNC).
7//!
8//! The scoring formula uses only basic `f32` arithmetic (no transcendentals),
9//! keeping the module `no_std` compatible:
10//!
11//! ```text
12//! TF(t) = occurrences(t, doc) / total_content_tokens(doc)
13//! IDF_proxy(t) = (max_tnc_freq + 1) / (tnc_freq(t) + 1)
14//! score(t) = TF(t) × IDF_proxy(t)
15//! ```
16//!
17//! Words absent from TNC receive the maximum IDF weight — they are likely
18//! domain-specific and therefore the most distinctive keywords.
19//!
20//! ```rust
21//! use kham_core::keyword::KeyExtractor;
22//!
23//! let kex = KeyExtractor::builtin();
24//! let kws = kex.extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัล", 5);
25//! assert!(!kws.is_empty());
26//! // Results are always sorted by score descending
27//! for pair in kws.windows(2) {
28//! assert!(pair[0].score >= pair[1].score);
29//! }
30//! ```
31
32use alloc::collections::BTreeMap;
33use alloc::string::String;
34use alloc::vec::Vec;
35
36use crate::freq::FreqMap;
37use crate::segmenter::Tokenizer;
38use crate::stopwords::StopwordSet;
39use crate::token::TokenKind;
40
41// ---------------------------------------------------------------------------
42// Public types
43// ---------------------------------------------------------------------------
44
/// A single ranked keyword produced by [`KeyExtractor::extract`].
///
/// The `score` is the product `TF × IDF_proxy`, where
/// - **TF** is the word's occurrence count divided by the number of content
///   tokens in the document, and
/// - **IDF_proxy** is `(max_tnc_freq + 1) / (tnc_freq + 1)`, so a word that is
///   rare in the corpus is weighted above a common function word.
///
/// The extractor hands keywords back ordered by `score`, highest first.
#[derive(Debug, Clone, PartialEq)]
pub struct Keyword {
    /// Surface text of the keyword.
    pub word: String,
    /// `TF × IDF_proxy` relevance. Larger values mean the word is more
    /// distinctive for the document.
    pub score: f32,
    /// How many times the word occurred in the document (unnormalized).
    pub count: usize,
}
63
64/// Thai keyword extractor using TF × inverse-corpus-frequency scoring.
65///
66/// Backed by the built-in 62k-word tokenizer, the TNC frequency table
67/// (~106k entries), and the Thai stopword list (~1 029 entries).
68///
69/// Construction is O(n) in the TNC table size — reuse the returned instance
70/// rather than calling [`builtin()`](KeyExtractor::builtin) on every query.
71///
72/// # Filtering rules
73///
74/// A token is eligible as a keyword when **all** of the following hold:
75/// 1. Kind is `Thai`, `Latin`, `Number`, or `Named` (whitespace, punctuation,
76/// emoji, and unknown tokens are always skipped)
77/// 2. Character length ≥ 2 (single-char tokens are too coarse to be keywords)
78/// 3. Not in the built-in Thai stopword list
79///
80/// # Examples
81///
82/// ```rust
83/// use kham_core::keyword::KeyExtractor;
84///
85/// let kex = KeyExtractor::builtin();
86///
87/// // Rare domain-specific word outranks a common word
88/// // "ซอฟต์แวร์" (software) is rare in TNC and should appear as a top keyword
89/// let kws = kex.extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์ทุกวัน", 5);
90/// assert!(kws.iter().any(|k| k.word == "ซอฟต์แวร์"));
91/// ```
92pub struct KeyExtractor {
93 tokenizer: Tokenizer,
94 freq: FreqMap,
95 stops: StopwordSet,
96 max_corpus_freq: u32,
97}
98
99impl KeyExtractor {
100 /// Create a keyword extractor backed by the built-in tokenizer, TNC
101 /// frequency table, and Thai stopword list.
102 ///
103 /// # Examples
104 ///
105 /// ```rust
106 /// use kham_core::keyword::KeyExtractor;
107 ///
108 /// let kex = KeyExtractor::builtin();
109 /// assert!(!kex.extract("กินข้าวกับปลา", 5).is_empty());
110 /// ```
111 pub fn builtin() -> Self {
112 let freq = FreqMap::builtin();
113 let max_corpus_freq = freq.max_freq();
114 Self {
115 tokenizer: Tokenizer::new(),
116 freq,
117 stops: StopwordSet::builtin(),
118 max_corpus_freq,
119 }
120 }
121
122 /// Extract up to `max_n` keywords from `text`, ranked by TF-IDF score.
123 ///
124 /// Returns an empty `Vec` when `text` is empty, contains no eligible
125 /// content words, or `max_n` is zero.
126 ///
127 /// Ties in score are broken alphabetically so results are deterministic.
128 ///
129 /// # Examples
130 ///
131 /// ```rust
132 /// use kham_core::keyword::KeyExtractor;
133 ///
134 /// let kex = KeyExtractor::builtin();
135 ///
136 /// // Edge cases
137 /// assert!(kex.extract("", 5).is_empty());
138 /// assert!(kex.extract("กินข้าวกับปลา", 0).is_empty());
139 ///
140 /// // Score order is non-increasing
141 /// let kws = kex.extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนา", 10);
142 /// for pair in kws.windows(2) {
143 /// assert!(
144 /// pair[0].score >= pair[1].score,
145 /// "out-of-order: {:?} before {:?}", pair[0], pair[1]
146 /// );
147 /// }
148 /// ```
149 pub fn extract(&self, text: &str, max_n: usize) -> Vec<Keyword> {
150 if text.is_empty() || max_n == 0 {
151 return Vec::new();
152 }
153
154 let tokens = self.tokenizer.segment(text);
155
156 // Count all content tokens for the TF denominator.
157 // Count candidate tokens (non-stop, len ≥ 2) for keyword scoring.
158 let mut total_content: usize = 0;
159 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
160
161 for token in &tokens {
162 match token.kind {
163 TokenKind::Whitespace
164 | TokenKind::Punctuation
165 | TokenKind::Emoji
166 | TokenKind::Unknown => continue,
167 _ => {}
168 }
169
170 total_content += 1;
171
172 // Single-char tokens and stopwords are counted in the denominator
173 // but excluded from the keyword candidates.
174 if token.text.chars().count() < 2 || self.stops.contains(token.text) {
175 continue;
176 }
177
178 *counts.entry(String::from(token.text)).or_insert(0) += 1;
179 }
180
181 if total_content == 0 || counts.is_empty() {
182 return Vec::new();
183 }
184
185 let total_f = total_content as f32;
186 // IDF numerator: max corpus frequency + 1 (avoids div-by-zero for max entry).
187 let idf_num = self.max_corpus_freq as f32 + 1.0;
188
189 let mut results: Vec<Keyword> = counts
190 .into_iter()
191 .map(|(word, count)| {
192 let tf = count as f32 / total_f;
193 let corpus_freq = self.freq.get(&word);
194 let idf = idf_num / (corpus_freq as f32 + 1.0);
195 Keyword {
196 word,
197 score: tf * idf,
198 count,
199 }
200 })
201 .collect();
202
203 // Sort: score DESC, word ASC for deterministic ties
204 results.sort_unstable_by(|a, b| {
205 b.score
206 .partial_cmp(&a.score)
207 .unwrap_or(core::cmp::Ordering::Equal)
208 .then(a.word.cmp(&b.word))
209 });
210
211 results.truncate(max_n);
212 results
213 }
214}
215
216// ---------------------------------------------------------------------------
217// Tests
218// ---------------------------------------------------------------------------
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fresh extractor backed by the built-in resources.
    fn extractor() -> KeyExtractor {
        KeyExtractor::builtin()
    }

    // ── edge cases ──────────────────────────────────────────────────────────

    #[test]
    fn empty_text_returns_empty() {
        let kws = extractor().extract("", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn zero_max_n_returns_empty() {
        let kws = extractor().extract("กินข้าวกับปลา", 0);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_stopwords_returns_empty() {
        // The input is "และ" + "หรือ" + "ของ", all of which are stopwords.
        let kws = extractor().extract("และหรือของ", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_single_chars_returns_empty() {
        // Lone Thai characters fall below the two-character minimum.
        let kws = extractor().extract("ก ข ค ง", 5);
        assert!(kws.is_empty());
    }

    // ── result properties ────────────────────────────────────────────────────

    #[test]
    fn respects_max_n() {
        let limit = 3;
        let kws = extractor().extract(
            "การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา",
            limit,
        );
        assert!(kws.len() <= limit, "expected ≤ 3 results, got {}", kws.len());
    }

    #[test]
    fn results_sorted_by_score_descending() {
        let kws = extractor().extract(
            "การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนาซอฟต์แวร์",
            10,
        );
        // Walk adjacent pairs: each keyword must score at least as high as its successor.
        for (earlier, later) in kws.iter().zip(kws.iter().skip(1)) {
            assert!(
                earlier.score >= later.score,
                "sort order violated: {:?} before {:?}",
                earlier,
                later
            );
        }
    }

    #[test]
    fn count_reflects_occurrences() {
        // The input contains "ซอฟต์แวร์" three times.
        let kws = extractor().extract(
            "นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์ทุกวัน",
            10,
        );
        match kws.iter().find(|k| k.word == "ซอฟต์แวร์") {
            Some(hit) => assert_eq!(hit.count, 3, "expected count=3 for ซอฟต์แวร์"),
            None => panic!("expected ซอฟต์แวร์ in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn stopwords_not_in_results() {
        let kws = extractor().extract("กินข้าวกับปลาและดื่มน้ำ", 20);
        // Neither "กับ" nor "และ" may survive: both are stopwords.
        let leaked = kws.iter().any(|k| k.word == "กับ" || k.word == "และ");
        assert!(!leaked, "stopword found in results: {kws:?}");
    }

    #[test]
    fn all_scores_positive() {
        let kws = extractor().extract("การพัฒนาซอฟต์แวร์ต้องการทักษะและประสบการณ์", 10);
        // `!(score > 0.0)` rather than `score <= 0.0` so a NaN would also be caught.
        let offenders = kws.iter().filter(|k| !(k.score > 0.0)).count();
        assert!(offenders == 0, "expected all scores > 0; got: {kws:?}");
    }

    // ── IDF weighting ────────────────────────────────────────────────────────

    #[test]
    fn rare_word_outranks_common_word_with_same_count() {
        // Each word occurs once, so the corpus-rare word should win on IDF.
        // "ไดโนเสาร์" (dinosaur) is rare in TNC; "คน" (person) is very common.
        // The comparison only runs when both words are present in the output.
        let kws = extractor().extract("ไดโนเสาร์กินคน", 10);
        let score_of = |w: &str| kws.iter().find(|k| k.word == w).map(|k| k.score);
        if let (Some(rare), Some(common)) = (score_of("ไดโนเสาร์"), score_of("คน")) {
            assert!(
                rare > common,
                "expected ไดโนเสาร์ ({}) to outscore คน ({})",
                rare,
                common
            );
        }
    }

    #[test]
    fn repeated_word_scores_higher_than_single_occurrence() {
        // "ซอฟต์แวร์" ×3 vs "นักพัฒนา" ×1 — the higher TF is expected to win.
        // The comparison only runs when both words are present in the output.
        let kws = extractor().extract(
            "นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์",
            10,
        );
        let score_of = |w: &str| kws.iter().find(|k| k.word == w).map(|k| k.score);
        if let (Some(repeated), Some(single)) = (score_of("ซอฟต์แวร์"), score_of("นักพัฒนา")) {
            assert!(
                repeated > single,
                "expected ซอฟต์แวร์ (×3, score {}) > นักพัฒนา (×1, score {})",
                repeated,
                single
            );
        }
    }

    // ── mixed script ─────────────────────────────────────────────────────────

    #[test]
    fn latin_tokens_included_as_candidates() {
        let kws = extractor().extract("เขียน Python และใช้ Python ทุกวัน", 10);
        // "Python" is a Latin token that occurs twice, so it must be reported.
        match kws.iter().find(|k| k.word == "Python") {
            Some(hit) => assert_eq!(hit.count, 2),
            None => panic!("expected Python in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn punctuation_not_in_results() {
        let kws = extractor().extract("กินข้าว, ดื่มน้ำ. นอนหลับ!", 20);
        // No keyword may consist solely of ASCII punctuation.
        let has_punct_only = kws
            .iter()
            .any(|k| k.word.chars().all(|c| c.is_ascii_punctuation()));
        assert!(
            !has_punct_only,
            "punctuation token found in results: {kws:?}"
        );
    }
}