kham_core/keyword.rs
1//! Thai keyword extraction using TF × inverse-corpus-frequency (TF-IDF proxy).
2//!
3//! [`KeyExtractor`] segments text with the built-in tokenizer, discards
4//! stopwords and single-character tokens, then ranks content words by how
5//! often they appear in the document relative to their frequency in the Thai
6//! National Corpus (TNC).
7//!
8//! The scoring formula uses only basic `f32` arithmetic (no transcendentals),
9//! keeping the module `no_std` compatible:
10//!
11//! ```text
12//! TF(t) = occurrences(t, doc) / total_content_tokens(doc)
13//! IDF_proxy(t) = (max_tnc_freq + 1) / (tnc_freq(t) + 1)
14//! score(t) = TF(t) × IDF_proxy(t)
15//! ```
16//!
17//! Words absent from TNC receive the maximum IDF weight — they are likely
18//! domain-specific and therefore the most distinctive keywords.
19//!
20//! ```rust
21//! use kham_core::keyword::KeyExtractor;
22//!
23//! let kex = KeyExtractor::builtin();
24//! let kws = kex.extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัล", 5);
25//! assert!(!kws.is_empty());
26//! // Results are always sorted by score descending
27//! for pair in kws.windows(2) {
28//! assert!(pair[0].score >= pair[1].score);
29//! }
30//! ```
31
32use alloc::collections::BTreeMap;
33use alloc::string::String;
34use alloc::vec::Vec;
35
36use crate::freq::FreqMap;
37use crate::segmenter::Tokenizer;
38use crate::stopwords::StopwordSet;
39use crate::token::TokenKind;
40
41// ---------------------------------------------------------------------------
42// Public types
43// ---------------------------------------------------------------------------
44
/// One ranked keyword (or keyphrase) produced by [`KeyExtractor`].
///
/// The `score` is the product `TF × IDF_proxy`, where:
/// - **TF** is the number of occurrences in the document divided by a
///   document-length normaliser
/// - **IDF_proxy** is `(max_tnc_freq + 1) / (tnc_freq + 1)`, so words that
///   are rare in the reference corpus weigh more than common function words
///
/// Extraction methods return these sorted by `score`, highest first.
#[derive(Debug, Clone, PartialEq)]
pub struct Keyword {
    /// Text of the keyword; for keyphrases, the words joined by single spaces.
    pub word: String,
    /// `TF × IDF_proxy` relevance score. Larger values mean the word is more
    /// distinctive for the document.
    pub score: f32,
    /// Number of times the word (or phrase) occurs in the document.
    pub count: usize,
}
63
64/// Thai keyword extractor using TF × inverse-corpus-frequency scoring.
65///
66/// Backed by the built-in 62k-word tokenizer, the TNC frequency table
67/// (~106k entries), and the Thai stopword list (~1 029 entries).
68///
69/// Construction is O(n) in the TNC table size — reuse the returned instance
70/// rather than calling [`builtin()`](KeyExtractor::builtin) on every query.
71///
72/// # Filtering rules
73///
74/// A token is eligible as a keyword when **all** of the following hold:
75/// 1. Kind is `Thai`, `Latin`, `Number`, or `Named` (whitespace, punctuation,
76/// emoji, and unknown tokens are always skipped)
77/// 2. Character length ≥ 2 (single-char tokens are too coarse to be keywords)
78/// 3. Not in the built-in Thai stopword list
79///
80/// # Examples
81///
82/// ```rust
83/// use kham_core::keyword::KeyExtractor;
84///
85/// let kex = KeyExtractor::builtin();
86///
87/// // Rare domain-specific word outranks a common word
88/// // "ซอฟต์แวร์" (software) is rare in TNC and should appear as a top keyword
89/// let kws = kex.extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์ทุกวัน", 5);
90/// assert!(kws.iter().any(|k| k.word == "ซอฟต์แวร์"));
91/// ```
92pub struct KeyExtractor {
93 tokenizer: Tokenizer,
94 freq: FreqMap,
95 stops: StopwordSet,
96 max_corpus_freq: u32,
97}
98
99impl KeyExtractor {
100 /// Create a keyword extractor backed by the built-in tokenizer, TNC
101 /// frequency table, and Thai stopword list.
102 ///
103 /// # Examples
104 ///
105 /// ```rust
106 /// use kham_core::keyword::KeyExtractor;
107 ///
108 /// let kex = KeyExtractor::builtin();
109 /// assert!(!kex.extract("กินข้าวกับปลา", 5).is_empty());
110 /// ```
111 pub fn builtin() -> Self {
112 let freq = FreqMap::builtin();
113 let max_corpus_freq = freq.max_freq();
114 Self {
115 tokenizer: Tokenizer::new(),
116 freq,
117 stops: StopwordSet::builtin(),
118 max_corpus_freq,
119 }
120 }
121
122 /// Extract up to `max_n` keywords from `text`, ranked by TF-IDF score.
123 ///
124 /// Returns an empty `Vec` when `text` is empty, contains no eligible
125 /// content words, or `max_n` is zero.
126 ///
127 /// Ties in score are broken alphabetically so results are deterministic.
128 ///
129 /// # Examples
130 ///
131 /// ```rust
132 /// use kham_core::keyword::KeyExtractor;
133 ///
134 /// let kex = KeyExtractor::builtin();
135 ///
136 /// // Edge cases
137 /// assert!(kex.extract("", 5).is_empty());
138 /// assert!(kex.extract("กินข้าวกับปลา", 0).is_empty());
139 ///
140 /// // Score order is non-increasing
141 /// let kws = kex.extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนา", 10);
142 /// for pair in kws.windows(2) {
143 /// assert!(
144 /// pair[0].score >= pair[1].score,
145 /// "out-of-order: {:?} before {:?}", pair[0], pair[1]
146 /// );
147 /// }
148 /// ```
149 pub fn extract(&self, text: &str, max_n: usize) -> Vec<Keyword> {
150 if text.is_empty() || max_n == 0 {
151 return Vec::new();
152 }
153
154 let tokens = self.tokenizer.segment(text);
155
156 // Count all content tokens for the TF denominator.
157 // Count candidate tokens (non-stop, len ≥ 2) for keyword scoring.
158 let mut total_content: usize = 0;
159 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
160
161 for token in &tokens {
162 match token.kind {
163 TokenKind::Whitespace
164 | TokenKind::Punctuation
165 | TokenKind::Emoji
166 | TokenKind::Unknown => continue,
167 _ => {}
168 }
169
170 total_content += 1;
171
172 // Single-char tokens and stopwords are counted in the denominator
173 // but excluded from the keyword candidates.
174 if token.text.chars().count() < 2 || self.stops.contains(token.text) {
175 continue;
176 }
177
178 *counts.entry(String::from(token.text)).or_insert(0) += 1;
179 }
180
181 if total_content == 0 || counts.is_empty() {
182 return Vec::new();
183 }
184
185 let total_f = total_content as f32;
186 // IDF numerator: max corpus frequency + 1 (avoids div-by-zero for max entry).
187 let idf_num = self.max_corpus_freq as f32 + 1.0;
188
189 let mut results: Vec<Keyword> = counts
190 .into_iter()
191 .map(|(word, count)| {
192 let tf = count as f32 / total_f;
193 let corpus_freq = self.freq.get(&word);
194 let idf = idf_num / (corpus_freq as f32 + 1.0);
195 Keyword {
196 word,
197 score: tf * idf,
198 count,
199 }
200 })
201 .collect();
202
203 // Sort: score DESC, word ASC for deterministic ties
204 results.sort_unstable_by(|a, b| {
205 b.score
206 .partial_cmp(&a.score)
207 .unwrap_or(core::cmp::Ordering::Equal)
208 .then(a.word.cmp(&b.word))
209 });
210
211 results.truncate(max_n);
212 results
213 }
214
215 /// Extract up to `max_n` multi-word keyphrases (bigrams and trigrams) from
216 /// `text`, ranked by TF × average-IDF score.
217 ///
218 /// Phrases are formed from adjacent content tokens — tokens that pass the
219 /// same eligibility rules as [`extract`]: non-whitespace, non-punctuation,
220 /// non-emoji, non-unknown, character length ≥ 2, and not a stopword. A
221 /// bigram is two such consecutive tokens; a trigram is three.
222 ///
223 /// The IDF for a phrase is the average IDF of its constituent words.
224 ///
225 /// Returns an empty `Vec` when `text` has fewer than 2 eligible tokens or
226 /// `max_n` is zero.
227 ///
228 /// # Example
229 /// ```rust
230 /// use kham_core::keyword::KeyExtractor;
231 ///
232 /// let kex = KeyExtractor::builtin();
233 /// let phrases = kex.extract_phrases("นักพัฒนาซอฟต์แวร์เขียนโค้ดทุกวัน", 5);
234 /// // Each keyword word field contains a space-separated phrase
235 /// assert!(phrases.iter().all(|k| k.word.contains(' ')));
236 /// ```
237 pub fn extract_phrases(&self, text: &str, max_n: usize) -> Vec<Keyword> {
238 if text.is_empty() || max_n == 0 {
239 return Vec::new();
240 }
241
242 let tokens = self.tokenizer.segment(text);
243
244 // Collect eligible content token texts
245 let content: Vec<&str> = tokens
246 .iter()
247 .filter(|t| {
248 !matches!(
249 t.kind,
250 TokenKind::Whitespace
251 | TokenKind::Punctuation
252 | TokenKind::Emoji
253 | TokenKind::Unknown
254 )
255 })
256 .filter(|t| t.text.chars().count() >= 2 && !self.stops.contains(t.text))
257 .map(|t| t.text)
258 .collect();
259
260 if content.len() < 2 {
261 return Vec::new();
262 }
263
264 let total_f = content.len() as f32;
265 let idf_num = self.max_corpus_freq as f32 + 1.0;
266
267 let mut counts: BTreeMap<String, usize> = BTreeMap::new();
268
269 // Bigrams
270 for w in content.windows(2) {
271 let phrase = alloc::format!("{} {}", w[0], w[1]);
272 *counts.entry(phrase).or_insert(0) += 1;
273 }
274 // Trigrams
275 for w in content.windows(3) {
276 let phrase = alloc::format!("{} {} {}", w[0], w[1], w[2]);
277 *counts.entry(phrase).or_insert(0) += 1;
278 }
279
280 let mut results: Vec<Keyword> = counts
281 .into_iter()
282 .map(|(phrase, count)| {
283 let tf = count as f32 / total_f;
284 let parts: Vec<&str> = phrase.split(' ').collect();
285 let avg_idf = parts
286 .iter()
287 .map(|w| idf_num / (self.freq.get(w) as f32 + 1.0))
288 .sum::<f32>()
289 / parts.len() as f32;
290 Keyword {
291 word: phrase,
292 score: tf * avg_idf,
293 count,
294 }
295 })
296 .collect();
297
298 results.sort_unstable_by(|a, b| {
299 b.score
300 .partial_cmp(&a.score)
301 .unwrap_or(core::cmp::Ordering::Equal)
302 .then(a.word.cmp(&b.word))
303 });
304 results.truncate(max_n);
305 results
306 }
307}
308
309// ---------------------------------------------------------------------------
310// Tests
311// ---------------------------------------------------------------------------
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh extractor backed by the built-in resources.
    fn extractor() -> KeyExtractor {
        KeyExtractor::builtin()
    }

    /// Look up a keyword by its exact text.
    fn find<'a>(kws: &'a [Keyword], word: &str) -> Option<&'a Keyword> {
        kws.iter().find(|k| k.word == word)
    }

    /// Assert that every keyword scores at least as high as its successor.
    fn assert_descending(kws: &[Keyword]) {
        for (prev, next) in kws.iter().zip(kws.iter().skip(1)) {
            assert!(
                prev.score >= next.score,
                "sort order violated: {:?} before {:?}",
                prev,
                next
            );
        }
    }

    // ── edge cases ──────────────────────────────────────────────────────────

    #[test]
    fn empty_text_returns_empty() {
        let kws = extractor().extract("", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn zero_max_n_returns_empty() {
        let kws = extractor().extract("กินข้าวกับปลา", 0);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_stopwords_returns_empty() {
        // The input is made up of "และ", "หรือ" and "ของ", all stopwords.
        let kws = extractor().extract("และหรือของ", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_single_chars_returns_empty() {
        // Lone Thai characters fall under the two-character minimum.
        let kws = extractor().extract("ก ข ค ง", 5);
        assert!(kws.is_empty());
    }

    // ── result properties ────────────────────────────────────────────────────

    #[test]
    fn respects_max_n() {
        let kws = extractor().extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา", 3);
        assert!(kws.len() <= 3, "expected ≤ 3 results, got {}", kws.len());
    }

    #[test]
    fn results_sorted_by_score_descending() {
        let kws = extractor().extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนาซอฟต์แวร์", 10);
        assert_descending(&kws);
    }

    #[test]
    fn count_reflects_occurrences() {
        // The input contains "ซอฟต์แวร์" three times.
        let kws = extractor().extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์ทุกวัน", 10);
        let sw = find(&kws, "ซอฟต์แวร์")
            .unwrap_or_else(|| panic!("expected ซอฟต์แวร์ in keywords; got: {kws:?}"));
        assert_eq!(sw.count, 3, "expected count=3 for ซอฟต์แวร์");
    }

    #[test]
    fn stopwords_not_in_results() {
        let kws = extractor().extract("กินข้าวกับปลาและดื่มน้ำ", 20);
        // Neither stopword "กับ" nor "และ" may be returned.
        let leaked = kws.iter().any(|k| k.word == "กับ" || k.word == "และ");
        assert!(!leaked, "stopword found in results: {kws:?}");
    }

    #[test]
    fn all_scores_positive() {
        let kws = extractor().extract("การพัฒนาซอฟต์แวร์ต้องการทักษะและประสบการณ์", 10);
        for k in &kws {
            assert!(k.score > 0.0, "expected all scores > 0; got: {kws:?}");
        }
    }

    // ── IDF weighting ────────────────────────────────────────────────────────

    #[test]
    fn rare_word_outranks_common_word_with_same_count() {
        // One occurrence each: "ไดโนเสาร์" (dinosaur) is rare in TNC while
        // "คน" (person) is very common, so the rare word should win.
        // The comparison only runs when both words are present in the output.
        let kws = extractor().extract("ไดโนเสาร์กินคน", 10);
        if let Some((r, c)) = find(&kws, "ไดโนเสาร์").zip(find(&kws, "คน")) {
            assert!(
                r.score > c.score,
                "expected ไดโนเสาร์ ({}) to outscore คน ({})",
                r.score,
                c.score
            );
        }
    }

    #[test]
    fn repeated_word_scores_higher_than_single_occurrence() {
        // "ซอฟต์แวร์" ×3 against "นักพัฒนา" ×1 — same IDF, so TF decides.
        // The comparison only runs when both words are present in the output.
        let kws = extractor().extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์", 10);
        if let Some((s, d)) = find(&kws, "ซอฟต์แวร์").zip(find(&kws, "นักพัฒนา")) {
            assert!(
                s.score > d.score,
                "expected ซอฟต์แวร์ (×3, score {}) > นักพัฒนา (×1, score {})",
                s.score,
                d.score
            );
        }
    }

    // ── mixed script ─────────────────────────────────────────────────────────

    #[test]
    fn latin_tokens_included_as_candidates() {
        let kws = extractor().extract("เขียน Python และใช้ Python ทุกวัน", 10);
        // "Python" is a Latin token occurring twice, so it must be returned.
        let py = find(&kws, "Python")
            .unwrap_or_else(|| panic!("expected Python in keywords; got: {kws:?}"));
        assert_eq!(py.count, 2);
    }

    #[test]
    fn punctuation_not_in_results() {
        let kws = extractor().extract("กินข้าว, ดื่มน้ำ. นอนหลับ!", 20);
        let has_punct_only = kws
            .iter()
            .any(|k| k.word.chars().all(|c| c.is_ascii_punctuation()));
        assert!(
            !has_punct_only,
            "punctuation token found in results: {kws:?}"
        );
    }

    // extract_phrases tests ----------------------------------------------------

    #[test]
    fn extract_phrases_empty_input() {
        let phrases = extractor().extract_phrases("", 5);
        assert!(phrases.is_empty());
    }

    #[test]
    fn extract_phrases_contains_space() {
        let phrases = extractor().extract_phrases("นักพัฒนาซอฟต์แวร์เขียนโค้ดทุกวัน", 5);
        let missing_space = phrases.iter().any(|k| !k.word.contains(' '));
        assert!(
            !missing_space,
            "all phrases should contain a space; got: {phrases:?}"
        );
    }

    #[test]
    fn extract_phrases_score_order() {
        let phrases = extractor().extract_phrases("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา", 10);
        assert_descending(&phrases);
    }
}