Skip to main content

kham_core/
keyword.rs

1//! Thai keyword extraction using TF × inverse-corpus-frequency (TF-IDF proxy).
2//!
3//! [`KeyExtractor`] segments text with the built-in tokenizer, discards
4//! stopwords and single-character tokens, then ranks content words by how
5//! often they appear in the document relative to their frequency in the Thai
6//! National Corpus (TNC).
7//!
8//! The scoring formula uses only basic `f32` arithmetic (no transcendentals),
9//! keeping the module `no_std` compatible:
10//!
11//! ```text
12//! TF(t)        = occurrences(t, doc) / total_content_tokens(doc)
13//! IDF_proxy(t) = (max_tnc_freq + 1) / (tnc_freq(t) + 1)
14//! score(t)     = TF(t) × IDF_proxy(t)
15//! ```
16//!
17//! Words absent from TNC receive the maximum IDF weight — they are likely
18//! domain-specific and therefore the most distinctive keywords.
19//!
20//! ```rust
21//! use kham_core::keyword::KeyExtractor;
22//!
23//! let kex = KeyExtractor::builtin();
24//! let kws = kex.extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัล", 5);
25//! assert!(!kws.is_empty());
26//! // Results are always sorted by score descending
27//! for pair in kws.windows(2) {
28//!     assert!(pair[0].score >= pair[1].score);
29//! }
30//! ```
31
32use alloc::collections::BTreeMap;
33use alloc::string::String;
34use alloc::vec::Vec;
35
36use crate::freq::FreqMap;
37use crate::segmenter::Tokenizer;
38use crate::stopwords::StopwordSet;
39use crate::token::TokenKind;
40
41// ---------------------------------------------------------------------------
42// Public types
43// ---------------------------------------------------------------------------
44
/// One ranked entry produced by [`KeyExtractor`].
///
/// The `score` is the product of two factors:
/// - **TF** — the entry's occurrence count divided by the number of tokens
///   considered in the document
/// - **IDF_proxy** — `(max_tnc_freq + 1) / (tnc_freq + 1)`, which grows as the
///   word gets rarer in the reference corpus (for phrases, the mean of this
///   value over the phrase's words)
///
/// Extractor methods return these entries ordered by `score`, highest first.
#[derive(Debug, Clone, PartialEq)]
pub struct Keyword {
    /// The keyword text. For phrase extraction this is the phrase's words
    /// joined by single spaces.
    pub word: String,
    /// Relevance score (TF × IDF_proxy); larger means more distinctive for
    /// this document.
    pub score: f32,
    /// How many times this word (or phrase) occurred in the document.
    pub count: usize,
}
63
64/// Thai keyword extractor using TF × inverse-corpus-frequency scoring.
65///
66/// Backed by the built-in 62k-word tokenizer, the TNC frequency table
67/// (~106k entries), and the Thai stopword list (~1 029 entries).
68///
69/// Construction is O(n) in the TNC table size — reuse the returned instance
70/// rather than calling [`builtin()`](KeyExtractor::builtin) on every query.
71///
72/// # Filtering rules
73///
74/// A token is eligible as a keyword when **all** of the following hold:
75/// 1. Kind is `Thai`, `Latin`, `Number`, or `Named` (whitespace, punctuation,
76///    emoji, and unknown tokens are always skipped)
77/// 2. Character length ≥ 2 (single-char tokens are too coarse to be keywords)
78/// 3. Not in the built-in Thai stopword list
79///
80/// # Examples
81///
82/// ```rust
83/// use kham_core::keyword::KeyExtractor;
84///
85/// let kex = KeyExtractor::builtin();
86///
87/// // Rare domain-specific word outranks a common word
88/// // "ซอฟต์แวร์" (software) is rare in TNC and should appear as a top keyword
89/// let kws = kex.extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์ทุกวัน", 5);
90/// assert!(kws.iter().any(|k| k.word == "ซอฟต์แวร์"));
91/// ```
92pub struct KeyExtractor {
93    tokenizer: Tokenizer,
94    freq: FreqMap,
95    stops: StopwordSet,
96    max_corpus_freq: u32,
97}
98
99impl KeyExtractor {
100    /// Create a keyword extractor backed by the built-in tokenizer, TNC
101    /// frequency table, and Thai stopword list.
102    ///
103    /// # Examples
104    ///
105    /// ```rust
106    /// use kham_core::keyword::KeyExtractor;
107    ///
108    /// let kex = KeyExtractor::builtin();
109    /// assert!(!kex.extract("กินข้าวกับปลา", 5).is_empty());
110    /// ```
111    pub fn builtin() -> Self {
112        let freq = FreqMap::builtin();
113        let max_corpus_freq = freq.max_freq();
114        Self {
115            tokenizer: Tokenizer::new(),
116            freq,
117            stops: StopwordSet::builtin(),
118            max_corpus_freq,
119        }
120    }
121
122    /// Extract up to `max_n` keywords from `text`, ranked by TF-IDF score.
123    ///
124    /// Returns an empty `Vec` when `text` is empty, contains no eligible
125    /// content words, or `max_n` is zero.
126    ///
127    /// Ties in score are broken alphabetically so results are deterministic.
128    ///
129    /// # Examples
130    ///
131    /// ```rust
132    /// use kham_core::keyword::KeyExtractor;
133    ///
134    /// let kex = KeyExtractor::builtin();
135    ///
136    /// // Edge cases
137    /// assert!(kex.extract("", 5).is_empty());
138    /// assert!(kex.extract("กินข้าวกับปลา", 0).is_empty());
139    ///
140    /// // Score order is non-increasing
141    /// let kws = kex.extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนา", 10);
142    /// for pair in kws.windows(2) {
143    ///     assert!(
144    ///         pair[0].score >= pair[1].score,
145    ///         "out-of-order: {:?} before {:?}", pair[0], pair[1]
146    ///     );
147    /// }
148    /// ```
149    pub fn extract(&self, text: &str, max_n: usize) -> Vec<Keyword> {
150        if text.is_empty() || max_n == 0 {
151            return Vec::new();
152        }
153
154        let tokens = self.tokenizer.segment(text);
155
156        // Count all content tokens for the TF denominator.
157        // Count candidate tokens (non-stop, len ≥ 2) for keyword scoring.
158        let mut total_content: usize = 0;
159        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
160
161        for token in &tokens {
162            match token.kind {
163                TokenKind::Whitespace
164                | TokenKind::Punctuation
165                | TokenKind::Emoji
166                | TokenKind::Unknown => continue,
167                _ => {}
168            }
169
170            total_content += 1;
171
172            // Single-char tokens and stopwords are counted in the denominator
173            // but excluded from the keyword candidates.
174            if token.text.chars().count() < 2 || self.stops.contains(token.text) {
175                continue;
176            }
177
178            *counts.entry(String::from(token.text)).or_insert(0) += 1;
179        }
180
181        if total_content == 0 || counts.is_empty() {
182            return Vec::new();
183        }
184
185        let total_f = total_content as f32;
186        // IDF numerator: max corpus frequency + 1 (avoids div-by-zero for max entry).
187        let idf_num = self.max_corpus_freq as f32 + 1.0;
188
189        let mut results: Vec<Keyword> = counts
190            .into_iter()
191            .map(|(word, count)| {
192                let tf = count as f32 / total_f;
193                let corpus_freq = self.freq.get(&word);
194                let idf = idf_num / (corpus_freq as f32 + 1.0);
195                Keyword {
196                    word,
197                    score: tf * idf,
198                    count,
199                }
200            })
201            .collect();
202
203        // Sort: score DESC, word ASC for deterministic ties
204        results.sort_unstable_by(|a, b| {
205            b.score
206                .partial_cmp(&a.score)
207                .unwrap_or(core::cmp::Ordering::Equal)
208                .then(a.word.cmp(&b.word))
209        });
210
211        results.truncate(max_n);
212        results
213    }
214
215    /// Extract up to `max_n` multi-word keyphrases (bigrams and trigrams) from
216    /// `text`, ranked by TF × average-IDF score.
217    ///
218    /// Phrases are formed from adjacent content tokens — tokens that pass the
219    /// same eligibility rules as [`extract`]: non-whitespace, non-punctuation,
220    /// non-emoji, non-unknown, character length ≥ 2, and not a stopword. A
221    /// bigram is two such consecutive tokens; a trigram is three.
222    ///
223    /// The IDF for a phrase is the average IDF of its constituent words.
224    ///
225    /// Returns an empty `Vec` when `text` has fewer than 2 eligible tokens or
226    /// `max_n` is zero.
227    ///
228    /// # Example
229    /// ```rust
230    /// use kham_core::keyword::KeyExtractor;
231    ///
232    /// let kex = KeyExtractor::builtin();
233    /// let phrases = kex.extract_phrases("นักพัฒนาซอฟต์แวร์เขียนโค้ดทุกวัน", 5);
234    /// // Each keyword word field contains a space-separated phrase
235    /// assert!(phrases.iter().all(|k| k.word.contains(' ')));
236    /// ```
237    pub fn extract_phrases(&self, text: &str, max_n: usize) -> Vec<Keyword> {
238        if text.is_empty() || max_n == 0 {
239            return Vec::new();
240        }
241
242        let tokens = self.tokenizer.segment(text);
243
244        // Collect eligible content token texts
245        let content: Vec<&str> = tokens
246            .iter()
247            .filter(|t| {
248                !matches!(
249                    t.kind,
250                    TokenKind::Whitespace
251                        | TokenKind::Punctuation
252                        | TokenKind::Emoji
253                        | TokenKind::Unknown
254                )
255            })
256            .filter(|t| t.text.chars().count() >= 2 && !self.stops.contains(t.text))
257            .map(|t| t.text)
258            .collect();
259
260        if content.len() < 2 {
261            return Vec::new();
262        }
263
264        let total_f = content.len() as f32;
265        let idf_num = self.max_corpus_freq as f32 + 1.0;
266
267        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
268
269        // Bigrams
270        for w in content.windows(2) {
271            let phrase = alloc::format!("{} {}", w[0], w[1]);
272            *counts.entry(phrase).or_insert(0) += 1;
273        }
274        // Trigrams
275        for w in content.windows(3) {
276            let phrase = alloc::format!("{} {} {}", w[0], w[1], w[2]);
277            *counts.entry(phrase).or_insert(0) += 1;
278        }
279
280        let mut results: Vec<Keyword> = counts
281            .into_iter()
282            .map(|(phrase, count)| {
283                let tf = count as f32 / total_f;
284                let parts: Vec<&str> = phrase.split(' ').collect();
285                let avg_idf = parts
286                    .iter()
287                    .map(|w| idf_num / (self.freq.get(w) as f32 + 1.0))
288                    .sum::<f32>()
289                    / parts.len() as f32;
290                Keyword {
291                    word: phrase,
292                    score: tf * avg_idf,
293                    count,
294                }
295            })
296            .collect();
297
298        results.sort_unstable_by(|a, b| {
299            b.score
300                .partial_cmp(&a.score)
301                .unwrap_or(core::cmp::Ordering::Equal)
302                .then(a.word.cmp(&b.word))
303        });
304        results.truncate(max_n);
305        results
306    }
307}
308
309// ---------------------------------------------------------------------------
310// Tests
311// ---------------------------------------------------------------------------
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fresh extractor from the built-in resources.
    fn kex() -> KeyExtractor {
        KeyExtractor::builtin()
    }

    /// Finds the result entry whose text equals `word`, if present.
    fn lookup<'a>(found: &'a [Keyword], word: &str) -> Option<&'a Keyword> {
        found.iter().find(|k| k.word == word)
    }

    // ── inputs that must yield nothing ──────────────────────────────────────

    #[test]
    fn empty_text_returns_empty() {
        let kws = kex().extract("", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn zero_max_n_returns_empty() {
        let kws = kex().extract("กินข้าวกับปลา", 0);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_stopwords_returns_empty() {
        // The input is made solely of the stopwords "และ", "หรือ", "ของ".
        let kws = kex().extract("และหรือของ", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_single_chars_returns_empty() {
        // One-character Thai tokens fall under the minimum length.
        let kws = kex().extract("ก ข ค ง", 5);
        assert!(kws.is_empty());
    }

    // ── shape of the returned list ──────────────────────────────────────────

    #[test]
    fn respects_max_n() {
        let extractor = kex();
        let kws = extractor.extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา", 3);
        let n = kws.len();
        assert!(n <= 3, "expected ≤ 3 results, got {}", n);
    }

    #[test]
    fn results_sorted_by_score_descending() {
        let extractor = kex();
        let kws = extractor.extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนาซอฟต์แวร์", 10);
        // Compare every entry with its successor.
        for (earlier, later) in kws.iter().zip(kws.iter().skip(1)) {
            assert!(
                earlier.score >= later.score,
                "sort order violated: {:?} before {:?}",
                earlier,
                later
            );
        }
    }

    #[test]
    fn count_reflects_occurrences() {
        // The input contains "ซอฟต์แวร์" three times.
        let kws = kex().extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์ทุกวัน", 10);
        match lookup(&kws, "ซอฟต์แวร์") {
            Some(sw) => assert_eq!(sw.count, 3, "expected count=3 for ซอฟต์แวร์"),
            None => panic!("expected ซอฟต์แวร์ in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn stopwords_not_in_results() {
        let kws = kex().extract("กินข้าวกับปลาและดื่มน้ำ", 20);
        // Neither of the stopwords "กับ" / "และ" may be reported.
        let leaked = kws.iter().any(|k| k.word == "กับ" || k.word == "และ");
        assert!(!leaked, "stopword found in results: {kws:?}");
    }

    #[test]
    fn all_scores_positive() {
        let kws = kex().extract("การพัฒนาซอฟต์แวร์ต้องการทักษะและประสบการณ์", 10);
        for k in &kws {
            assert!(k.score > 0.0, "expected all scores > 0; got: {kws:?}");
        }
    }

    // ── effect of the corpus-rarity weight ──────────────────────────────────

    #[test]
    fn rare_word_outranks_common_word_with_same_count() {
        // Each word occurs once, so the rarer corpus word should win:
        // "ไดโนเสาร์" (dinosaur) is rare in TNC; "คน" (person) is very common.
        // The comparison only runs when both words were returned.
        let kws = kex().extract("ไดโนเสาร์กินคน", 10);
        let pair = lookup(&kws, "ไดโนเสาร์").zip(lookup(&kws, "คน"));
        if let Some((r, c)) = pair {
            assert!(
                r.score > c.score,
                "expected ไดโนเสาร์ ({}) to outscore คน ({})",
                r.score,
                c.score
            );
        }
    }

    #[test]
    fn repeated_word_scores_higher_than_single_occurrence() {
        // "ซอฟต์แวร์" occurs three times, "นักพัฒนา" once; the comparison
        // only runs when both words were returned.
        let kws = kex().extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์", 10);
        let pair = lookup(&kws, "ซอฟต์แวร์").zip(lookup(&kws, "นักพัฒนา"));
        if let Some((s, d)) = pair {
            assert!(
                s.score > d.score,
                "expected ซอฟต์แวร์ (×3, score {}) > นักพัฒนา (×1, score {})",
                s.score,
                d.score
            );
        }
    }

    // ── non-Thai tokens ─────────────────────────────────────────────────────

    #[test]
    fn latin_tokens_included_as_candidates() {
        let kws = kex().extract("เขียน Python และใช้ Python ทุกวัน", 10);
        // The Latin token "Python" occurs twice and has to be reported.
        match lookup(&kws, "Python") {
            Some(py) => assert_eq!(py.count, 2),
            None => panic!("expected Python in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn punctuation_not_in_results() {
        let kws = kex().extract("กินข้าว, ดื่มน้ำ. นอนหลับ!", 20);
        let has_punct_only = kws
            .iter()
            .any(|k| k.word.chars().all(|c| c.is_ascii_punctuation()));
        assert!(
            !has_punct_only,
            "punctuation token found in results: {kws:?}"
        );
    }

    // ── extract_phrases ─────────────────────────────────────────────────────

    #[test]
    fn extract_phrases_empty_input() {
        let phrases = kex().extract_phrases("", 5);
        assert!(phrases.is_empty());
    }

    #[test]
    fn extract_phrases_contains_space() {
        let phrases = kex().extract_phrases("นักพัฒนาซอฟต์แวร์เขียนโค้ดทุกวัน", 5);
        let missing_space = phrases.iter().any(|k| !k.word.contains(' '));
        assert!(
            !missing_space,
            "all phrases should contain a space; got: {phrases:?}"
        );
    }

    #[test]
    fn extract_phrases_score_order() {
        let extractor = kex();
        let phrases =
            extractor.extract_phrases("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา", 10);
        for (earlier, later) in phrases.iter().zip(phrases.iter().skip(1)) {
            assert!(
                earlier.score >= later.score,
                "sort order violated: {:?} before {:?}",
                earlier,
                later
            );
        }
    }
}