Skip to main content

kham_core/
keyword.rs

//! Thai keyword extraction using TF × inverse-corpus-frequency (TF-IDF proxy).
//!
//! [`KeyExtractor`] segments text with the built-in tokenizer, discards
//! stopwords and single-character tokens, then ranks content words by how
//! often they appear in the document relative to their frequency in the Thai
//! National Corpus (TNC).
//!
//! The scoring formula uses only basic `f32` arithmetic (no transcendentals),
//! keeping the module `no_std` compatible:
//!
//! ```text
//! TF(t)        = occurrences(t, doc) / total_content_tokens(doc)
//! IDF_proxy(t) = (max_tnc_freq + 1) / (tnc_freq(t) + 1)
//! score(t)     = TF(t) × IDF_proxy(t)
//! ```
//!
//! `total_content_tokens` counts every word-like token in the document,
//! including the stopwords and single-character tokens that are excluded
//! from the keyword candidates.
//!
//! Words absent from TNC receive the maximum IDF weight — they are likely
//! domain-specific and therefore the most distinctive keywords.
//!
//! ```rust
//! use kham_core::keyword::KeyExtractor;
//!
//! let kex = KeyExtractor::builtin();
//! let kws = kex.extract("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัล", 5);
//! assert!(!kws.is_empty());
//! // Results are always sorted by score descending
//! for pair in kws.windows(2) {
//!     assert!(pair[0].score >= pair[1].score);
//! }
//! ```
31
32use alloc::collections::BTreeMap;
33use alloc::string::String;
34use alloc::vec::Vec;
35
36use crate::freq::FreqMap;
37use crate::segmenter::Tokenizer;
38use crate::stopwords::StopwordSet;
39use crate::token::TokenKind;
40
41// ---------------------------------------------------------------------------
42// Public types
43// ---------------------------------------------------------------------------
44
/// One ranked keyword produced by [`KeyExtractor::extract`].
///
/// The `score` is the product of two factors:
/// - **TF** — the word's occurrence count divided by the number of content
///   tokens in the document.
/// - **IDF_proxy** — `(max_tnc_freq + 1) / (tnc_freq + 1)`, so words that are
///   rare in the corpus are weighted above common function words.
///
/// Extraction returns keywords ordered from highest to lowest `score`.
#[derive(Debug, Clone, PartialEq)]
pub struct Keyword {
    /// Text of the keyword as it appeared in the document.
    pub word: String,
    /// `TF × IDF_proxy`; larger values mark words more distinctive for the
    /// document.
    pub score: f32,
    /// Number of times the word occurred in the document.
    pub count: usize,
}
63
64/// Thai keyword extractor using TF × inverse-corpus-frequency scoring.
65///
66/// Backed by the built-in 62k-word tokenizer, the TNC frequency table
67/// (~106k entries), and the Thai stopword list (~1 029 entries).
68///
69/// Construction is O(n) in the TNC table size — reuse the returned instance
70/// rather than calling [`builtin()`](KeyExtractor::builtin) on every query.
71///
72/// # Filtering rules
73///
74/// A token is eligible as a keyword when **all** of the following hold:
75/// 1. Kind is `Thai`, `Latin`, `Number`, or `Named` (whitespace, punctuation,
76///    emoji, and unknown tokens are always skipped)
77/// 2. Character length ≥ 2 (single-char tokens are too coarse to be keywords)
78/// 3. Not in the built-in Thai stopword list
79///
80/// # Examples
81///
82/// ```rust
83/// use kham_core::keyword::KeyExtractor;
84///
85/// let kex = KeyExtractor::builtin();
86///
87/// // Rare domain-specific word outranks a common word
88/// // "ซอฟต์แวร์" (software) is rare in TNC and should appear as a top keyword
89/// let kws = kex.extract("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์ทุกวัน", 5);
90/// assert!(kws.iter().any(|k| k.word == "ซอฟต์แวร์"));
91/// ```
92pub struct KeyExtractor {
93    tokenizer: Tokenizer,
94    freq: FreqMap,
95    stops: StopwordSet,
96    max_corpus_freq: u32,
97}
98
99impl KeyExtractor {
100    /// Create a keyword extractor backed by the built-in tokenizer, TNC
101    /// frequency table, and Thai stopword list.
102    ///
103    /// # Examples
104    ///
105    /// ```rust
106    /// use kham_core::keyword::KeyExtractor;
107    ///
108    /// let kex = KeyExtractor::builtin();
109    /// assert!(!kex.extract("กินข้าวกับปลา", 5).is_empty());
110    /// ```
111    pub fn builtin() -> Self {
112        let freq = FreqMap::builtin();
113        let max_corpus_freq = freq.max_freq();
114        Self {
115            tokenizer: Tokenizer::new(),
116            freq,
117            stops: StopwordSet::builtin(),
118            max_corpus_freq,
119        }
120    }
121
122    /// Extract up to `max_n` keywords from `text`, ranked by TF-IDF score.
123    ///
124    /// Returns an empty `Vec` when `text` is empty, contains no eligible
125    /// content words, or `max_n` is zero.
126    ///
127    /// Ties in score are broken alphabetically so results are deterministic.
128    ///
129    /// # Examples
130    ///
131    /// ```rust
132    /// use kham_core::keyword::KeyExtractor;
133    ///
134    /// let kex = KeyExtractor::builtin();
135    ///
136    /// // Edge cases
137    /// assert!(kex.extract("", 5).is_empty());
138    /// assert!(kex.extract("กินข้าวกับปลา", 0).is_empty());
139    ///
140    /// // Score order is non-increasing
141    /// let kws = kex.extract("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนา", 10);
142    /// for pair in kws.windows(2) {
143    ///     assert!(
144    ///         pair[0].score >= pair[1].score,
145    ///         "out-of-order: {:?} before {:?}", pair[0], pair[1]
146    ///     );
147    /// }
148    /// ```
149    pub fn extract(&self, text: &str, max_n: usize) -> Vec<Keyword> {
150        if text.is_empty() || max_n == 0 {
151            return Vec::new();
152        }
153
154        let tokens = self.tokenizer.segment(text);
155
156        // Count all content tokens for the TF denominator.
157        // Count candidate tokens (non-stop, len ≥ 2) for keyword scoring.
158        let mut total_content: usize = 0;
159        let mut counts: BTreeMap<String, usize> = BTreeMap::new();
160
161        for token in &tokens {
162            match token.kind {
163                TokenKind::Whitespace
164                | TokenKind::Punctuation
165                | TokenKind::Emoji
166                | TokenKind::Unknown => continue,
167                _ => {}
168            }
169
170            total_content += 1;
171
172            // Single-char tokens and stopwords are counted in the denominator
173            // but excluded from the keyword candidates.
174            if token.text.chars().count() < 2 || self.stops.contains(token.text) {
175                continue;
176            }
177
178            *counts.entry(String::from(token.text)).or_insert(0) += 1;
179        }
180
181        if total_content == 0 || counts.is_empty() {
182            return Vec::new();
183        }
184
185        let total_f = total_content as f32;
186        // IDF numerator: max corpus frequency + 1 (avoids div-by-zero for max entry).
187        let idf_num = self.max_corpus_freq as f32 + 1.0;
188
189        let mut results: Vec<Keyword> = counts
190            .into_iter()
191            .map(|(word, count)| {
192                let tf = count as f32 / total_f;
193                let corpus_freq = self.freq.get(&word);
194                let idf = idf_num / (corpus_freq as f32 + 1.0);
195                Keyword {
196                    word,
197                    score: tf * idf,
198                    count,
199                }
200            })
201            .collect();
202
203        // Sort: score DESC, word ASC for deterministic ties
204        results.sort_unstable_by(|a, b| {
205            b.score
206                .partial_cmp(&a.score)
207                .unwrap_or(core::cmp::Ordering::Equal)
208                .then(a.word.cmp(&b.word))
209        });
210
211        results.truncate(max_n);
212        results
213    }
214}
215
216// ---------------------------------------------------------------------------
217// Tests
218// ---------------------------------------------------------------------------
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a freshly built extractor over `text`, keeping at most `max_n`
    /// keywords.
    fn keywords(text: &str, max_n: usize) -> Vec<Keyword> {
        KeyExtractor::builtin().extract(text, max_n)
    }

    /// Find the keyword whose text equals `word`, if it was extracted.
    fn lookup<'a>(kws: &'a [Keyword], word: &str) -> Option<&'a Keyword> {
        kws.iter().find(|k| k.word == word)
    }

    // ── edge cases ──────────────────────────────────────────────────────────

    #[test]
    fn empty_text_returns_empty() {
        let kws = keywords("", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn zero_max_n_returns_empty() {
        let kws = keywords("กินข้าวกับปลา", 0);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_stopwords_returns_empty() {
        // Input is made up solely of the stopwords "และ", "หรือ", "ของ".
        let kws = keywords("และหรือของ", 5);
        assert!(kws.is_empty());
    }

    #[test]
    fn only_single_chars_returns_empty() {
        // One-character Thai tokens fall under the minimum candidate length.
        let kws = keywords("ก ข ค ง", 5);
        assert!(kws.is_empty());
    }

    // ── result properties ────────────────────────────────────────────────────

    #[test]
    fn respects_max_n() {
        let got = keywords("การพัฒนาซอฟต์แวร์เป็นสิ่งสำคัญในยุคดิจิทัลสำหรับนักพัฒนา", 3).len();
        assert!(got <= 3, "expected ≤ 3 results, got {}", got);
    }

    #[test]
    fn results_sorted_by_score_descending() {
        let kws = keywords("การเรียนภาษาโปรแกรมมิ่งเป็นทักษะสำคัญสำหรับนักพัฒนาซอฟต์แวร์", 10);
        // Walk adjacent pairs: each entry must score at least as high as its
        // successor.
        for (earlier, later) in kws.iter().zip(kws.iter().skip(1)) {
            assert!(
                earlier.score >= later.score,
                "sort order violated: {:?} before {:?}",
                earlier,
                later
            );
        }
    }

    #[test]
    fn count_reflects_occurrences() {
        // The input contains "ซอฟต์แวร์" three times.
        let kws = keywords("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์ทุกวัน", 10);
        match lookup(&kws, "ซอฟต์แวร์") {
            Some(sw) => assert_eq!(sw.count, 3, "expected count=3 for ซอฟต์แวร์"),
            None => panic!("expected ซอฟต์แวร์ in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn stopwords_not_in_results() {
        let kws = keywords("กินข้าวกับปลาและดื่มน้ำ", 20);
        // Neither of the stopwords "กับ" / "และ" may be reported.
        let leaked = kws.iter().any(|k| k.word == "กับ" || k.word == "และ");
        assert!(!leaked, "stopword found in results: {kws:?}");
    }

    #[test]
    fn all_scores_positive() {
        let kws = keywords("การพัฒนาซอฟต์แวร์ต้องการทักษะและประสบการณ์", 10);
        for k in &kws {
            assert!(k.score > 0.0, "expected all scores > 0; got: {kws:?}");
        }
    }

    // ── IDF weighting ────────────────────────────────────────────────────────

    #[test]
    fn rare_word_outranks_common_word_with_same_count() {
        // One occurrence each: "ไดโนเสาร์" (dinosaur) is rare in TNC while
        // "คน" (person) is very common, so the rare word should score higher.
        // The comparison only runs when both words were extracted.
        let kws = keywords("ไดโนเสาร์กินคน", 10);
        if let Some((r, c)) = lookup(&kws, "ไดโนเสาร์").zip(lookup(&kws, "คน")) {
            assert!(
                r.score > c.score,
                "expected ไดโนเสาร์ ({}) to outscore คน ({})",
                r.score,
                c.score
            );
        }
    }

    #[test]
    fn repeated_word_scores_higher_than_single_occurrence() {
        // "ซอฟต์แวร์" ×3 against "นักพัฒนา" ×1 — same IDF, so TF decides.
        // The comparison only runs when both words were extracted.
        let kws = keywords("นักพัฒนาซอฟต์แวร์เขียนซอฟต์แวร์และทดสอบซอฟต์แวร์", 10);
        if let Some((s, d)) = lookup(&kws, "ซอฟต์แวร์").zip(lookup(&kws, "นักพัฒนา")) {
            assert!(
                s.score > d.score,
                "expected ซอฟต์แวร์ (×3, score {}) > นักพัฒนา (×1, score {})",
                s.score,
                d.score
            );
        }
    }

    // ── mixed script ─────────────────────────────────────────────────────────

    #[test]
    fn latin_tokens_included_as_candidates() {
        let kws = keywords("เขียน Python และใช้ Python ทุกวัน", 10);
        // The Latin token "Python" occurs twice and must be reported.
        match lookup(&kws, "Python") {
            Some(py) => assert_eq!(py.count, 2),
            None => panic!("expected Python in keywords; got: {kws:?}"),
        }
    }

    #[test]
    fn punctuation_not_in_results() {
        let kws = keywords("กินข้าว, ดื่มน้ำ. นอนหลับ!", 20);
        let has_punct_only = kws
            .iter()
            .any(|k| k.word.chars().all(|c| c.is_ascii_punctuation()));
        assert!(
            !has_punct_only,
            "punctuation token found in results: {kws:?}"
        );
    }
}