//! Full-text search pipeline for Thai text.
//!
//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
//! normalise → segment → tag stopwords → expand synonyms → attach position.
//!
//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
//! extension and by any other caller that needs FTS-ready lexemes.
//!
//! # Positions
//!
//! `position` is the ordinal index of the token in the non-whitespace token
//! sequence (0-based). Stopwords retain their position so that phrase-distance
//! scoring remains correct when stopwords are later omitted from the index.
//!
//! # Example
//!
//! ```rust
//! use kham_core::fts::FtsTokenizer;
//!
//! let fts = FtsTokenizer::new();
//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
//! for t in &tokens {
//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
//! }
//! ```

use alloc::string::String;
use alloc::vec::Vec;

use crate::ngram::char_ngrams;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::TokenKind;
use crate::Tokenizer;

/// A token produced by the FTS pipeline, ready for lexeme indexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position in the non-whitespace token sequence (0-based).
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Synonym expansions (empty if none configured or no match).
    pub synonyms: Vec<String>,
    /// Character n-grams (trigrams by default); populated only for
    /// [`TokenKind::Unknown`] tokens.
    pub trigrams: Vec<String>,
}

/// Builder for [`FtsTokenizer`].
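///
/// A minimal configuration sketch (assuming `StopwordSet` and `SynonymMap`
/// are importable at the paths the unit tests use; the word lists here are
/// illustrative only):
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
/// use kham_core::stopwords::StopwordSet;
/// use kham_core::synonym::SynonymMap;
///
/// let fts = FtsTokenizer::builder()
///     .stopwords(StopwordSet::from_text("กับ\n"))
///     .synonyms(SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\n"))
///     .ngram_size(2)
///     .build();
/// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
/// ```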
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,
    synonyms: Option<SynonymMap>,
    ngram_size: Option<usize>,
}

impl FtsTokenizerBuilder {
    /// Use a custom stopword set instead of the built-in list.
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

    /// Attach a synonym map for expansion.
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
    ///
    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
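    ///
    /// For example, disabling n-grams entirely (a sketch; with size 0 no
    /// token carries n-grams, whatever its kind):
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    ///
    /// let fts = FtsTokenizer::builder().ngram_size(0).build();
    /// assert!(fts.segment_for_fts("abc").iter().all(|t| t.trigrams.is_empty()));
    /// ```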
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

    /// Consume the builder and return a configured [`FtsTokenizer`].
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            ngram_size: self.ngram_size.unwrap_or(3),
        }
    }
}

/// Full-text search tokenizer for Thai text.
///
/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
/// generation for out-of-vocabulary tokens.
///
/// Construct once and reuse:
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
///
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct FtsTokenizer {
    tokenizer: Tokenizer,
    stopwords: StopwordSet,
    synonyms: SynonymMap,
    ngram_size: usize,
}

impl FtsTokenizer {
    /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
    pub fn new() -> Self {
        FtsTokenizerBuilder::default().build()
    }
    /// Return an [`FtsTokenizerBuilder`] for custom configuration.
    pub fn builder() -> FtsTokenizerBuilder {
        FtsTokenizerBuilder::default()
    }

    /// Segment `text` and annotate each token for FTS indexing.
    ///
    /// Normalises the input text before segmentation so that floating vowels
    /// (สระลอย) and stacked tone marks are handled correctly. Whitespace
    /// tokens are excluded.
    ///
    /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
    /// [`index_tokens`] instead when you only need the tokens to be indexed
    /// (stopwords excluded).
    ///
    /// [`index_tokens`]: FtsTokenizer::index_tokens
    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
        let normalized = self.tokenizer.normalize(text);
        let raw_tokens = self.tokenizer.segment(&normalized);

        let mut result = Vec::with_capacity(raw_tokens.len());
        let mut position = 0usize;

        for token in &raw_tokens {
            if token.kind == TokenKind::Whitespace {
                continue;
            }

            let is_stop = self.stopwords.contains(token.text);
            let synonyms = self
                .synonyms
                .expand(token.text)
                .map(|s| s.to_vec())
                .unwrap_or_default();
            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
                char_ngrams(token.text, self.ngram_size)
                    .map(String::from)
                    .collect()
            } else {
                Vec::new()
            };

            result.push(FtsToken {
                text: String::from(token.text),
                position,
                kind: token.kind,
                is_stop,
                synonyms,
                trigrams,
            });

            position += 1;
        }

        result
    }

    /// Return only the tokens to be written into a search index.
    ///
    /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
    /// its original `position` so phrase-distance scoring remains correct.
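    ///
    /// A short sketch (assuming "กับ" is in the built-in stopword list, as
    /// the unit tests expect):
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    ///
    /// let fts = FtsTokenizer::new();
    /// // Stopwords are dropped, but the survivors keep their positions,
    /// // leaving a gap where "กับ" used to be.
    /// let indexed = fts.index_tokens("กินข้าวกับปลา");
    /// assert!(indexed.iter().all(|t| !t.is_stop));
    /// ```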
    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
        self.segment_for_fts(text)
            .into_iter()
            .filter(|t| !t.is_stop)
            .collect()
    }

    /// Collect all lexeme strings to be stored in a `tsvector`.
    ///
    /// Returns one string per non-stop token, plus synonym expansions and
    /// trigrams for unknown tokens. Duplicates are not removed (the caller or
    /// PostgreSQL handles deduplication).
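    ///
    /// A usage sketch (the exact lexemes depend on the built-in dictionary
    /// and stopword list):
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    ///
    /// let fts = FtsTokenizer::new();
    /// // One flat list of strings, ready for the `tsvector` side.
    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
    /// assert!(!lexemes.is_empty());
    /// ```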
    pub fn lexemes(&self, text: &str) -> Vec<String> {
        let tokens = self.index_tokens(text);
        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
        for t in tokens {
            out.push(t.text);
            out.extend(t.synonyms);
            out.extend(t.trigrams);
        }
        out
    }
}

impl Default for FtsTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    // ── segment_for_fts ───────────────────────────────────────────────────────

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        // "กับ" is a common conjunction and should be in the built-in stopword list
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        // May be OOV but should not be a stopword
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        // All tokens joined == normalised input (whitespace dropped)
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    // ── synonym expansion ─────────────────────────────────────────────────────

    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        // "คอม" may or may not be in the dictionary; if the segmenter emits it
        // as a single token, the synonym expansion must be attached.
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

    // ── unknown token trigrams ────────────────────────────────────────────────

    #[test]
    fn unknown_token_gets_trigrams() {
        // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
        // With ngram_size=2 the token should yield one bigram ("กิ").
        // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
        // (like "กิ") are the shortest unit that can produce n-grams.
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

    // ── index_tokens ──────────────────────────────────────────────────────────

    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    #[test]
    fn index_tokens_preserves_positions() {
        // Positions in index_tokens must be a subset of segment_for_fts positions
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    // ── lexemes ───────────────────────────────────────────────────────────────

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        // "กับ" is a stopword — should not appear
        assert!(!lexemes.contains(&String::from("กับ")));
        // Content words should appear
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

    // ── builder ───────────────────────────────────────────────────────────────

    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        // Both paths should produce the same result for a simple input
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }
}