// kham_core/fts.rs
//! Full-text search pipeline for Thai text.
//!
//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
//! normalise → segment → tag stopwords → expand synonyms → attach position.
//!
//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
//! extension and by any other caller that needs FTS-ready lexemes.
//!
//! # Positions
//!
//! `position` is the ordinal index of the token in the non-whitespace token
//! sequence (0-based). Stopwords retain their position so that phrase-distance
//! scoring remains correct when stopwords are later omitted from the index.
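//!
//! For instance (assuming "กับ" is in the built-in stopword list, as the
//! unit tests in this module expect):
//!
//! ```rust
//! use kham_core::fts::FtsTokenizer;
//!
//! let fts = FtsTokenizer::new();
//! // index_tokens drops the stopword "กับ" but the surviving tokens keep
//! // their original positions, leaving a gap where the stopword stood.
//! let indexed = fts.index_tokens("กินข้าวกับปลา");
//! let positions: Vec<usize> = indexed.iter().map(|t| t.position).collect();
//! assert!(positions.windows(2).all(|w| w[0] < w[1]));
//! ```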
//!
//! # Example
//!
//! ```rust
//! use kham_core::fts::FtsTokenizer;
//!
//! let fts = FtsTokenizer::new();
//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
//! for t in &tokens {
//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
//! }
//! ```

use alloc::string::String;
use alloc::vec::Vec;

use crate::ne::NeTagger;
use crate::ngram::char_ngrams;
use crate::pos::{PosTag, PosTagger};
use crate::romanizer::RomanizationMap;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::{NamedEntityKind, TokenKind};
use crate::Tokenizer;

/// A token produced by the FTS pipeline, ready for lexeme indexing.
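///
/// For the input `"กินข้าวกับปลา"`, for example, the pipeline would emit a
/// token such as `text: "กับ"` with `is_stop: true` and its ordinal
/// `position` (exact boundaries depend on the segmenter dictionary, which is
/// why the unit tests below only assert that the "กับ" token is present).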
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position in the non-whitespace token sequence (0-based).
    /// Stopwords keep their position even though they may be dropped later.
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Synonym expansions (empty if none configured or no match).
    pub synonyms: Vec<String>,
    /// Character n-grams (trigrams by default) — populated only for
    /// [`TokenKind::Unknown`] tokens.
    pub trigrams: Vec<String>,
    /// Primary part-of-speech tag from the lookup table, or `None` if the word
    /// is not in the table (OOV) or is not a Thai token.
    pub pos: Option<PosTag>,
    /// Named entity category, or `None` if the token is not in the NE
    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
    pub ne: Option<NamedEntityKind>,
}

/// Builder for [`FtsTokenizer`].
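///
/// # Example
///
/// A typical configuration, using the same constructors the unit tests
/// below rely on (`StopwordSet::from_text`, `SynonymMap::from_tsv`):
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
/// use kham_core::stopwords::StopwordSet;
/// use kham_core::synonym::SynonymMap;
///
/// let fts = FtsTokenizer::builder()
///     .stopwords(StopwordSet::from_text("กับ\n"))
///     .synonyms(SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\n"))
///     .ngram_size(2)
///     .build();
/// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
/// ```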
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,
    synonyms: Option<SynonymMap>,
    ngram_size: Option<usize>,
    pos_tagger: Option<PosTagger>,
    ne_tagger: Option<NeTagger>,
    romanization: Option<RomanizationMap>,
}

impl FtsTokenizerBuilder {
    /// Use a custom stopword set instead of the built-in list.
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

    /// Attach a synonym map for expansion.
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
    ///
    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
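    ///
    /// Assuming `char_ngrams` is a sliding character window (the behaviour
    /// the unit tests below rely on), a 4-character Unknown token with
    /// `n = 3` yields two trigrams, and a 2-character token with `n = 2`
    /// yields exactly one bigram.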
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

    /// Use a custom POS tagger instead of the built-in table.
    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
        self.pos_tagger = Some(t);
        self
    }

    /// Use a custom NE gazetteer instead of the built-in table.
    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
        self.ne_tagger = Some(t);
        self
    }

    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
    ///
    /// When set, each Thai and Named token whose text is found in the map gets its
    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
    ///
    /// Disabled by default — call this method to opt in.
    pub fn romanization(mut self, m: RomanizationMap) -> Self {
        self.romanization = Some(m);
        self
    }

    /// Consume the builder and return a configured [`FtsTokenizer`].
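    ///
    /// Unset options fall back to their defaults: the built-in stopword
    /// list, an empty synonym map, trigram size 3, the built-in POS and NE
    /// tables, and no romanization.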
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            ngram_size: self.ngram_size.unwrap_or(3),
            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
            romanization: self.romanization,
        }
    }
}

/// Full-text search tokenizer for Thai text.
///
/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
/// generation for out-of-vocabulary tokens.
///
/// Construct once and reuse:
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
///
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct FtsTokenizer {
    tokenizer: Tokenizer,
    stopwords: StopwordSet,
    synonyms: SynonymMap,
    ngram_size: usize,
    pos_tagger: PosTagger,
    ne_tagger: NeTagger,
    romanization: Option<RomanizationMap>,
}

impl FtsTokenizer {
    /// Create an [`FtsTokenizer`] with the default configuration: built-in
    /// stopwords, built-in POS and NE tables, and no synonyms or romanization.
    pub fn new() -> Self {
        FtsTokenizerBuilder::default().build()
    }

    /// Return a [`FtsTokenizerBuilder`] for custom configuration.
    pub fn builder() -> FtsTokenizerBuilder {
        FtsTokenizerBuilder::default()
    }

    /// Segment `text` and annotate each token for FTS indexing.
    ///
    /// Normalises the input before segmentation so that สระลอย (floating
    /// vowels) and stacked tone marks are handled correctly. Whitespace
    /// tokens are excluded.
    ///
    /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
    /// [`index_tokens`] instead when you only need the tokens that will be
    /// indexed (stopwords excluded).
    ///
    /// [`index_tokens`]: FtsTokenizer::index_tokens
    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
        let normalized = self.tokenizer.normalize(text);
        // NE tagging runs over the raw segmentation and may merge adjacent
        // tokens (e.g. กรุง + เทพ → กรุงเทพ) into a single Named token.
        let raw_tokens = self
            .ne_tagger
            .tag_tokens(self.tokenizer.segment(&normalized), &normalized);

        let mut result = Vec::with_capacity(raw_tokens.len());
        let mut position = 0usize;

        for token in &raw_tokens {
            if token.kind == TokenKind::Whitespace {
                continue;
            }

            let is_stop = self.stopwords.contains(token.text);
            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));

            // Synonym expansion, plus the RTGS romanization (when configured)
            // as an extra pseudo-synonym for Thai and Named tokens.
            let mut synonyms = self
                .synonyms
                .expand(token.text)
                .map(|s| s.to_vec())
                .unwrap_or_default();
            if is_thai_or_named {
                if let Some(ref rom) = self.romanization {
                    if let Some(rtgs) = rom.romanize(token.text) {
                        synonyms.push(String::from(rtgs));
                    }
                }
            }

            // Out-of-vocabulary (Unknown) tokens fall back to character
            // n-grams so they can still be matched approximately.
            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
                char_ngrams(token.text, self.ngram_size)
                    .map(String::from)
                    .collect()
            } else {
                Vec::new()
            };

            let ne = if let TokenKind::Named(k) = token.kind {
                Some(k)
            } else {
                None
            };
            // POS lookup applies only to plain Thai tokens.
            let pos = if token.kind == TokenKind::Thai {
                self.pos_tagger.tag(token.text)
            } else {
                None
            };

            result.push(FtsToken {
                text: String::from(token.text),
                position,
                kind: token.kind,
                is_stop,
                synonyms,
                trigrams,
                pos,
                ne,
            });

            position += 1;
        }

        result
    }

    /// Return only the tokens to be written into a search index.
    ///
    /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
    /// its original `position` so phrase-distance scoring remains correct.
    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
        self.segment_for_fts(text)
            .into_iter()
            .filter(|t| !t.is_stop)
            .collect()
    }

    /// Collect all lexeme strings to be stored in a `tsvector`.
    ///
    /// Returns one string per non-stop token, plus synonym expansions and
    /// trigrams for unknown tokens. Duplicates are not removed (the caller or
    /// PostgreSQL handles deduplication).
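    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    ///
    /// let fts = FtsTokenizer::new();
    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
    /// // Stopwords such as "กับ" never reach the lexeme list.
    /// assert!(!lexemes.contains(&String::from("กับ")));
    /// ```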
    pub fn lexemes(&self, text: &str) -> Vec<String> {
        let tokens = self.index_tokens(text);
        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
        for t in tokens {
            out.push(t.text);
            out.extend(t.synonyms);
            out.extend(t.trigrams);
        }
        out
    }
}

impl Default for FtsTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    // ── segment_for_fts ───────────────────────────────────────────────────────

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        // "กับ" is a common conjunction and should be in the built-in stopword list
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        // May be OOV but should not be a stopword
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        // All tokens joined == normalised input (whitespace dropped)
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    // ── synonym expansion ─────────────────────────────────────────────────────

    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        // "คอม" may segment differently if it is not in the dictionary, so
        // only assert the expansion when the token comes out as "คอม".
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

    // ── unknown token trigrams ────────────────────────────────────────────────

    #[test]
    fn unknown_token_gets_trigrams() {
        // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
        // With ngram_size=2 the token should yield one bigram ("กิ").
        // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
        // (like "กิ") are the shortest unit that can produce n-grams.
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

    // ── index_tokens ──────────────────────────────────────────────────────────

    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    #[test]
    fn index_tokens_preserves_positions() {
        // Positions in index_tokens must be a subset of segment_for_fts positions
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    // ── lexemes ───────────────────────────────────────────────────────────────

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        // "กับ" is a stopword — should not appear
        assert!(!lexemes.contains(&String::from("กับ")));
        // Content words should appear
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

    // ── multi-token NE ────────────────────────────────────────────────────────

    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
        // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn multi_token_ne_reconstructable() {
        // Texts of all non-whitespace tokens must still reconstruct the normalized input.
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    // ── builder ───────────────────────────────────────────────────────────────

    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        // Both paths should produce the same result for a simple input
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }
}