use alloc::string::String;
use alloc::vec::Vec;

use crate::abbrev::AbbrevMap;
use crate::ne::NeTagger;
use crate::ngram::char_ngrams;
use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
use crate::pos::{PosTag, PosTagger};
use crate::romanizer::RomanizationMap;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::{NamedEntityKind, TokenKind};
use crate::Tokenizer;

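/// A single token produced by the FTS pipeline, carrying the metadata an
/// indexer needs: position, token kind, stopword flag, and expansions.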
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// Token text after normalization (and abbreviation expansion, if enabled).
    pub text: String,
    /// Zero-based position among the emitted (non-whitespace) tokens.
    pub position: usize,
    /// Token class assigned during segmentation and NE tagging.
    pub kind: TokenKind,
    /// Whether the text is in the configured stopword set.
    pub is_stop: bool,
    /// Synonyms, plus romanized and number-normalized variants when enabled.
    pub synonyms: Vec<String>,
    /// Character n-grams emitted for `Unknown` tokens; empty otherwise.
    pub trigrams: Vec<String>,
    /// Part-of-speech tag, assigned only to Thai tokens.
    pub pos: Option<PosTag>,
    /// Named-entity kind, if the token was tagged as one.
    pub ne: Option<NamedEntityKind>,
}

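/// Builder for [`FtsTokenizer`]. Every unset component falls back to a
/// default in [`FtsTokenizerBuilder::build`].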
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,
    synonyms: Option<SynonymMap>,
    ngram_size: Option<usize>,
    pos_tagger: Option<PosTagger>,
    ne_tagger: Option<NeTagger>,
    romanization: Option<RomanizationMap>,
    abbrev_map: Option<AbbrevMap>,
    number_normalize: Option<bool>,
}

impl FtsTokenizerBuilder {
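    /// Replaces the builtin stopword set.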
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

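    /// Sets the synonym map used for expansion; the default map is empty.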
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

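    /// Sets the n-gram size applied to `Unknown` tokens. `0` disables
    /// n-gram emission; the default is 3 (trigrams).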
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

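    /// Replaces the builtin part-of-speech tagger.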
    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
        self.pos_tagger = Some(t);
        self
    }

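    /// Replaces the builtin named-entity tagger.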
    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
        self.ne_tagger = Some(t);
        self
    }

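    /// Enables romanization: the romanized form of each Thai or named
    /// token is added to its synonyms. Disabled by default.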
    pub fn romanization(mut self, m: RomanizationMap) -> Self {
        self.romanization = Some(m);
        self
    }

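    /// Enables abbreviation expansion on the normalized text before
    /// segmentation. Disabled by default.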
    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
        self.abbrev_map = Some(m);
        self
    }

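    /// Toggles number normalization (Thai digits to ASCII, Thai number
    /// words to decimal strings, both emitted as synonyms). Enabled by
    /// default.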
    pub fn number_normalize(mut self, v: bool) -> Self {
        self.number_normalize = Some(v);
        self
    }

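    /// Finalizes the builder, substituting defaults for unset components.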
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            ngram_size: self.ngram_size.unwrap_or(3),
            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
            romanization: self.romanization,
            abbrev_map: self.abbrev_map,
            number_normalize: self.number_normalize.unwrap_or(true),
        }
    }
}

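/// A tokenizer tailored for full-text-search indexing. It normalizes and
/// segments text, flags stopwords, attaches synonym, romanization, and
/// number expansions, emits n-grams for unknown spans, and tags part of
/// speech and named entities.
///
/// A minimal sketch of typical use (crate path elided):
///
/// ```ignore
/// let fts = FtsTokenizer::new();
/// // Index only content-bearing tokens; stopwords are filtered out.
/// for token in fts.index_tokens("กินข้าวกับปลา") {
///     let _ = (&token.text, &token.synonyms, &token.trigrams);
/// }
/// ```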
pub struct FtsTokenizer {
    tokenizer: Tokenizer,
    stopwords: StopwordSet,
    synonyms: SynonymMap,
    ngram_size: usize,
    pos_tagger: PosTagger,
    ne_tagger: NeTagger,
    romanization: Option<RomanizationMap>,
    abbrev_map: Option<AbbrevMap>,
    number_normalize: bool,
}

impl FtsTokenizer {
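    /// Creates a tokenizer with all default components; equivalent to
    /// `FtsTokenizerBuilder::default().build()`.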
    pub fn new() -> Self {
        FtsTokenizerBuilder::default().build()
    }

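    /// Returns a builder for customizing the pipeline components.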
    pub fn builder() -> FtsTokenizerBuilder {
        FtsTokenizerBuilder::default()
    }

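    /// Runs the full pipeline: normalization, optional abbreviation
    /// expansion, segmentation, named-entity tagging, and per-token
    /// enrichment (stopword flag, synonyms, n-grams, POS, NE). Whitespace
    /// tokens are dropped; `position` is sequential over the tokens that
    /// remain.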
    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
        // Normalize first so abbreviation expansion and segmentation see
        // canonical text.
        let normalized = self.tokenizer.normalize(text);
        let expanded = match self.abbrev_map.as_ref() {
            Some(am) => am.expand_text(&normalized),
            None => normalized,
        };
        // Segment, then let the NE tagger merge multi-token named entities.
        let raw_tokens = self
            .ne_tagger
            .tag_tokens(self.tokenizer.segment(&expanded), &expanded);

        let mut result = Vec::with_capacity(raw_tokens.len());
        let mut position = 0usize;

        for token in &raw_tokens {
            if token.kind == TokenKind::Whitespace {
                continue;
            }

            let is_stop = self.stopwords.contains(token.text);
            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
            // Synonyms from the map, plus romanized and numeric variants.
            let mut synonyms = self
                .synonyms
                .expand(token.text)
                .map(|s| s.to_vec())
                .unwrap_or_default();
            if is_thai_or_named {
                if let Some(ref rom) = self.romanization {
                    if let Some(rtgs) = rom.romanize(token.text) {
                        synonyms.push(String::from(rtgs));
                    }
                }
            }
            if self.number_normalize {
                match token.kind {
                    TokenKind::Number => {
                        // Thai digits gain an ASCII-digit synonym.
                        let ascii = thai_digits_to_ascii(token.text);
                        if ascii != token.text {
                            synonyms.push(ascii);
                        }
                    }
                    TokenKind::Thai => {
                        // Thai number words gain a decimal synonym.
                        if let Some(decimal) = thai_word_to_decimal(token.text) {
                            synonyms.push(decimal);
                        }
                    }
                    _ => {}
                }
            }
            // N-grams are a recall fallback for spans the dictionary missed.
            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
                char_ngrams(token.text, self.ngram_size)
                    .map(String::from)
                    .collect()
            } else {
                Vec::new()
            };
            let ne = if let TokenKind::Named(k) = token.kind {
                Some(k)
            } else {
                None
            };
            let pos = if token.kind == TokenKind::Thai {
                self.pos_tagger.tag(token.text)
            } else {
                None
            };

            result.push(FtsToken {
                text: String::from(token.text),
                position,
                kind: token.kind,
                is_stop,
                synonyms,
                trigrams,
                pos,
                ne,
            });

            position += 1;
        }

        result
    }

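    /// Like [`Self::segment_for_fts`], but with stopword tokens removed.
    /// Positions from the full segmentation are preserved.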
    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
        self.segment_for_fts(text)
            .into_iter()
            .filter(|t| !t.is_stop)
            .collect()
    }

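    /// Flattens [`Self::index_tokens`] into plain strings: each token's
    /// text followed by its synonyms and n-grams, stopwords excluded.
    ///
    /// ```ignore
    /// let lexemes = FtsTokenizer::new().lexemes("กินข้าวกับปลา");
    /// assert!(!lexemes.contains(&String::from("กับ"))); // stopword dropped
    /// ```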
    pub fn lexemes(&self, text: &str) -> Vec<String> {
        let tokens = self.index_tokens(text);
        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
        for t in tokens {
            out.push(t.text);
            out.extend(t.synonyms);
            out.extend(t.trigrams);
        }
        out
    }
}

impl Default for FtsTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

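    // N-gram fallback for spans the segmenter could not identify.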
    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

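    // Index-level helpers: stopword filtering and lexeme flattening.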
    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    #[test]
    fn index_tokens_preserves_positions() {
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.contains(&String::from("กับ")));
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

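    // Multi-token named entities through the full pipeline.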
    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn multi_token_ne_reconstructable() {
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

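    // Builder configuration.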
    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }

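    // Number normalization: Thai digits and Thai number words.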
    #[test]
    fn thai_digit_token_gets_ascii_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        let t = num.unwrap();
        assert!(
            t.synonyms.contains(&String::from("123")),
            "Thai digit token should have ASCII synonym, got {:?}",
            t.synonyms
        );
    }

    #[test]
    fn ascii_digit_token_has_no_extra_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("123");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "ASCII digit token should not duplicate itself as a synonym"
        );
    }

    #[test]
    fn thai_number_word_gets_decimal_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("หนึ่งร้อย");
        let has_hundred = tokens
            .iter()
            .any(|t| t.synonyms.contains(&String::from("100")));
        assert!(
            has_hundred,
            "expected a token with decimal synonym '100', tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.synonyms))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn number_normalize_false_disables_conversion() {
        let fts = FtsTokenizer::builder()
            .number_normalize(false)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "number_normalize=false should suppress ASCII synonym"
        );
    }

    #[test]
    fn mixed_thai_digit_in_context() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected Number token in mixed string");
        assert!(
            num.unwrap().synonyms.contains(&String::from("100")),
            "expected ASCII synonym '100' for ๑๐๐"
        );
    }

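    // Abbreviation expansion ahead of segmentation.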
    #[test]
    fn abbrev_map_expands_before_segmentation() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by abbrev expansion, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_disabled_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(
            texts.contains(&"."),
            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_date_sentence() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("พ.ศ.2567");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by expansion, got: {texts:?}"
        );
    }
}