1use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
/// A single token produced by [`FtsTokenizer::segment_for_fts`], enriched
/// with the metadata needed for full-text-search indexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// Surface text of the token (taken from the normalized input).
    pub text: String,
    /// Zero-based position counted over non-whitespace tokens only.
    pub position: usize,
    /// Token classification from segmentation / named-entity tagging.
    pub kind: TokenKind,
    /// True when the token text is present in the configured stopword set.
    pub is_stop: bool,
    /// Extra index terms attached to this token: dictionary synonym
    /// expansions, plus (when enabled) romanization, soundex codes and
    /// normalized number forms.
    pub synonyms: Vec<String>,
    /// Character n-grams, emitted only for `TokenKind::Unknown` tokens
    /// (used for fuzzy matching of out-of-dictionary words).
    pub trigrams: Vec<String>,
    /// Part-of-speech tag; assigned only to `TokenKind::Thai` tokens.
    pub pos: Option<PosTag>,
    /// Named-entity kind when `kind` is `TokenKind::Named(_)`.
    pub ne: Option<NamedEntityKind>,
}
64
/// Builder for [`FtsTokenizer`]. Any field left as `None` falls back to a
/// default when [`FtsTokenizerBuilder::build`] runs.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,       // default: StopwordSet::builtin()
    synonyms: Option<SynonymMap>,         // default: SynonymMap::empty()
    ngram_size: Option<usize>,            // default: 3
    pos_tagger: Option<PosTagger>,        // default: PosTagger::builtin()
    ne_tagger: Option<NeTagger>,          // default: NeTagger::builtin()
    romanization: Option<RomanizationMap>, // default: disabled
    abbrev_map: Option<AbbrevMap>,        // default: disabled
    number_normalize: Option<bool>,       // default: true
    soundex: Option<SoundexAlgorithm>,    // default: disabled
}
79
80impl FtsTokenizerBuilder {
81 pub fn stopwords(mut self, s: StopwordSet) -> Self {
83 self.stopwords = Some(s);
84 self
85 }
86
87 pub fn synonyms(mut self, m: SynonymMap) -> Self {
89 self.synonyms = Some(m);
90 self
91 }
92
93 pub fn ngram_size(mut self, n: usize) -> Self {
97 self.ngram_size = Some(n);
98 self
99 }
100
101 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
103 self.pos_tagger = Some(t);
104 self
105 }
106
107 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
109 self.ne_tagger = Some(t);
110 self
111 }
112
113 pub fn romanization(mut self, m: RomanizationMap) -> Self {
121 self.romanization = Some(m);
122 self
123 }
124
125 pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
134 self.abbrev_map = Some(m);
135 self
136 }
137
138 pub fn number_normalize(mut self, v: bool) -> Self {
151 self.number_normalize = Some(v);
152 self
153 }
154
155 pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
168 self.soundex = Some(algo);
169 self
170 }
171
172 pub fn build(self) -> FtsTokenizer {
174 FtsTokenizer {
175 tokenizer: Tokenizer::new(),
176 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
177 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
178 ngram_size: self.ngram_size.unwrap_or(3),
179 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
180 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
181 romanization: self.romanization,
182 abbrev_map: self.abbrev_map,
183 number_normalize: self.number_normalize.unwrap_or(true),
184 soundex: self.soundex,
185 }
186 }
187}
188
/// Full-text-search tokenizer pipeline for Thai text: normalization,
/// optional abbreviation expansion, segmentation, named-entity tagging,
/// and per-token enrichment (stopwords, synonyms, romanization, soundex,
/// number normalization, n-grams, POS tags).
pub struct FtsTokenizer {
    tokenizer: Tokenizer,                  // normalization + segmentation
    stopwords: StopwordSet,                // flags `is_stop`
    synonyms: SynonymMap,                  // dictionary synonym expansion
    ngram_size: usize,                     // n-gram size for Unknown tokens; 0 disables
    pos_tagger: PosTagger,                 // POS tags for Thai tokens
    ne_tagger: NeTagger,                   // named-entity merge/tagging
    romanization: Option<RomanizationMap>, // None = no romanized synonyms
    abbrev_map: Option<AbbrevMap>,         // None = no abbreviation expansion
    number_normalize: bool,                // Thai digit/word -> ASCII synonyms
    soundex: Option<SoundexAlgorithm>,     // None = no soundex synonyms
}
215
216impl FtsTokenizer {
217 pub fn new() -> Self {
219 FtsTokenizerBuilder::default().build()
220 }
221
222 pub fn builder() -> FtsTokenizerBuilder {
224 FtsTokenizerBuilder::default()
225 }
226
227 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
238 let normalized = self.tokenizer.normalize(text);
239 let expanded = match self.abbrev_map.as_ref() {
242 Some(am) => am.expand_text(&normalized),
243 None => normalized,
244 };
245 let raw_tokens = self
246 .ne_tagger
247 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
248
249 let mut result = Vec::with_capacity(raw_tokens.len());
250 let mut position = 0usize;
251
252 for token in &raw_tokens {
253 if token.kind == TokenKind::Whitespace {
254 continue;
255 }
256
257 let is_stop = self.stopwords.contains(token.text);
258 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
259 let mut synonyms = self
260 .synonyms
261 .expand(token.text)
262 .map(|s| s.to_vec())
263 .unwrap_or_default();
264 if is_thai_or_named {
265 if let Some(ref rom) = self.romanization {
266 if let Some(rtgs) = rom.romanize(token.text) {
267 synonyms.push(String::from(rtgs));
268 }
269 }
270 if let Some(algo) = self.soundex {
271 let code = soundex(token.text, algo);
272 if !code.chars().all(|c| c == '0') {
273 synonyms.push(code);
274 }
275 }
276 }
277 if self.number_normalize {
278 match token.kind {
279 TokenKind::Number => {
281 let ascii = thai_digits_to_ascii(token.text);
282 if ascii != token.text {
283 synonyms.push(ascii);
284 }
285 }
286 TokenKind::Thai => {
288 if let Some(decimal) = thai_word_to_decimal(token.text) {
289 synonyms.push(decimal);
290 }
291 }
292 _ => {}
293 }
294 }
295 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
296 char_ngrams(token.text, self.ngram_size)
297 .map(String::from)
298 .collect()
299 } else {
300 Vec::new()
301 };
302 let ne = if let TokenKind::Named(k) = token.kind {
303 Some(k)
304 } else {
305 None
306 };
307 let pos = if token.kind == TokenKind::Thai {
308 self.pos_tagger.tag(token.text)
309 } else {
310 None
311 };
312
313 result.push(FtsToken {
314 text: String::from(token.text),
315 position,
316 kind: token.kind,
317 is_stop,
318 synonyms,
319 trigrams,
320 pos,
321 ne,
322 });
323
324 position += 1;
325 }
326
327 result
328 }
329
330 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
335 self.segment_for_fts(text)
336 .into_iter()
337 .filter(|t| !t.is_stop)
338 .collect()
339 }
340
341 pub fn lexemes(&self, text: &str) -> Vec<String> {
347 let tokens = self.index_tokens(text);
348 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
349 for t in tokens {
350 out.push(t.text.clone());
351 out.extend(t.synonyms);
352 out.extend(t.trigrams);
353 }
354 out
355 }
356}
357
358impl Default for FtsTokenizer {
359 fn default() -> Self {
360 Self::new()
361 }
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    // Convenience: tokenizer with all default components.
    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    // Positions must stay dense (0, 1, 2, ...) even though whitespace
    // tokens are skipped during enumeration.
    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    // "กับ" ("with") is expected in the builtin stopword list.
    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    // Concatenating token texts must reproduce the normalized input:
    // segmentation is lossless (whitespace aside, input has none here).
    #[test]
    fn text_is_reconstructable() {
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    // Synonyms from a custom TSV map should be attached to the matching
    // token. The assertion is conditional on the token existing because
    // segmentation of "คอม" depends on the dictionary.
    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

    // "กิ" alone should segment as Unknown; with ngram_size=2 every
    // multi-char Unknown token must carry bigrams.
    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    // Filtering must not renumber: every indexed token keeps the position
    // it had in the full segmentation.
    #[test]
    fn index_tokens_preserves_positions() {
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.contains(&String::from("กับ")));
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

    // "กรุงเทพ" (Bangkok) may be segmented as multiple pieces; the NE
    // tagger is expected to merge them into one Named token.
    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    // NE merging must not lose characters: reconstruction still holds.
    #[test]
    fn multi_token_ne_reconstructable() {
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    // An unconfigured builder must behave exactly like `new()`.
    #[test]
    fn builder_default_equals_new() {
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }

    // Thai digits ๑๒๓ should get the ASCII form "123" as a synonym.
    #[test]
    fn thai_digit_token_gets_ascii_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        let t = num.unwrap();
        assert!(
            t.synonyms.contains(&String::from("123")),
            "Thai digit token should have ASCII synonym, got {:?}",
            t.synonyms
        );
    }

    // Conversion must be skipped when it would produce the token itself.
    #[test]
    fn ascii_digit_token_has_no_extra_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("123");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "ASCII digit token should not duplicate itself as a synonym"
        );
    }

    // "หนึ่งร้อย" ("one hundred") spelled out should yield "100" somewhere
    // in the synonyms (exact token split depends on the dictionary).
    #[test]
    fn thai_number_word_gets_decimal_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("หนึ่งร้อย");
        let has_hundred = tokens
            .iter()
            .any(|t| t.synonyms.contains(&String::from("100")));
        assert!(
            has_hundred,
            "expected a token with decimal synonym '100', tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.synonyms))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn number_normalize_false_disables_conversion() {
        let fts = FtsTokenizer::builder()
            .number_normalize(false)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "number_normalize=false should suppress ASCII synonym"
        );
    }

    #[test]
    fn mixed_thai_digit_in_context() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected Number token in mixed string");
        assert!(
            num.unwrap().synonyms.contains(&String::from("100")),
            "expected ASCII synonym '100' for ๑๐๐"
        );
    }

    // "ก.ค." (July) should be expanded before segmentation, so the dots
    // never surface as tokens.
    #[test]
    fn abbrev_map_expands_before_segmentation() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by abbrev expansion, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_disabled_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(
            texts.contains(&"."),
            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
        );
    }

    // The soundex synonym must match the raw lk82() output for the token.
    #[test]
    fn soundex_lk82_appended_to_thai_synonyms() {
        use crate::soundex::lk82;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิน");
        let t = tokens.iter().find(|t| t.text == "กิน");
        assert!(t.is_some(), "expected token 'กิน'");
        let expected_code = lk82("กิน");
        assert!(
            t.unwrap().synonyms.contains(&expected_code),
            "expected lk82 code '{expected_code}' in synonyms, got {:?}",
            t.unwrap().synonyms
        );
    }

    #[test]
    fn soundex_not_emitted_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("กินข้าว");
        for t in &tokens {
            for syn in &t.synonyms {
                // Heuristic: treat a 4-byte ASCII-alphanumeric string as a
                // soundex-like code. NOTE(review): assumes codes are always
                // 4 ASCII chars — confirm against the soundex module.
                let looks_like_soundex =
                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
                assert!(
                    !looks_like_soundex,
                    "unexpected soundex-like synonym '{}' on token '{}'",
                    syn, t.text
                );
            }
        }
    }

    // Homophone-ish words should collapse to the same lk82 code, enabling
    // phonetic matching through the index.
    #[test]
    fn soundex_same_sounding_words_share_code_in_index() {
        use crate::soundex::lk82;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let code = lk82("กาน");
        for word in &["กาน", "ขาน", "คาน"] {
            let tokens = fts.segment_for_fts(word);
            let t = tokens.first().expect("expected at least one token");
            assert!(
                t.synonyms.contains(&code),
                "'{word}' should carry lk82 code '{code}', got {:?}",
                t.synonyms
            );
        }
    }

    #[test]
    fn soundex_not_emitted_for_non_thai_tokens() {
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("hello 123");
        for t in &tokens {
            for syn in &t.synonyms {
                // Same 4-ASCII-alphanumeric heuristic as above.
                let looks_like_soundex =
                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
                assert!(
                    !looks_like_soundex,
                    "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
                    t.text
                );
            }
        }
    }

    #[test]
    fn soundex_udom83_appended() {
        use crate::soundex::udom83;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Udom83)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิน");
        let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
        let expected = udom83("กิน");
        assert!(
            t.synonyms.contains(&expected),
            "expected udom83 code '{expected}' in synonyms, got {:?}",
            t.synonyms
        );
    }

    // "พ.ศ." (Buddhist Era) followed by a year: expansion consumes the
    // dots and injects the full word before segmentation.
    #[test]
    fn abbrev_expansion_date_sentence() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("พ.ศ.2567");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by expansion, got: {texts:?}"
        );
    }
}