// kham_core/fts.rs
1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
//! for t in &tokens {
//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
/// A token produced by the FTS pipeline, ready for lexeme indexing.
///
/// One instance is emitted by [`FtsTokenizer::segment_for_fts`] per
/// non-whitespace token of the normalised input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position in the non-whitespace token sequence (0-based).
    ///
    /// Whitespace tokens are dropped before positions are assigned, so
    /// surviving tokens carry consecutive positions with no gaps. Stopwords
    /// keep their position so phrase-distance scoring stays correct when
    /// they are later omitted from the index.
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Synonym expansions (empty if none configured or no match).
    pub synonyms: Vec<String>,
    /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
    pub trigrams: Vec<String>,
    /// Primary part-of-speech tag from the lookup table, or `None` if the word
    /// is not in the table (OOV) or is not a Thai token.
    pub pos: Option<PosTag>,
    /// Named entity category, or `None` if the token is not in the NE
    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
    pub ne: Option<NamedEntityKind>,
}
64
/// Builder for [`FtsTokenizer`].
///
/// Every `None` field falls back to the default chosen in
/// [`FtsTokenizerBuilder::build`]: built-in stopword/POS/NE tables, empty
/// synonyms, trigram size 3, number normalization on, and
/// romanization / abbreviation expansion / soundex disabled.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    /// Custom stopword set; `None` → [`StopwordSet::builtin`].
    stopwords: Option<StopwordSet>,
    /// Synonym expansions; `None` → [`SynonymMap::empty`].
    synonyms: Option<SynonymMap>,
    /// N-gram size for Unknown tokens; `None` → 3 (trigrams).
    ngram_size: Option<usize>,
    /// POS lookup table; `None` → [`PosTagger::builtin`].
    pos_tagger: Option<PosTagger>,
    /// NE gazetteer; `None` → [`NeTagger::builtin`].
    ne_tagger: Option<NeTagger>,
    /// RTGS romanization map; `None` → feature disabled.
    romanization: Option<RomanizationMap>,
    /// Abbreviation expansion map; `None` → feature disabled.
    abbrev_map: Option<AbbrevMap>,
    /// `None` means "use default (true)".
    number_normalize: Option<bool>,
    /// Phonetic soundex algorithm; `None` → feature disabled.
    soundex: Option<SoundexAlgorithm>,
}
79
impl FtsTokenizerBuilder {
    /// Use a custom stopword set instead of the built-in list.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
    /// let fts = FtsTokenizer::builder().stopwords(stops).build();
    /// let tokens = fts.segment_for_fts("กินข้าว");
    /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
    /// ```
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

    /// Attach a synonym map for expansion.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::synonym::SynonymMap;
    ///
    /// // TSV: canonical TAB synonym1 TAB synonym2 …
    /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
    /// let fts = FtsTokenizer::builder().synonyms(syns).build();
    /// let tokens = fts.segment_for_fts("รถ");
    /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
    /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
    /// ```
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
    ///
    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// // Disable n-grams entirely — useful when index size must be small
    /// let fts = FtsTokenizer::builder()
    ///     .ngram_size(0)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
    /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
    /// ```
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

    /// Use a custom POS tagger instead of the built-in table.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::pos::{PosTag, PosTagger};
    ///
    /// // Custom TSV: word TAB POS_TAG
    /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
    /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
    /// // Segment กิน alone so it is not merged into a compound
    /// let tokens = fts.segment_for_fts("กิน");
    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
    /// assert_eq!(t.pos, Some(PosTag::Verb));
    /// ```
    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
        self.pos_tagger = Some(t);
        self
    }

    /// Use a custom NE gazetteer instead of the built-in table.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::ne::NeTagger;
    /// use kham_core::TokenKind;
    ///
    /// // Domain-specific NE list: word TAB NE_TAG
    /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
    /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
    /// let tokens = fts.segment_for_fts("เซเรน่า");
    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
    /// ```
    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
        self.ne_tagger = Some(t);
        self
    }

    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
    ///
    /// When set, each Thai and Named token whose text is found in the map gets its
    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::romanizer::RomanizationMap;
    ///
    /// // TSV: Thai word TAB RTGS romanization
    /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
    /// let fts = FtsTokenizer::builder().romanization(rom).build();
    /// let tokens = fts.segment_for_fts("กิน");
    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
    /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
    /// assert!(t.synonyms.contains(&String::from("kin")));
    /// ```
    pub fn romanization(mut self, m: RomanizationMap) -> Self {
        self.romanization = Some(m);
        self
    }

    /// Attach an abbreviation map for pre-tokenisation expansion.
    ///
    /// When set, [`FtsTokenizer::segment_for_fts`] calls
    /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
    /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
    /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::abbrev::AbbrevMap;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .abbrevs(AbbrevMap::builtin())
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
    /// let tokens = fts.segment_for_fts("ก.ค.");
    /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
    /// ```
    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
        self.abbrev_map = Some(m);
        self
    }

    /// Enable or disable number normalization (default: `true`).
    ///
    /// When enabled:
    /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
    ///   ASCII digit string added to their [`FtsToken::synonyms`]
    ///   (e.g. `๑๒๓` → synonym `"123"`).
    /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
    ///   words get their decimal value added to `synonyms`
    ///   (e.g. `หนึ่งร้อย` → synonym `"100"`).
    ///
    /// This lets queries using either script match documents written in the
    /// other. Set to `false` to opt out.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::TokenKind;
    ///
    /// // Default (true): ๑๒๓ gets ASCII synonym "123"
    /// let fts = FtsTokenizer::new();
    /// let tokens = fts.segment_for_fts("๑๒๓");
    /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
    /// assert!(num.synonyms.contains(&String::from("123")));
    ///
    /// // Opt out: no conversion performed
    /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
    /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
    /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
    /// assert!(!num_off.synonyms.contains(&String::from("123")));
    /// ```
    pub fn number_normalize(mut self, v: bool) -> Self {
        self.number_normalize = Some(v);
        self
    }

    /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
    ///
    /// When set, each Thai and Named token whose text contains Thai consonants gets its
    /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
    /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
    ///
    /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
    /// 4-character codes and are the recommended choices for FTS indexing.
    /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
    /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::soundex::{lk82, SoundexAlgorithm};
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .soundex(SoundexAlgorithm::Lk82)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
    /// for word in &["กาน", "ขาน", "คาน"] {
    ///     let tokens = fts.segment_for_fts(word);
    ///     let t = tokens.first().unwrap();
    ///     assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
    /// }
    /// ```
    pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
        self.soundex = Some(algo);
        self
    }

    /// Consume the builder and return a configured [`FtsTokenizer`].
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::soundex::SoundexAlgorithm;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .soundex(SoundexAlgorithm::Lk82)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
    /// ```
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            // Built-in linguistic tables unless explicitly overridden.
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            // Trigrams by default; 0 disables n-gram generation.
            ngram_size: self.ngram_size.unwrap_or(3),
            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
            // Opt-in features stay `None`/off when not configured.
            romanization: self.romanization,
            abbrev_map: self.abbrev_map,
            number_normalize: self.number_normalize.unwrap_or(true),
            soundex: self.soundex,
        }
    }
}
342
/// Full-text search tokenizer for Thai text.
///
/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
/// generation for out-of-vocabulary tokens.
///
/// Construct once and reuse:
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
///
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct FtsTokenizer {
    // Underlying normaliser + dictionary segmenter.
    tokenizer: Tokenizer,
    // Words tagged `is_stop` and excluded by `index_tokens`/`lexemes`.
    stopwords: StopwordSet,
    // Synonym expansions attached to each matching token.
    synonyms: SynonymMap,
    // Character n-gram size for Unknown tokens; 0 disables n-grams.
    ngram_size: usize,
    // Lookup table backing `FtsToken::pos` for Thai tokens.
    pos_tagger: PosTagger,
    // Gazetteer used to tag named entities after segmentation.
    ne_tagger: NeTagger,
    // Optional RTGS romanization → extra Latin-script synonyms.
    romanization: Option<RomanizationMap>,
    // Optional abbreviation expansion applied before segmentation.
    abbrev_map: Option<AbbrevMap>,
    // Add ASCII/decimal synonyms for Thai digits and number words.
    number_normalize: bool,
    // Optional phonetic algorithm → soundex-code synonyms.
    soundex: Option<SoundexAlgorithm>,
}
369
370impl FtsTokenizer {
371 /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
372 ///
373 /// # Example
374 ///
375 /// ```rust
376 /// use kham_core::fts::FtsTokenizer;
377 ///
378 /// let fts = FtsTokenizer::new();
379 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
380 /// // Built-in stopword กับ is excluded; content words are present
381 /// assert!(!lexemes.contains(&String::from("กับ")));
382 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
383 /// ```
384 pub fn new() -> Self {
385 FtsTokenizerBuilder::default().build()
386 }
387
388 /// Return a [`FtsTokenizerBuilder`] for custom configuration.
389 ///
390 /// # Example
391 ///
392 /// ```rust
393 /// use kham_core::fts::FtsTokenizer;
394 /// use kham_core::soundex::SoundexAlgorithm;
395 /// use kham_core::synonym::SynonymMap;
396 ///
397 /// let fts = FtsTokenizer::builder()
398 /// .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
399 /// .soundex(SoundexAlgorithm::Lk82)
400 /// .build();
401 /// assert!(!fts.segment_for_fts("รถ").is_empty());
402 /// ```
403 pub fn builder() -> FtsTokenizerBuilder {
404 FtsTokenizerBuilder::default()
405 }
406
407 /// Segment `text` and annotate each token for FTS indexing.
408 ///
409 /// Normalises the input text before segmentation so that สระลอย and stacked
410 /// tone marks are handled correctly. Whitespace tokens are excluded.
411 ///
412 /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
413 /// [`index_tokens`] instead when you only need the tokens to be indexed
414 /// (stopwords excluded).
415 ///
416 /// [`index_tokens`]: FtsTokenizer::index_tokens
417 ///
418 /// # Examples
419 ///
420 /// ```rust
421 /// use kham_core::fts::FtsTokenizer;
422 ///
423 /// let fts = FtsTokenizer::new();
424 /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
425 /// // Positions are 0-based and sequential across non-whitespace tokens
426 /// for (i, t) in tokens.iter().enumerate() {
427 /// assert_eq!(t.position, i);
428 /// }
429 /// // กับ is a common conjunction — marked as a stopword
430 /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
431 /// assert!(kap.is_stop);
432 /// ```
433 ///
434 /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
435 ///
436 /// ```rust
437 /// use kham_core::fts::FtsTokenizer;
438 /// use kham_core::TokenKind;
439 ///
440 /// let fts = FtsTokenizer::new();
441 /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
442 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
443 /// ```
444 ///
445 /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
446 ///
447 /// ```rust
448 /// use kham_core::fts::FtsTokenizer;
449 /// use kham_core::soundex::SoundexAlgorithm;
450 ///
451 /// let fts = FtsTokenizer::builder()
452 /// .soundex(SoundexAlgorithm::Lk82)
453 /// .build();
454 /// let tokens = fts.segment_for_fts("กิน");
455 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
456 /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
457 /// assert!(!t.synonyms.is_empty());
458 /// ```
459 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
460 let normalized = self.tokenizer.normalize(text);
461 // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
462 // dot-containing patterns are replaced as single units.
463 let expanded = match self.abbrev_map.as_ref() {
464 Some(am) => am.expand_text(&normalized),
465 None => normalized,
466 };
467 let raw_tokens = self
468 .ne_tagger
469 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
470
471 let mut result = Vec::with_capacity(raw_tokens.len());
472 let mut position = 0usize;
473
474 for token in &raw_tokens {
475 if token.kind == TokenKind::Whitespace {
476 continue;
477 }
478
479 let is_stop = self.stopwords.contains(token.text);
480 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
481 let mut synonyms = self
482 .synonyms
483 .expand(token.text)
484 .map(|s| s.to_vec())
485 .unwrap_or_default();
486 if is_thai_or_named {
487 if let Some(ref rom) = self.romanization {
488 if let Some(rtgs) = rom.romanize(token.text) {
489 synonyms.push(String::from(rtgs));
490 }
491 }
492 if let Some(algo) = self.soundex {
493 let code = soundex(token.text, algo);
494 if !code.chars().all(|c| c == '0') {
495 synonyms.push(code);
496 }
497 }
498 }
499 if self.number_normalize {
500 match token.kind {
501 // Number token with Thai digits → add ASCII form as synonym.
502 TokenKind::Number => {
503 let ascii = thai_digits_to_ascii(token.text);
504 if ascii != token.text {
505 synonyms.push(ascii);
506 }
507 }
508 // Thai token that is a recognised number word → add decimal string.
509 TokenKind::Thai => {
510 if let Some(decimal) = thai_word_to_decimal(token.text) {
511 synonyms.push(decimal);
512 }
513 }
514 _ => {}
515 }
516 }
517 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
518 char_ngrams(token.text, self.ngram_size)
519 .map(String::from)
520 .collect()
521 } else {
522 Vec::new()
523 };
524 let ne = if let TokenKind::Named(k) = token.kind {
525 Some(k)
526 } else {
527 None
528 };
529 let pos = if token.kind == TokenKind::Thai {
530 self.pos_tagger.tag(token.text)
531 } else {
532 None
533 };
534
535 result.push(FtsToken {
536 text: String::from(token.text),
537 position,
538 kind: token.kind,
539 is_stop,
540 synonyms,
541 trigrams,
542 pos,
543 ne,
544 });
545
546 position += 1;
547 }
548
549 result
550 }
551
552 /// Return only the tokens to be written into a search index.
553 ///
554 /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
555 /// its original `position` so phrase-distance scoring remains correct.
556 ///
557 /// # Example
558 ///
559 /// ```rust
560 /// use kham_core::fts::FtsTokenizer;
561 ///
562 /// let fts = FtsTokenizer::new();
563 /// let tokens = fts.index_tokens("กินข้าวกับปลา");
564 /// // No stopwords in the index
565 /// assert!(tokens.iter().all(|t| !t.is_stop));
566 /// // Positions are preserved from the full sequence for phrase scoring
567 /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
568 /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
569 /// ```
570 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
571 self.segment_for_fts(text)
572 .into_iter()
573 .filter(|t| !t.is_stop)
574 .collect()
575 }
576
577 /// Collect all lexeme strings to be stored in a `tsvector`.
578 ///
579 /// Returns one string per non-stop token, plus synonym expansions and
580 /// trigrams for unknown tokens. Duplicates are not removed (the caller or
581 /// PostgreSQL handles deduplication).
582 ///
583 /// # Example
584 ///
585 /// ```rust
586 /// use kham_core::fts::FtsTokenizer;
587 ///
588 /// let fts = FtsTokenizer::new();
589 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
590 /// // Content words are present; stopword กับ is absent
591 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
592 /// assert!(!lexemes.contains(&String::from("กับ")));
593 /// ```
594 ///
595 /// With Thai digit normalization (enabled by default), both scripts match:
596 ///
597 /// ```rust
598 /// use kham_core::fts::FtsTokenizer;
599 ///
600 /// let fts = FtsTokenizer::new();
601 /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
602 /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
603 /// assert!(lexemes.contains(&String::from("100")));
604 /// ```
605 pub fn lexemes(&self, text: &str) -> Vec<String> {
606 let tokens = self.index_tokens(text);
607 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
608 for t in tokens {
609 out.push(t.text.clone());
610 out.extend(t.synonyms);
611 out.extend(t.trigrams);
612 }
613 out
614 }
615}
616
617impl Default for FtsTokenizer {
618 fn default() -> Self {
619 Self::new()
620 }
621}
622
623// ---------------------------------------------------------------------------
624// Tests
625// ---------------------------------------------------------------------------
626
627#[cfg(test)]
628mod tests {
629 use super::*;
630 use crate::stopwords::StopwordSet;
631 use crate::synonym::SynonymMap;
632
633 fn fts() -> FtsTokenizer {
634 FtsTokenizer::new()
635 }
636
637 // ── segment_for_fts ───────────────────────────────────────────────────────
638
639 #[test]
640 fn empty_input_returns_empty() {
641 assert!(fts().segment_for_fts("").is_empty());
642 }
643
644 #[test]
645 fn whitespace_tokens_excluded() {
646 let tokens = fts().segment_for_fts("กิน ข้าว");
647 assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
648 }
649
650 #[test]
651 fn positions_are_sequential() {
652 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
653 for (i, t) in tokens.iter().enumerate() {
654 assert_eq!(t.position, i, "position mismatch at index {i}");
655 }
656 }
657
658 #[test]
659 fn known_stopword_is_tagged() {
660 // "กับ" is a common conjunction and should be in the built-in stopword list
661 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
662 let kap = tokens.iter().find(|t| t.text == "กับ");
663 assert!(kap.is_some(), "expected 'กับ' token");
664 assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
665 }
666
667 #[test]
668 fn content_words_not_tagged_as_stop() {
669 let tokens = fts().segment_for_fts("โรงพยาบาล");
670 // May be OOV but should not be a stopword
671 for t in &tokens {
672 assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
673 }
674 }
675
676 #[test]
677 fn text_is_reconstructable() {
678 // All tokens joined == normalised input (whitespace dropped)
679 let fts = fts();
680 let text = "กินข้าวกับปลา";
681 let normalized = fts.tokenizer.normalize(text);
682 let tokens = fts.segment_for_fts(text);
683 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
684 assert_eq!(rebuilt, normalized);
685 }
686
687 // ── synonym expansion ─────────────────────────────────────────────────────
688
689 #[test]
690 fn synonym_expansion_attached() {
691 let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
692 let fts = FtsTokenizer::builder()
693 .synonyms(synonyms)
694 .stopwords(StopwordSet::from_text(""))
695 .build();
696 // Segment a text containing "คอม" — need it in dict or it lands as Unknown
697 // Use builder with custom word so the segmenter recognises it
698 let tokens = fts.segment_for_fts("คอม");
699 let t = tokens.iter().find(|t| t.text == "คอม");
700 if let Some(tok) = t {
701 assert!(
702 tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
703 "expected synonym expansion, got {:?}",
704 tok.synonyms
705 );
706 }
707 }
708
709 #[test]
710 fn no_synonyms_when_map_empty() {
711 let tokens = fts().segment_for_fts("กินข้าว");
712 for t in &tokens {
713 assert!(t.synonyms.is_empty());
714 }
715 }
716
717 // ── unknown token trigrams ────────────────────────────────────────────────
718
719 #[test]
720 fn unknown_token_gets_trigrams() {
721 // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
722 // With ngram_size=2 the token should yield one bigram ("กิ").
723 // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
724 // (like "กิ") are the shortest unit that can produce n-grams.
725 let fts = FtsTokenizer::builder()
726 .ngram_size(2)
727 .stopwords(StopwordSet::from_text(""))
728 .build();
729 let tokens = fts.segment_for_fts("กิ");
730 let unknown: Vec<_> = tokens
731 .iter()
732 .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
733 .collect();
734 assert!(
735 !unknown.is_empty(),
736 "expected at least one multi-char Unknown token for 'กิ'"
737 );
738 for u in &unknown {
739 assert!(
740 !u.trigrams.is_empty(),
741 "unknown token '{}' ({} chars) should have bigrams",
742 u.text,
743 u.text.chars().count()
744 );
745 }
746 }
747
748 #[test]
749 fn known_thai_token_has_no_trigrams() {
750 let tokens = fts().segment_for_fts("กิน");
751 for t in &tokens {
752 if t.kind == TokenKind::Thai {
753 assert!(
754 t.trigrams.is_empty(),
755 "known Thai token '{}' should not have trigrams",
756 t.text
757 );
758 }
759 }
760 }
761
762 #[test]
763 fn ngram_size_zero_disables_trigrams() {
764 let fts = FtsTokenizer::builder()
765 .ngram_size(0)
766 .stopwords(StopwordSet::from_text(""))
767 .build();
768 let tokens = fts.segment_for_fts("กขคง");
769 for t in &tokens {
770 assert!(t.trigrams.is_empty());
771 }
772 }
773
774 // ── index_tokens ──────────────────────────────────────────────────────────
775
776 #[test]
777 fn index_tokens_excludes_stopwords() {
778 let tokens = fts().index_tokens("กินข้าวกับปลา");
779 assert!(tokens.iter().all(|t| !t.is_stop));
780 }
781
782 #[test]
783 fn index_tokens_preserves_positions() {
784 // Positions in index_tokens must be a subset of segment_for_fts positions
785 let all = fts().segment_for_fts("กินข้าวกับปลา");
786 let indexed = fts().index_tokens("กินข้าวกับปลา");
787 for t in &indexed {
788 assert!(
789 all.iter().any(|a| a.position == t.position),
790 "indexed token at position {} not found in full token list",
791 t.position
792 );
793 }
794 }
795
796 // ── lexemes ───────────────────────────────────────────────────────────────
797
798 #[test]
799 fn lexemes_returns_non_stop_texts() {
800 let lexemes = fts().lexemes("กินข้าวกับปลา");
801 // "กับ" is a stopword — should not appear
802 assert!(!lexemes.contains(&String::from("กับ")));
803 // Content words should appear
804 assert!(
805 lexemes
806 .iter()
807 .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
808 "expected content words in lexemes: {lexemes:?}"
809 );
810 }
811
812 #[test]
813 fn lexemes_empty_input_is_empty() {
814 assert!(fts().lexemes("").is_empty());
815 }
816
817 // ── multi-token NE ────────────────────────────────────────────────────────
818
819 #[test]
820 fn multi_token_ne_merged_in_pipeline() {
821 // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
822 // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
823 let fts = FtsTokenizer::new();
824 let tokens = fts.segment_for_fts("ไปกรุงเทพ");
825 let named: Vec<_> = tokens
826 .iter()
827 .filter(|t| matches!(t.kind, TokenKind::Named(_)))
828 .collect();
829 assert!(
830 named.iter().any(|t| t.text == "กรุงเทพ"),
831 "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
832 tokens
833 .iter()
834 .map(|t| (&t.text, &t.kind))
835 .collect::<alloc::vec::Vec<_>>()
836 );
837 }
838
839 #[test]
840 fn multi_token_ne_reconstructable() {
841 // Texts of all non-whitespace tokens must still reconstruct the normalized input.
842 let fts = FtsTokenizer::new();
843 let text = "ไปกรุงเทพ";
844 let normalized = fts.tokenizer.normalize(text);
845 let tokens = fts.segment_for_fts(text);
846 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
847 assert_eq!(rebuilt, normalized);
848 }
849
850 // ── builder ───────────────────────────────────────────────────────────────
851
852 #[test]
853 fn builder_custom_stopwords() {
854 let stops = StopwordSet::from_text("กิน\n");
855 let fts = FtsTokenizer::builder().stopwords(stops).build();
856 let tokens = fts.segment_for_fts("กินข้าว");
857 let gin = tokens.iter().find(|t| t.text == "กิน");
858 if let Some(t) = gin {
859 assert!(t.is_stop, "'กิน' should be stop with custom list");
860 }
861 }
862
863 #[test]
864 fn builder_default_equals_new() {
865 // Both paths should produce the same result for a simple input
866 let a = FtsTokenizer::new().lexemes("กินข้าว");
867 let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
868 assert_eq!(a, b);
869 }
870
871 // ── number normalization ──────────────────────────────────────────────────
872
873 #[test]
874 fn thai_digit_token_gets_ascii_synonym() {
875 let fts = FtsTokenizer::new();
876 let tokens = fts.segment_for_fts("๑๒๓");
877 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
878 assert!(num.is_some(), "expected a Number token");
879 let t = num.unwrap();
880 assert!(
881 t.synonyms.contains(&String::from("123")),
882 "Thai digit token should have ASCII synonym, got {:?}",
883 t.synonyms
884 );
885 }
886
887 #[test]
888 fn ascii_digit_token_has_no_extra_synonym() {
889 // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
890 let fts = FtsTokenizer::new();
891 let tokens = fts.segment_for_fts("123");
892 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
893 assert!(num.is_some(), "expected a Number token");
894 assert!(
895 !num.unwrap().synonyms.contains(&String::from("123")),
896 "ASCII digit token should not duplicate itself as a synonym"
897 );
898 }
899
900 #[test]
901 fn thai_number_word_gets_decimal_synonym() {
902 // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
903 // on the dictionary. We check that at least one token carries "100" in synonyms.
904 let fts = FtsTokenizer::new();
905 let tokens = fts.segment_for_fts("หนึ่งร้อย");
906 let has_hundred = tokens
907 .iter()
908 .any(|t| t.synonyms.contains(&String::from("100")));
909 // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
910 assert!(
911 has_hundred,
912 "expected a token with decimal synonym '100', tokens: {:?}",
913 tokens
914 .iter()
915 .map(|t| (&t.text, &t.synonyms))
916 .collect::<alloc::vec::Vec<_>>()
917 );
918 }
919
920 #[test]
921 fn number_normalize_false_disables_conversion() {
922 let fts = FtsTokenizer::builder()
923 .number_normalize(false)
924 .stopwords(StopwordSet::from_text(""))
925 .build();
926 let tokens = fts.segment_for_fts("๑๒๓");
927 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
928 assert!(num.is_some());
929 assert!(
930 !num.unwrap().synonyms.contains(&String::from("123")),
931 "number_normalize=false should suppress ASCII synonym"
932 );
933 }
934
935 #[test]
936 fn mixed_thai_digit_in_context() {
937 // "ธนาคาร๑๐๐แห่ง" — the ๑๐๐ part should be a Number token with synonym "100"
938 let fts = FtsTokenizer::new();
939 let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
940 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
941 assert!(num.is_some(), "expected Number token in mixed string");
942 assert!(
943 num.unwrap().synonyms.contains(&String::from("100")),
944 "expected ASCII synonym '100' for ๑๐๐"
945 );
946 }
947
948 // ── abbreviation expansion ────────────────────────────────────────────────
949
950 #[test]
951 fn abbrev_map_expands_before_segmentation() {
952 use crate::abbrev::AbbrevMap;
953 let fts = FtsTokenizer::builder()
954 .abbrevs(AbbrevMap::builtin())
955 .stopwords(StopwordSet::from_text(""))
956 .build();
957 // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
958 // expansion further (กรกฎา + คม) — what matters is that dots are gone
959 // and the Thai characters of กรกฎาคม are present.
960 let tokens = fts.segment_for_fts("ก.ค.");
961 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
962 let joined: String = texts.concat();
963 assert!(
964 joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
965 "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
966 );
967 assert!(
968 !texts.contains(&"."),
969 "dots should be consumed by abbrev expansion, got: {texts:?}"
970 );
971 }
972
973 #[test]
974 fn abbrev_expansion_disabled_by_default() {
975 // FtsTokenizer::new() has no abbrev_map — ก.ค. stays as individual tokens.
976 let fts = FtsTokenizer::new();
977 let tokens = fts.segment_for_fts("ก.ค.");
978 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
979 // Without expansion the dot(s) must still be present as punctuation tokens.
980 assert!(
981 texts.contains(&"."),
982 "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
983 );
984 }
985
986 // ── soundex synonyms ──────────────────────────────────────────────────────
987
988 #[test]
989 fn soundex_lk82_appended_to_thai_synonyms() {
990 use crate::soundex::lk82;
991 let fts = FtsTokenizer::builder()
992 .soundex(SoundexAlgorithm::Lk82)
993 .stopwords(StopwordSet::from_text(""))
994 .build();
995 let tokens = fts.segment_for_fts("กิน");
996 let t = tokens.iter().find(|t| t.text == "กิน");
997 assert!(t.is_some(), "expected token 'กิน'");
998 let expected_code = lk82("กิน");
999 assert!(
1000 t.unwrap().synonyms.contains(&expected_code),
1001 "expected lk82 code '{expected_code}' in synonyms, got {:?}",
1002 t.unwrap().synonyms
1003 );
1004 }
1005
1006 #[test]
1007 fn soundex_not_emitted_by_default() {
1008 // Without .soundex() in the builder, no soundex codes should appear.
1009 let fts = FtsTokenizer::new();
1010 let tokens = fts.segment_for_fts("กินข้าว");
1011 for t in &tokens {
1012 // A soundex code is 4 ASCII chars (lk82/udom83); no synonym should look like one.
1013 for syn in &t.synonyms {
1014 let looks_like_soundex =
1015 syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1016 assert!(
1017 !looks_like_soundex,
1018 "unexpected soundex-like synonym '{}' on token '{}'",
1019 syn, t.text
1020 );
1021 }
1022 }
1023 }
1024
1025 #[test]
1026 fn soundex_same_sounding_words_share_code_in_index() {
1027 // กาน and ขาน share lk82 code "1600"; both should carry it as a synonym.
1028 use crate::soundex::lk82;
1029 let fts = FtsTokenizer::builder()
1030 .soundex(SoundexAlgorithm::Lk82)
1031 .stopwords(StopwordSet::from_text(""))
1032 .build();
1033 let code = lk82("กาน");
1034 for word in &["กาน", "ขาน", "คาน"] {
1035 let tokens = fts.segment_for_fts(word);
1036 let t = tokens.first().expect("expected at least one token");
1037 assert!(
1038 t.synonyms.contains(&code),
1039 "'{word}' should carry lk82 code '{code}', got {:?}",
1040 t.synonyms
1041 );
1042 }
1043 }
1044
1045 #[test]
1046 fn soundex_not_emitted_for_non_thai_tokens() {
1047 let fts = FtsTokenizer::builder()
1048 .soundex(SoundexAlgorithm::Lk82)
1049 .stopwords(StopwordSet::from_text(""))
1050 .build();
1051 let tokens = fts.segment_for_fts("hello 123");
1052 for t in &tokens {
1053 for syn in &t.synonyms {
1054 let looks_like_soundex =
1055 syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1056 assert!(
1057 !looks_like_soundex,
1058 "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
1059 t.text
1060 );
1061 }
1062 }
1063 }
1064
1065 #[test]
1066 fn soundex_udom83_appended() {
1067 use crate::soundex::udom83;
1068 let fts = FtsTokenizer::builder()
1069 .soundex(SoundexAlgorithm::Udom83)
1070 .stopwords(StopwordSet::from_text(""))
1071 .build();
1072 let tokens = fts.segment_for_fts("กิน");
1073 let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
1074 let expected = udom83("กิน");
1075 assert!(
1076 t.synonyms.contains(&expected),
1077 "expected udom83 code '{expected}' in synonyms, got {:?}",
1078 t.synonyms
1079 );
1080 }
1081
1082 #[test]
1083 fn abbrev_expansion_date_sentence() {
1084 use crate::abbrev::AbbrevMap;
1085 let fts = FtsTokenizer::builder()
1086 .abbrevs(AbbrevMap::builtin())
1087 .stopwords(StopwordSet::from_text(""))
1088 .build();
1089 // พ.ศ. → พุทธศักราช; the segmenter may split it further — verify the
1090 // chars are present and dots are gone.
1091 let tokens = fts.segment_for_fts("พ.ศ.2567");
1092 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1093 let joined: String = texts.concat();
1094 assert!(
1095 joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
1096 "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
1097 );
1098 assert!(
1099 !texts.contains(&"."),
1100 "dots should be consumed by expansion, got: {texts:?}"
1101 );
1102 }
1103}