kham_core/
fts.rs

1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A token produced by the FTS pipeline, ready for lexeme indexing.
43#[derive(Debug, Clone, PartialEq, Eq)]
44pub struct FtsToken {
45    /// The token text (owned; may be normalised).
46    pub text: String,
47    /// Ordinal position in the token sequence (0-based, gaps for whitespace).
48    pub position: usize,
49    /// Script / category of the original token.
50    pub kind: TokenKind,
51    /// `true` if this token matches the stopword list.
52    pub is_stop: bool,
53    /// Synonym expansions (empty if none configured or no match).
54    pub synonyms: Vec<String>,
55    /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
56    pub trigrams: Vec<String>,
57    /// Primary part-of-speech tag from the lookup table, or `None` if the word
58    /// is not in the table (OOV) or is not a Thai token.
59    pub pos: Option<PosTag>,
60    /// Named entity category, or `None` if the token is not in the NE
61    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
62    pub ne: Option<NamedEntityKind>,
63}
64
65/// Builder for [`FtsTokenizer`].
66#[derive(Default)]
67pub struct FtsTokenizerBuilder {
68    stopwords: Option<StopwordSet>,
69    synonyms: Option<SynonymMap>,
70    ngram_size: Option<usize>,
71    pos_tagger: Option<PosTagger>,
72    ne_tagger: Option<NeTagger>,
73    romanization: Option<RomanizationMap>,
74    abbrev_map: Option<AbbrevMap>,
75    /// `None` means "use default (true)".
76    number_normalize: Option<bool>,
77    soundex: Option<SoundexAlgorithm>,
78    /// Extra words to overlay on top of the built-in dictionary (fast path).
79    dict_merge: Option<String>,
80}
81
82impl FtsTokenizerBuilder {
83    /// Use a custom stopword set instead of the built-in list.
84    ///
85    /// # Example
86    ///
87    /// ```rust
88    /// use kham_core::fts::FtsTokenizer;
89    /// use kham_core::stopwords::StopwordSet;
90    ///
91    /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
92    /// let fts = FtsTokenizer::builder().stopwords(stops).build();
93    /// let tokens = fts.segment_for_fts("กินข้าว");
94    /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
95    /// ```
96    pub fn stopwords(mut self, s: StopwordSet) -> Self {
97        self.stopwords = Some(s);
98        self
99    }
100
101    /// Attach a synonym map for expansion.
102    ///
103    /// # Example
104    ///
105    /// ```rust
106    /// use kham_core::fts::FtsTokenizer;
107    /// use kham_core::synonym::SynonymMap;
108    ///
109    /// // TSV: canonical TAB synonym1 TAB synonym2 …
110    /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
111    /// let fts = FtsTokenizer::builder().synonyms(syns).build();
112    /// let tokens = fts.segment_for_fts("รถ");
113    /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
114    /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
115    /// ```
116    pub fn synonyms(mut self, m: SynonymMap) -> Self {
117        self.synonyms = Some(m);
118        self
119    }
120
121    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
122    ///
123    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
124    ///
125    /// # Example
126    ///
127    /// ```rust
128    /// use kham_core::fts::FtsTokenizer;
129    /// use kham_core::stopwords::StopwordSet;
130    ///
131    /// // Disable n-grams entirely — useful when index size must be small
132    /// let fts = FtsTokenizer::builder()
133    ///     .ngram_size(0)
134    ///     .stopwords(StopwordSet::from_text(""))
135    ///     .build();
136    /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
137    /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
138    /// ```
139    pub fn ngram_size(mut self, n: usize) -> Self {
140        self.ngram_size = Some(n);
141        self
142    }
143
144    /// Use a custom POS tagger instead of the built-in table.
145    ///
146    /// # Example
147    ///
148    /// ```rust
149    /// use kham_core::fts::FtsTokenizer;
150    /// use kham_core::pos::{PosTag, PosTagger};
151    ///
152    /// // Custom TSV: word TAB POS_TAG
153    /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
154    /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
155    /// // Segment กิน alone so it is not merged into a compound
156    /// let tokens = fts.segment_for_fts("กิน");
157    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
158    /// assert_eq!(t.pos, Some(PosTag::Verb));
159    /// ```
160    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
161        self.pos_tagger = Some(t);
162        self
163    }
164
165    /// Use a custom NE gazetteer instead of the built-in table.
166    ///
167    /// # Example
168    ///
169    /// ```rust
170    /// use kham_core::fts::FtsTokenizer;
171    /// use kham_core::ne::NeTagger;
172    /// use kham_core::TokenKind;
173    ///
174    /// // Domain-specific NE list: word TAB NE_TAG
175    /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
176    /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
177    /// let tokens = fts.segment_for_fts("เซเรน่า");
178    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
179    /// ```
180    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
181        self.ne_tagger = Some(t);
182        self
183    }
184
185    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
186    ///
187    /// When set, each Thai and Named token whose text is found in the map gets its
188    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
189    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
190    ///
191    /// Disabled by default — call this method to opt in.
192    ///
193    /// # Example
194    ///
195    /// ```rust
196    /// use kham_core::fts::FtsTokenizer;
197    /// use kham_core::romanizer::RomanizationMap;
198    ///
199    /// // TSV: Thai word TAB RTGS romanization
200    /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
201    /// let fts = FtsTokenizer::builder().romanization(rom).build();
202    /// let tokens = fts.segment_for_fts("กิน");
203    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
204    /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
205    /// assert!(t.synonyms.contains(&String::from("kin")));
206    /// ```
207    pub fn romanization(mut self, m: RomanizationMap) -> Self {
208        self.romanization = Some(m);
209        self
210    }
211
212    /// Attach an abbreviation map for pre-tokenisation expansion.
213    ///
214    /// When set, [`FtsTokenizer::segment_for_fts`] calls
215    /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
216    /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
217    /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
218    ///
219    /// Disabled by default — call this method to opt in.
220    ///
221    /// # Example
222    ///
223    /// ```rust
224    /// use kham_core::fts::FtsTokenizer;
225    /// use kham_core::abbrev::AbbrevMap;
226    /// use kham_core::stopwords::StopwordSet;
227    ///
228    /// let fts = FtsTokenizer::builder()
229    ///     .abbrevs(AbbrevMap::builtin())
230    ///     .stopwords(StopwordSet::from_text(""))
231    ///     .build();
232    /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
233    /// let tokens = fts.segment_for_fts("ก.ค.");
234    /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
235    /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
236    /// ```
237    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
238        self.abbrev_map = Some(m);
239        self
240    }
241
242    /// Enable or disable number normalization (default: `true`).
243    ///
244    /// When enabled:
245    /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
246    ///   ASCII digit string added to their [`FtsToken::synonyms`]
247    ///   (e.g. `๑๒๓` → synonym `"123"`).
248    /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
249    ///   words get their decimal value added to `synonyms`
250    ///   (e.g. `หนึ่งร้อย` → synonym `"100"`).
251    ///
252    /// This lets queries using either script match documents written in the
253    /// other. Set to `false` to opt out.
254    ///
255    /// # Example
256    ///
257    /// ```rust
258    /// use kham_core::fts::FtsTokenizer;
259    /// use kham_core::TokenKind;
260    ///
261    /// // Default (true): ๑๒๓ gets ASCII synonym "123"
262    /// let fts = FtsTokenizer::new();
263    /// let tokens = fts.segment_for_fts("๑๒๓");
264    /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
265    /// assert!(num.synonyms.contains(&String::from("123")));
266    ///
267    /// // Opt out: no conversion performed
268    /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
269    /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
270    /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
271    /// assert!(!num_off.synonyms.contains(&String::from("123")));
272    /// ```
273    pub fn number_normalize(mut self, v: bool) -> Self {
274        self.number_normalize = Some(v);
275        self
276    }
277
278    /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
279    ///
280    /// When set, each Thai and Named token whose text contains Thai consonants gets its
281    /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
282    /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
283    ///
284    /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
285    /// 4-character codes and are the recommended choices for FTS indexing.
286    /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
287    /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
288    ///
289    /// Disabled by default — call this method to opt in.
290    ///
291    /// # Example
292    ///
293    /// ```rust
294    /// use kham_core::fts::FtsTokenizer;
295    /// use kham_core::soundex::{lk82, SoundexAlgorithm};
296    /// use kham_core::stopwords::StopwordSet;
297    ///
298    /// let fts = FtsTokenizer::builder()
299    ///     .soundex(SoundexAlgorithm::Lk82)
300    ///     .stopwords(StopwordSet::from_text(""))
301    ///     .build();
302    /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
303    /// for word in &["กาน", "ขาน", "คาน"] {
304    ///     let tokens = fts.segment_for_fts(word);
305    ///     let t = tokens.first().unwrap();
306    ///     assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
307    /// }
308    /// ```
309    pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
310        self.soundex = Some(algo);
311        self
312    }
313
314    /// Overlay extra words on the built-in dictionary without a full trie rebuild.
315    ///
316    /// Words are stored in a sorted list alongside the pre-compiled trie.
317    /// Prefer this over a full rebuild when adding a small domain-specific
318    /// vocabulary (e.g. product names, technical terms).
319    ///
320    /// Newline-separated; `#` lines are ignored.
321    ///
322    /// # Example
323    ///
324    /// ```rust
325    /// use kham_core::fts::FtsTokenizer;
326    /// use kham_core::TokenKind;
327    ///
328    /// let fts = FtsTokenizer::builder()
329    ///     .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
330    ///     .build();
331    /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
332    /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
333    /// ```
334    pub fn dict_merge(mut self, words: &str) -> Self {
335        self.dict_merge = Some(String::from(words));
336        self
337    }
338
339    /// Consume the builder and return a configured [`FtsTokenizer`].
340    ///
341    /// # Example
342    ///
343    /// ```rust
344    /// use kham_core::fts::FtsTokenizer;
345    /// use kham_core::soundex::SoundexAlgorithm;
346    /// use kham_core::stopwords::StopwordSet;
347    ///
348    /// let fts = FtsTokenizer::builder()
349    ///     .soundex(SoundexAlgorithm::Lk82)
350    ///     .stopwords(StopwordSet::from_text(""))
351    ///     .build();
352    /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
353    /// ```
354    pub fn build(self) -> FtsTokenizer {
355        let tokenizer = if let Some(ref words) = self.dict_merge {
356            Tokenizer::builder().dict_merge(words).build()
357        } else {
358            Tokenizer::new()
359        };
360        FtsTokenizer {
361            tokenizer,
362            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
363            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
364            ngram_size: self.ngram_size.unwrap_or(3),
365            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
366            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
367            romanization: self.romanization,
368            abbrev_map: self.abbrev_map,
369            number_normalize: self.number_normalize.unwrap_or(true),
370            soundex: self.soundex,
371        }
372    }
373}
374
375/// Full-text search tokenizer for Thai text.
376///
377/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
378/// generation for out-of-vocabulary tokens.
379///
380/// Construct once and reuse:
381///
382/// ```rust
383/// use kham_core::fts::FtsTokenizer;
384///
385/// let fts = FtsTokenizer::new();
386/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
387/// assert!(!tokens.is_empty());
388/// ```
389pub struct FtsTokenizer {
390    tokenizer: Tokenizer,
391    stopwords: StopwordSet,
392    synonyms: SynonymMap,
393    ngram_size: usize,
394    pos_tagger: PosTagger,
395    ne_tagger: NeTagger,
396    romanization: Option<RomanizationMap>,
397    abbrev_map: Option<AbbrevMap>,
398    number_normalize: bool,
399    soundex: Option<SoundexAlgorithm>,
400}
401
402impl FtsTokenizer {
403    /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
404    ///
405    /// # Example
406    ///
407    /// ```rust
408    /// use kham_core::fts::FtsTokenizer;
409    ///
410    /// let fts = FtsTokenizer::new();
411    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
412    /// // Built-in stopword กับ is excluded; content words are present
413    /// assert!(!lexemes.contains(&String::from("กับ")));
414    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
415    /// ```
416    pub fn new() -> Self {
417        FtsTokenizerBuilder::default().build()
418    }
419
420    /// Return a [`FtsTokenizerBuilder`] for custom configuration.
421    ///
422    /// # Example
423    ///
424    /// ```rust
425    /// use kham_core::fts::FtsTokenizer;
426    /// use kham_core::soundex::SoundexAlgorithm;
427    /// use kham_core::synonym::SynonymMap;
428    ///
429    /// let fts = FtsTokenizer::builder()
430    ///     .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
431    ///     .soundex(SoundexAlgorithm::Lk82)
432    ///     .build();
433    /// assert!(!fts.segment_for_fts("รถ").is_empty());
434    /// ```
435    pub fn builder() -> FtsTokenizerBuilder {
436        FtsTokenizerBuilder::default()
437    }
438
439    /// Segment `text` and annotate each token for FTS indexing.
440    ///
441    /// Normalises the input text before segmentation so that สระลอย and stacked
442    /// tone marks are handled correctly. Whitespace tokens are excluded.
443    ///
444    /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
445    /// [`index_tokens`] instead when you only need the tokens to be indexed
446    /// (stopwords excluded).
447    ///
448    /// [`index_tokens`]: FtsTokenizer::index_tokens
449    ///
450    /// # Examples
451    ///
452    /// ```rust
453    /// use kham_core::fts::FtsTokenizer;
454    ///
455    /// let fts = FtsTokenizer::new();
456    /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
457    /// // Positions are 0-based and sequential across non-whitespace tokens
458    /// for (i, t) in tokens.iter().enumerate() {
459    ///     assert_eq!(t.position, i);
460    /// }
461    /// // กับ is a common conjunction — marked as a stopword
462    /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
463    /// assert!(kap.is_stop);
464    /// ```
465    ///
466    /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
467    ///
468    /// ```rust
469    /// use kham_core::fts::FtsTokenizer;
470    /// use kham_core::TokenKind;
471    ///
472    /// let fts = FtsTokenizer::new();
473    /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
474    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
475    /// ```
476    ///
477    /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
478    ///
479    /// ```rust
480    /// use kham_core::fts::FtsTokenizer;
481    /// use kham_core::soundex::SoundexAlgorithm;
482    ///
483    /// let fts = FtsTokenizer::builder()
484    ///     .soundex(SoundexAlgorithm::Lk82)
485    ///     .build();
486    /// let tokens = fts.segment_for_fts("กิน");
487    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
488    /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
489    /// assert!(!t.synonyms.is_empty());
490    /// ```
491    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
492        let normalized = self.tokenizer.normalize(text);
493        // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
494        // dot-containing patterns are replaced as single units.
495        let expanded = match self.abbrev_map.as_ref() {
496            Some(am) => am.expand_text(&normalized),
497            None => normalized,
498        };
499        let raw_tokens = self
500            .ne_tagger
501            .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
502
503        let mut result = Vec::with_capacity(raw_tokens.len());
504        let mut position = 0usize;
505
506        for token in &raw_tokens {
507            if token.kind == TokenKind::Whitespace {
508                continue;
509            }
510
511            let is_stop = self.stopwords.contains(token.text);
512            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
513            let mut synonyms = self
514                .synonyms
515                .expand(token.text)
516                .map(|s| s.to_vec())
517                .unwrap_or_default();
518            if is_thai_or_named {
519                if let Some(ref rom) = self.romanization {
520                    if let Some(rtgs) = rom.romanize(token.text) {
521                        synonyms.push(String::from(rtgs));
522                    }
523                }
524                if let Some(algo) = self.soundex {
525                    let code = soundex(token.text, algo);
526                    if !code.chars().all(|c| c == '0') {
527                        synonyms.push(code);
528                    }
529                }
530            }
531            if self.number_normalize {
532                match token.kind {
533                    // Number token with Thai digits → add ASCII form as synonym.
534                    TokenKind::Number => {
535                        let ascii = thai_digits_to_ascii(token.text);
536                        if ascii != token.text {
537                            synonyms.push(ascii);
538                        }
539                    }
540                    // Thai token that is a recognised number word → add decimal string.
541                    TokenKind::Thai => {
542                        if let Some(decimal) = thai_word_to_decimal(token.text) {
543                            synonyms.push(decimal);
544                        }
545                    }
546                    _ => {}
547                }
548            }
549            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
550                char_ngrams(token.text, self.ngram_size)
551                    .map(String::from)
552                    .collect()
553            } else {
554                Vec::new()
555            };
556            let ne = if let TokenKind::Named(k) = token.kind {
557                Some(k)
558            } else {
559                None
560            };
561            let pos = if token.kind == TokenKind::Thai {
562                self.pos_tagger.tag(token.text)
563            } else {
564                None
565            };
566
567            result.push(FtsToken {
568                text: String::from(token.text),
569                position,
570                kind: token.kind,
571                is_stop,
572                synonyms,
573                trigrams,
574                pos,
575                ne,
576            });
577
578            position += 1;
579        }
580
581        result
582    }
583
584    /// Return only the tokens to be written into a search index.
585    ///
586    /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
587    /// its original `position` so phrase-distance scoring remains correct.
588    ///
589    /// # Example
590    ///
591    /// ```rust
592    /// use kham_core::fts::FtsTokenizer;
593    ///
594    /// let fts = FtsTokenizer::new();
595    /// let tokens = fts.index_tokens("กินข้าวกับปลา");
596    /// // No stopwords in the index
597    /// assert!(tokens.iter().all(|t| !t.is_stop));
598    /// // Positions are preserved from the full sequence for phrase scoring
599    /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
600    /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
601    /// ```
602    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
603        self.segment_for_fts(text)
604            .into_iter()
605            .filter(|t| !t.is_stop)
606            .collect()
607    }
608
609    /// Collect all lexeme strings to be stored in a `tsvector`.
610    ///
611    /// Returns one string per non-stop token, plus synonym expansions and
612    /// trigrams for unknown tokens. Duplicates are not removed (the caller or
613    /// PostgreSQL handles deduplication).
614    ///
615    /// # Example
616    ///
617    /// ```rust
618    /// use kham_core::fts::FtsTokenizer;
619    ///
620    /// let fts = FtsTokenizer::new();
621    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
622    /// // Content words are present; stopword กับ is absent
623    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
624    /// assert!(!lexemes.contains(&String::from("กับ")));
625    /// ```
626    ///
627    /// With Thai digit normalization (enabled by default), both scripts match:
628    ///
629    /// ```rust
630    /// use kham_core::fts::FtsTokenizer;
631    ///
632    /// let fts = FtsTokenizer::new();
633    /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
634    /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
635    /// assert!(lexemes.contains(&String::from("100")));
636    /// ```
637    pub fn lexemes(&self, text: &str) -> Vec<String> {
638        let tokens = self.index_tokens(text);
639        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
640        for t in tokens {
641            out.push(t.text.clone());
642            out.extend(t.synonyms);
643            out.extend(t.trigrams);
644        }
645        out
646    }
647}
648
649impl Default for FtsTokenizer {
650    fn default() -> Self {
651        Self::new()
652    }
653}
654
655// ---------------------------------------------------------------------------
656// Tests
657// ---------------------------------------------------------------------------
658
659#[cfg(test)]
660mod tests {
661    use super::*;
662    use crate::stopwords::StopwordSet;
663    use crate::synonym::SynonymMap;
664
665    fn fts() -> FtsTokenizer {
666        FtsTokenizer::new()
667    }
668
669    // ── segment_for_fts ───────────────────────────────────────────────────────
670
#[test]
fn empty_input_returns_empty() {
    // No input text means no tokens at all.
    let tokens = fts().segment_for_fts("");
    assert!(tokens.is_empty());
}
675
#[test]
fn whitespace_tokens_excluded() {
    // The space between the two words must not surface as a token.
    for token in fts().segment_for_fts("กิน ข้าว") {
        assert!(token.kind != TokenKind::Whitespace);
    }
}
681
#[test]
fn positions_are_sequential() {
    // Each token's position must equal its index in the returned list.
    let tokens = fts().segment_for_fts("กินข้าวกับปลา");
    tokens.iter().enumerate().for_each(|(i, t)| {
        assert_eq!(t.position, i, "position mismatch at index {i}");
    });
}
689
#[test]
fn known_stopword_is_tagged() {
    // "กับ" is a common conjunction and should be in the built-in stopword list
    let tokens = fts().segment_for_fts("กินข้าวกับปลา");
    match tokens.iter().find(|t| t.text == "กับ") {
        Some(kap) => assert!(kap.is_stop, "'กับ' should be tagged as stopword"),
        None => panic!("expected 'กับ' token"),
    }
}
698
#[test]
fn content_words_not_tagged_as_stop() {
    // The word may be out-of-vocabulary, but none of its tokens is a stopword.
    for t in fts().segment_for_fts("โรงพยาบาล") {
        assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
    }
}
707
#[test]
fn text_is_reconstructable() {
    // Concatenating every token text must give back the normalised input
    // (the input has no whitespace, so nothing is dropped).
    let tokenizer = fts();
    let input = "กินข้าวกับปลา";
    let expected = tokenizer.tokenizer.normalize(input);

    let mut rebuilt = String::new();
    for token in tokenizer.segment_for_fts(input) {
        rebuilt.push_str(&token.text);
    }
    assert_eq!(rebuilt, expected);
}
718
719    // ── synonym expansion ─────────────────────────────────────────────────────
720
#[test]
fn synonym_expansion_attached() {
    let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
    // Overlay "คอม" on the dictionary so the segmenter emits it as one token
    // whether or not the built-in dictionary contains it.
    let fts = FtsTokenizer::builder()
        .dict_merge("คอม\n")
        .synonyms(synonyms)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = fts.segment_for_fts("คอม");
    // The token must exist: silently skipping the assertion when it is
    // missing would let this test pass without checking anything.
    let tok = tokens
        .iter()
        .find(|t| t.text == "คอม")
        .unwrap_or_else(|| panic!("expected a 'คอม' token, got {:?}", tokens));
    assert!(
        tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
        "expected synonym expansion, got {:?}",
        tok.synonyms
    );
}
740
#[test]
fn no_synonyms_when_map_empty() {
    // Default configuration on plain Thai words: nothing should add synonyms.
    fts()
        .segment_for_fts("กินข้าว")
        .iter()
        .for_each(|t| assert!(t.synonyms.is_empty()));
}
748
749    // ── unknown token trigrams ────────────────────────────────────────────────
750
#[test]
fn unknown_token_gets_trigrams() {
    // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
    // With ngram_size=2 the token should yield one bigram ("กิ").
    // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
    // (like "กิ") are the shortest unit that can produce n-grams.
    let fts = FtsTokenizer::builder()
        .stopwords(StopwordSet::from_text(""))
        .ngram_size(2)
        .build();
    let tokens = fts.segment_for_fts("กิ");

    let mut multi_char_unknown = tokens
        .iter()
        .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
        .peekable();
    assert!(
        multi_char_unknown.peek().is_some(),
        "expected at least one multi-char Unknown token for 'กิ'"
    );
    for u in multi_char_unknown {
        assert!(
            !u.trigrams.is_empty(),
            "unknown token '{}' ({} chars) should have bigrams",
            u.text,
            u.text.chars().count()
        );
    }
}
779
#[test]
fn known_thai_token_has_no_trigrams() {
    // Dictionary words are indexed as-is; n-grams are for unknown text only.
    let tokens = fts().segment_for_fts("กิน");
    for t in tokens.iter().filter(|t| t.kind == TokenKind::Thai) {
        assert!(
            t.trigrams.is_empty(),
            "known Thai token '{}' should not have trigrams",
            t.text
        );
    }
}
793
#[test]
fn ngram_size_zero_disables_trigrams() {
    // An n-gram size of 0 must leave `trigrams` empty on every token.
    let fts = FtsTokenizer::builder()
        .stopwords(StopwordSet::from_text(""))
        .ngram_size(0)
        .build();
    fts.segment_for_fts("กขคง")
        .iter()
        .for_each(|t| assert!(t.trigrams.is_empty()));
}
805
806    // ── index_tokens ──────────────────────────────────────────────────────────
807
#[test]
fn index_tokens_excludes_stopwords() {
    // Nothing flagged as a stopword may survive into the index token list.
    let indexed = fts().index_tokens("กินข้าวกับปลา");
    assert!(!indexed.iter().any(|t| t.is_stop));
}
813
#[test]
fn index_tokens_preserves_positions() {
    // Every position reported by index_tokens must also occur in the full
    // segment_for_fts output for the same text.
    let all_positions: Vec<usize> = fts()
        .segment_for_fts("กินข้าวกับปลา")
        .iter()
        .map(|t| t.position)
        .collect();
    for t in fts().index_tokens("กินข้าวกับปลา") {
        assert!(
            all_positions.contains(&t.position),
            "indexed token at position {} not found in full token list",
            t.position
        );
    }
}
827
828    // ── lexemes ───────────────────────────────────────────────────────────────
829
#[test]
fn lexemes_returns_non_stop_texts() {
    let lexemes = fts().lexemes("กินข้าวกับปลา");

    // "กับ" is a stopword — should not appear
    assert!(!lexemes.iter().any(|l| l == "กับ"));

    // Content words should appear
    let content_words = ["กิน", "ข้าว", "ปลา"];
    let has_content_word = lexemes
        .iter()
        .any(|l| content_words.contains(&l.as_str()));
    assert!(
        has_content_word,
        "expected content words in lexemes: {lexemes:?}"
    );
}
843
844    #[test]
845    fn lexemes_empty_input_is_empty() {
846        assert!(fts().lexemes("").is_empty());
847    }
848
849    // ── multi-token NE ────────────────────────────────────────────────────────
850
851    #[test]
852    fn multi_token_ne_merged_in_pipeline() {
853        // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
854        // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
855        let fts = FtsTokenizer::new();
856        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
857        let named: Vec<_> = tokens
858            .iter()
859            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
860            .collect();
861        assert!(
862            named.iter().any(|t| t.text == "กรุงเทพ"),
863            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
864            tokens
865                .iter()
866                .map(|t| (&t.text, &t.kind))
867                .collect::<alloc::vec::Vec<_>>()
868        );
869    }
870
871    #[test]
872    fn multi_token_ne_reconstructable() {
873        // Texts of all non-whitespace tokens must still reconstruct the normalized input.
874        let fts = FtsTokenizer::new();
875        let text = "ไปกรุงเทพ";
876        let normalized = fts.tokenizer.normalize(text);
877        let tokens = fts.segment_for_fts(text);
878        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
879        assert_eq!(rebuilt, normalized);
880    }
881
882    // ── builder ───────────────────────────────────────────────────────────────
883
884    #[test]
885    fn builder_custom_stopwords() {
886        let stops = StopwordSet::from_text("กิน\n");
887        let fts = FtsTokenizer::builder().stopwords(stops).build();
888        let tokens = fts.segment_for_fts("กินข้าว");
889        let gin = tokens.iter().find(|t| t.text == "กิน");
890        if let Some(t) = gin {
891            assert!(t.is_stop, "'กิน' should be stop with custom list");
892        }
893    }
894
895    #[test]
896    fn builder_default_equals_new() {
897        // Both paths should produce the same result for a simple input
898        let a = FtsTokenizer::new().lexemes("กินข้าว");
899        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
900        assert_eq!(a, b);
901    }
902
903    // ── number normalization ──────────────────────────────────────────────────
904
905    #[test]
906    fn thai_digit_token_gets_ascii_synonym() {
907        let fts = FtsTokenizer::new();
908        let tokens = fts.segment_for_fts("๑๒๓");
909        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
910        assert!(num.is_some(), "expected a Number token");
911        let t = num.unwrap();
912        assert!(
913            t.synonyms.contains(&String::from("123")),
914            "Thai digit token should have ASCII synonym, got {:?}",
915            t.synonyms
916        );
917    }
918
919    #[test]
920    fn ascii_digit_token_has_no_extra_synonym() {
921        // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
922        let fts = FtsTokenizer::new();
923        let tokens = fts.segment_for_fts("123");
924        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
925        assert!(num.is_some(), "expected a Number token");
926        assert!(
927            !num.unwrap().synonyms.contains(&String::from("123")),
928            "ASCII digit token should not duplicate itself as a synonym"
929        );
930    }
931
932    #[test]
933    fn thai_number_word_gets_decimal_synonym() {
934        // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
935        // on the dictionary. We check that at least one token carries "100" in synonyms.
936        let fts = FtsTokenizer::new();
937        let tokens = fts.segment_for_fts("หนึ่งร้อย");
938        let has_hundred = tokens
939            .iter()
940            .any(|t| t.synonyms.contains(&String::from("100")));
941        // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
942        assert!(
943            has_hundred,
944            "expected a token with decimal synonym '100', tokens: {:?}",
945            tokens
946                .iter()
947                .map(|t| (&t.text, &t.synonyms))
948                .collect::<alloc::vec::Vec<_>>()
949        );
950    }
951
952    #[test]
953    fn number_normalize_false_disables_conversion() {
954        let fts = FtsTokenizer::builder()
955            .number_normalize(false)
956            .stopwords(StopwordSet::from_text(""))
957            .build();
958        let tokens = fts.segment_for_fts("๑๒๓");
959        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
960        assert!(num.is_some());
961        assert!(
962            !num.unwrap().synonyms.contains(&String::from("123")),
963            "number_normalize=false should suppress ASCII synonym"
964        );
965    }
966
967    #[test]
968    fn mixed_thai_digit_in_context() {
969        // "ธนาคาร๑๐๐แห่ง" — the ๑๐๐ part should be a Number token with synonym "100"
970        let fts = FtsTokenizer::new();
971        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
972        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
973        assert!(num.is_some(), "expected Number token in mixed string");
974        assert!(
975            num.unwrap().synonyms.contains(&String::from("100")),
976            "expected ASCII synonym '100' for ๑๐๐"
977        );
978    }
979
980    // ── abbreviation expansion ────────────────────────────────────────────────
981
982    #[test]
983    fn abbrev_map_expands_before_segmentation() {
984        use crate::abbrev::AbbrevMap;
985        let fts = FtsTokenizer::builder()
986            .abbrevs(AbbrevMap::builtin())
987            .stopwords(StopwordSet::from_text(""))
988            .build();
989        // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
990        // expansion further (กรกฎา + คม) — what matters is that dots are gone
991        // and the Thai characters of กรกฎาคม are present.
992        let tokens = fts.segment_for_fts("ก.ค.");
993        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
994        let joined: String = texts.concat();
995        assert!(
996            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
997            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
998        );
999        assert!(
1000            !texts.contains(&"."),
1001            "dots should be consumed by abbrev expansion, got: {texts:?}"
1002        );
1003    }
1004
1005    #[test]
1006    fn abbrev_expansion_disabled_by_default() {
1007        // FtsTokenizer::new() has no abbrev_map — ก.ค. stays as individual tokens.
1008        let fts = FtsTokenizer::new();
1009        let tokens = fts.segment_for_fts("ก.ค.");
1010        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1011        // Without expansion the dot(s) must still be present as punctuation tokens.
1012        assert!(
1013            texts.contains(&"."),
1014            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
1015        );
1016    }
1017
1018    // ── soundex synonyms ──────────────────────────────────────────────────────
1019
1020    #[test]
1021    fn soundex_lk82_appended_to_thai_synonyms() {
1022        use crate::soundex::lk82;
1023        let fts = FtsTokenizer::builder()
1024            .soundex(SoundexAlgorithm::Lk82)
1025            .stopwords(StopwordSet::from_text(""))
1026            .build();
1027        let tokens = fts.segment_for_fts("กิน");
1028        let t = tokens.iter().find(|t| t.text == "กิน");
1029        assert!(t.is_some(), "expected token 'กิน'");
1030        let expected_code = lk82("กิน");
1031        assert!(
1032            t.unwrap().synonyms.contains(&expected_code),
1033            "expected lk82 code '{expected_code}' in synonyms, got {:?}",
1034            t.unwrap().synonyms
1035        );
1036    }
1037
1038    #[test]
1039    fn soundex_not_emitted_by_default() {
1040        // Without .soundex() in the builder, no soundex codes should appear.
1041        let fts = FtsTokenizer::new();
1042        let tokens = fts.segment_for_fts("กินข้าว");
1043        for t in &tokens {
1044            // A soundex code is 4 ASCII chars (lk82/udom83); no synonym should look like one.
1045            for syn in &t.synonyms {
1046                let looks_like_soundex =
1047                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1048                assert!(
1049                    !looks_like_soundex,
1050                    "unexpected soundex-like synonym '{}' on token '{}'",
1051                    syn, t.text
1052                );
1053            }
1054        }
1055    }
1056
1057    #[test]
1058    fn soundex_same_sounding_words_share_code_in_index() {
1059        // กาน and ขาน share lk82 code "1600"; both should carry it as a synonym.
1060        use crate::soundex::lk82;
1061        let fts = FtsTokenizer::builder()
1062            .soundex(SoundexAlgorithm::Lk82)
1063            .stopwords(StopwordSet::from_text(""))
1064            .build();
1065        let code = lk82("กาน");
1066        for word in &["กาน", "ขาน", "คาน"] {
1067            let tokens = fts.segment_for_fts(word);
1068            let t = tokens.first().expect("expected at least one token");
1069            assert!(
1070                t.synonyms.contains(&code),
1071                "'{word}' should carry lk82 code '{code}', got {:?}",
1072                t.synonyms
1073            );
1074        }
1075    }
1076
1077    #[test]
1078    fn soundex_not_emitted_for_non_thai_tokens() {
1079        let fts = FtsTokenizer::builder()
1080            .soundex(SoundexAlgorithm::Lk82)
1081            .stopwords(StopwordSet::from_text(""))
1082            .build();
1083        let tokens = fts.segment_for_fts("hello 123");
1084        for t in &tokens {
1085            for syn in &t.synonyms {
1086                let looks_like_soundex =
1087                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1088                assert!(
1089                    !looks_like_soundex,
1090                    "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
1091                    t.text
1092                );
1093            }
1094        }
1095    }
1096
1097    #[test]
1098    fn soundex_udom83_appended() {
1099        use crate::soundex::udom83;
1100        let fts = FtsTokenizer::builder()
1101            .soundex(SoundexAlgorithm::Udom83)
1102            .stopwords(StopwordSet::from_text(""))
1103            .build();
1104        let tokens = fts.segment_for_fts("กิน");
1105        let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
1106        let expected = udom83("กิน");
1107        assert!(
1108            t.synonyms.contains(&expected),
1109            "expected udom83 code '{expected}' in synonyms, got {:?}",
1110            t.synonyms
1111        );
1112    }
1113
1114    #[test]
1115    fn abbrev_expansion_date_sentence() {
1116        use crate::abbrev::AbbrevMap;
1117        let fts = FtsTokenizer::builder()
1118            .abbrevs(AbbrevMap::builtin())
1119            .stopwords(StopwordSet::from_text(""))
1120            .build();
1121        // พ.ศ. → พุทธศักราช; the segmenter may split it further — verify the
1122        // chars are present and dots are gone.
1123        let tokens = fts.segment_for_fts("พ.ศ.2567");
1124        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1125        let joined: String = texts.concat();
1126        assert!(
1127            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
1128            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
1129        );
1130        assert!(
1131            !texts.contains(&"."),
1132            "dots should be consumed by expansion, got: {texts:?}"
1133        );
1134    }
1135}
kham_core/fts.rs

kham_core/
fts.rs