Skip to main content

kham_core/
fts.rs

1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A streaming iterator over [`FtsToken`]s produced by the FTS pipeline.
43///
44/// Returned by [`FtsTokenizer::segment_stream`]. Internally holds the full
45/// `Vec<FtsToken>` as an [`alloc::vec::IntoIter`]; the streaming API is provided
46/// so callers can consume tokens one at a time without materialising a second
47/// collection.
48///
49/// # Example
50///
51/// ```rust
52/// use kham_core::fts::FtsTokenizer;
53///
54/// let fts = FtsTokenizer::new();
55/// let mut stream = fts.segment_stream("กินข้าวกับปลา");
56/// // next_index_token() skips stopwords — กับ is a stopword and is skipped.
57/// while let Some(tok) = stream.next_index_token() {
58///     println!("{} pos={}", tok.text, tok.position);
59/// }
60/// ```
61pub struct FtsTokenStream {
62    inner: alloc::vec::IntoIter<FtsToken>,
63}
64
65impl FtsTokenStream {
66    /// Advance to the next token that should be written into the search index,
67    /// skipping stopwords.
68    ///
69    /// Equivalent to calling [`Iterator::next`] in a loop until a token with
70    /// `is_stop == false` is found, or the stream is exhausted.
71    pub fn next_index_token(&mut self) -> Option<FtsToken> {
72        self.inner.by_ref().find(|t| !t.is_stop)
73    }
74}
75
76impl Iterator for FtsTokenStream {
77    type Item = FtsToken;
78
79    #[inline]
80    fn next(&mut self) -> Option<FtsToken> {
81        self.inner.next()
82    }
83
84    #[inline]
85    fn size_hint(&self) -> (usize, Option<usize>) {
86        self.inner.size_hint()
87    }
88}
89
90/// A token produced by the FTS pipeline, ready for lexeme indexing.
91#[derive(Debug, Clone, PartialEq)]
92pub struct FtsToken {
93    /// The token text (owned; may be normalised).
94    pub text: String,
95    /// Ordinal position in the token sequence (0-based, gaps for whitespace).
96    pub position: usize,
97    /// Script / category of the original token.
98    pub kind: TokenKind,
99    /// `true` if this token matches the stopword list.
100    pub is_stop: bool,
101    /// Synonym expansions (empty if none configured or no match).
102    pub synonyms: Vec<String>,
103    /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
104    pub trigrams: Vec<String>,
105    /// Primary part-of-speech tag from the lookup table, or `None` if the word
106    /// is not in the table (OOV) or is not a Thai token.
107    pub pos: Option<PosTag>,
108    /// Named entity category, or `None` if the token is not in the NE
109    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
110    pub ne: Option<NamedEntityKind>,
111    /// Segmentation confidence in the range `[0.0, 1.0]`.
112    /// `0.0` = Unknown token (no dictionary evidence).
113    /// `1.0` = unambiguous high-frequency dictionary match.
114    pub confidence: f32,
115}
116
117/// Builder for [`FtsTokenizer`].
118#[derive(Default)]
119pub struct FtsTokenizerBuilder {
120    stopwords: Option<StopwordSet>,
121    synonyms: Option<SynonymMap>,
122    ngram_size: Option<usize>,
123    pos_tagger: Option<PosTagger>,
124    ne_tagger: Option<NeTagger>,
125    romanization: Option<RomanizationMap>,
126    abbrev_map: Option<AbbrevMap>,
127    /// `None` means "use default (true)".
128    number_normalize: Option<bool>,
129    soundex: Option<SoundexAlgorithm>,
130    /// Extra words to overlay on top of the built-in dictionary (fast path).
131    dict_merge: Option<String>,
132}
133
134impl FtsTokenizerBuilder {
135    /// Use a custom stopword set instead of the built-in list.
136    ///
137    /// # Example
138    ///
139    /// ```rust
140    /// use kham_core::fts::FtsTokenizer;
141    /// use kham_core::stopwords::StopwordSet;
142    ///
143    /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
144    /// let fts = FtsTokenizer::builder().stopwords(stops).build();
145    /// let tokens = fts.segment_for_fts("กินข้าว");
146    /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
147    /// ```
148    pub fn stopwords(mut self, s: StopwordSet) -> Self {
149        self.stopwords = Some(s);
150        self
151    }
152
153    /// Attach a synonym map for expansion.
154    ///
155    /// # Example
156    ///
157    /// ```rust
158    /// use kham_core::fts::FtsTokenizer;
159    /// use kham_core::synonym::SynonymMap;
160    ///
161    /// // TSV: canonical TAB synonym1 TAB synonym2 …
162    /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
163    /// let fts = FtsTokenizer::builder().synonyms(syns).build();
164    /// let tokens = fts.segment_for_fts("รถ");
165    /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
166    /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
167    /// ```
168    pub fn synonyms(mut self, m: SynonymMap) -> Self {
169        self.synonyms = Some(m);
170        self
171    }
172
173    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
174    ///
175    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
176    ///
177    /// # Example
178    ///
179    /// ```rust
180    /// use kham_core::fts::FtsTokenizer;
181    /// use kham_core::stopwords::StopwordSet;
182    ///
183    /// // Disable n-grams entirely — useful when index size must be small
184    /// let fts = FtsTokenizer::builder()
185    ///     .ngram_size(0)
186    ///     .stopwords(StopwordSet::from_text(""))
187    ///     .build();
188    /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
189    /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
190    /// ```
191    pub fn ngram_size(mut self, n: usize) -> Self {
192        self.ngram_size = Some(n);
193        self
194    }
195
196    /// Use a custom POS tagger instead of the built-in table.
197    ///
198    /// # Example
199    ///
200    /// ```rust
201    /// use kham_core::fts::FtsTokenizer;
202    /// use kham_core::pos::{PosTag, PosTagger};
203    ///
204    /// // Custom TSV: word TAB POS_TAG
205    /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
206    /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
207    /// // Segment กิน alone so it is not merged into a compound
208    /// let tokens = fts.segment_for_fts("กิน");
209    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
210    /// assert_eq!(t.pos, Some(PosTag::Verb));
211    /// ```
212    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
213        self.pos_tagger = Some(t);
214        self
215    }
216
217    /// Use a custom NE gazetteer instead of the built-in table.
218    ///
219    /// # Example
220    ///
221    /// ```rust
222    /// use kham_core::fts::FtsTokenizer;
223    /// use kham_core::ne::NeTagger;
224    /// use kham_core::TokenKind;
225    ///
226    /// // Domain-specific NE list: word TAB NE_TAG
227    /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
228    /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
229    /// let tokens = fts.segment_for_fts("เซเรน่า");
230    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
231    /// ```
232    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
233        self.ne_tagger = Some(t);
234        self
235    }
236
237    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
238    ///
239    /// When set, each Thai and Named token whose text is found in the map gets its
240    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
241    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
242    ///
243    /// Disabled by default — call this method to opt in.
244    ///
245    /// # Example
246    ///
247    /// ```rust
248    /// use kham_core::fts::FtsTokenizer;
249    /// use kham_core::romanizer::RomanizationMap;
250    ///
251    /// // TSV: Thai word TAB RTGS romanization
252    /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
253    /// let fts = FtsTokenizer::builder().romanization(rom).build();
254    /// let tokens = fts.segment_for_fts("กิน");
255    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
256    /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
257    /// assert!(t.synonyms.contains(&String::from("kin")));
258    /// ```
259    pub fn romanization(mut self, m: RomanizationMap) -> Self {
260        self.romanization = Some(m);
261        self
262    }
263
264    /// Attach an abbreviation map for pre-tokenisation expansion.
265    ///
266    /// When set, [`FtsTokenizer::segment_for_fts`] calls
267    /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
268    /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
269    /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
270    ///
271    /// Disabled by default — call this method to opt in.
272    ///
273    /// # Example
274    ///
275    /// ```rust
276    /// use kham_core::fts::FtsTokenizer;
277    /// use kham_core::abbrev::AbbrevMap;
278    /// use kham_core::stopwords::StopwordSet;
279    ///
280    /// let fts = FtsTokenizer::builder()
281    ///     .abbrevs(AbbrevMap::builtin())
282    ///     .stopwords(StopwordSet::from_text(""))
283    ///     .build();
284    /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
285    /// let tokens = fts.segment_for_fts("ก.ค.");
286    /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
287    /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
288    /// ```
289    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
290        self.abbrev_map = Some(m);
291        self
292    }
293
294    /// Enable or disable number normalization (default: `true`).
295    ///
296    /// When enabled:
297    /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
298    ///   ASCII digit string added to their [`FtsToken::synonyms`]
299    ///   (e.g. `๑๒๓` → synonym `"123"`).
300    /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
301    ///   words get their decimal value added to `synonyms`
302    ///   (e.g. `หนึ่งร้อย` → synonym `"100"`).
303    ///
304    /// This lets queries using either script match documents written in the
305    /// other. Set to `false` to opt out.
306    ///
307    /// # Example
308    ///
309    /// ```rust
310    /// use kham_core::fts::FtsTokenizer;
311    /// use kham_core::TokenKind;
312    ///
313    /// // Default (true): ๑๒๓ gets ASCII synonym "123"
314    /// let fts = FtsTokenizer::new();
315    /// let tokens = fts.segment_for_fts("๑๒๓");
316    /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
317    /// assert!(num.synonyms.contains(&String::from("123")));
318    ///
319    /// // Opt out: no conversion performed
320    /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
321    /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
322    /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
323    /// assert!(!num_off.synonyms.contains(&String::from("123")));
324    /// ```
325    pub fn number_normalize(mut self, v: bool) -> Self {
326        self.number_normalize = Some(v);
327        self
328    }
329
330    /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
331    ///
332    /// When set, each Thai and Named token whose text contains Thai consonants gets its
333    /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
334    /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
335    ///
336    /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
337    /// 4-character codes and are the recommended choices for FTS indexing.
338    /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
339    /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
340    ///
341    /// Disabled by default — call this method to opt in.
342    ///
343    /// # Example
344    ///
345    /// ```rust
346    /// use kham_core::fts::FtsTokenizer;
347    /// use kham_core::soundex::{lk82, SoundexAlgorithm};
348    /// use kham_core::stopwords::StopwordSet;
349    ///
350    /// let fts = FtsTokenizer::builder()
351    ///     .soundex(SoundexAlgorithm::Lk82)
352    ///     .stopwords(StopwordSet::from_text(""))
353    ///     .build();
354    /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
355    /// for word in &["กาน", "ขาน", "คาน"] {
356    ///     let tokens = fts.segment_for_fts(word);
357    ///     let t = tokens.first().unwrap();
358    ///     assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
359    /// }
360    /// ```
361    pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
362        self.soundex = Some(algo);
363        self
364    }
365
366    /// Overlay extra words on the built-in dictionary without a full trie rebuild.
367    ///
368    /// Words are stored in a sorted list alongside the pre-compiled trie.
369    /// Prefer this over a full rebuild when adding a small domain-specific
370    /// vocabulary (e.g. product names, technical terms).
371    ///
372    /// Newline-separated; `#` lines are ignored.
373    ///
374    /// # Example
375    ///
376    /// ```rust
377    /// use kham_core::fts::FtsTokenizer;
378    /// use kham_core::TokenKind;
379    ///
380    /// let fts = FtsTokenizer::builder()
381    ///     .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
382    ///     .build();
383    /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
384    /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
385    /// ```
386    pub fn dict_merge(mut self, words: &str) -> Self {
387        self.dict_merge = Some(String::from(words));
388        self
389    }
390
391    /// Consume the builder and return a configured [`FtsTokenizer`].
392    ///
393    /// # Example
394    ///
395    /// ```rust
396    /// use kham_core::fts::FtsTokenizer;
397    /// use kham_core::soundex::SoundexAlgorithm;
398    /// use kham_core::stopwords::StopwordSet;
399    ///
400    /// let fts = FtsTokenizer::builder()
401    ///     .soundex(SoundexAlgorithm::Lk82)
402    ///     .stopwords(StopwordSet::from_text(""))
403    ///     .build();
404    /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
405    /// ```
406    pub fn build(self) -> FtsTokenizer {
407        let tokenizer = if let Some(ref words) = self.dict_merge {
408            Tokenizer::builder().dict_merge(words).build()
409        } else {
410            Tokenizer::new()
411        };
412        FtsTokenizer {
413            tokenizer,
414            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
415            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
416            ngram_size: self.ngram_size.unwrap_or(3),
417            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
418            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
419            romanization: self.romanization,
420            abbrev_map: self.abbrev_map,
421            number_normalize: self.number_normalize.unwrap_or(true),
422            soundex: self.soundex,
423        }
424    }
425}
426
427/// Full-text search tokenizer for Thai text.
428///
429/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
430/// generation for out-of-vocabulary tokens.
431///
432/// Construct once and reuse:
433///
434/// ```rust
435/// use kham_core::fts::FtsTokenizer;
436///
437/// let fts = FtsTokenizer::new();
438/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
439/// assert!(!tokens.is_empty());
440/// ```
441pub struct FtsTokenizer {
442    tokenizer: Tokenizer,
443    stopwords: StopwordSet,
444    synonyms: SynonymMap,
445    ngram_size: usize,
446    pos_tagger: PosTagger,
447    ne_tagger: NeTagger,
448    romanization: Option<RomanizationMap>,
449    abbrev_map: Option<AbbrevMap>,
450    number_normalize: bool,
451    soundex: Option<SoundexAlgorithm>,
452}
453
454impl FtsTokenizer {
455    /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
456    ///
457    /// # Example
458    ///
459    /// ```rust
460    /// use kham_core::fts::FtsTokenizer;
461    ///
462    /// let fts = FtsTokenizer::new();
463    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
464    /// // Built-in stopword กับ is excluded; content words are present
465    /// assert!(!lexemes.contains(&String::from("กับ")));
466    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
467    /// ```
468    pub fn new() -> Self {
469        FtsTokenizerBuilder::default().build()
470    }
471
472    /// Return a [`FtsTokenizerBuilder`] for custom configuration.
473    ///
474    /// # Example
475    ///
476    /// ```rust
477    /// use kham_core::fts::FtsTokenizer;
478    /// use kham_core::soundex::SoundexAlgorithm;
479    /// use kham_core::synonym::SynonymMap;
480    ///
481    /// let fts = FtsTokenizer::builder()
482    ///     .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
483    ///     .soundex(SoundexAlgorithm::Lk82)
484    ///     .build();
485    /// assert!(!fts.segment_for_fts("รถ").is_empty());
486    /// ```
487    pub fn builder() -> FtsTokenizerBuilder {
488        FtsTokenizerBuilder::default()
489    }
490
491    /// Segment `text` and annotate each token for FTS indexing.
492    ///
493    /// Normalises the input text before segmentation so that สระลอย and stacked
494    /// tone marks are handled correctly. Whitespace tokens are excluded.
495    ///
496    /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
497    /// [`index_tokens`] instead when you only need the tokens to be indexed
498    /// (stopwords excluded).
499    ///
500    /// [`index_tokens`]: FtsTokenizer::index_tokens
501    ///
502    /// # Examples
503    ///
504    /// ```rust
505    /// use kham_core::fts::FtsTokenizer;
506    ///
507    /// let fts = FtsTokenizer::new();
508    /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
509    /// // Positions are 0-based and sequential across non-whitespace tokens
510    /// for (i, t) in tokens.iter().enumerate() {
511    ///     assert_eq!(t.position, i);
512    /// }
513    /// // กับ is a common conjunction — marked as a stopword
514    /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
515    /// assert!(kap.is_stop);
516    /// ```
517    ///
518    /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
519    ///
520    /// ```rust
521    /// use kham_core::fts::FtsTokenizer;
522    /// use kham_core::TokenKind;
523    ///
524    /// let fts = FtsTokenizer::new();
525    /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
526    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
527    /// ```
528    ///
529    /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
530    ///
531    /// ```rust
532    /// use kham_core::fts::FtsTokenizer;
533    /// use kham_core::soundex::SoundexAlgorithm;
534    ///
535    /// let fts = FtsTokenizer::builder()
536    ///     .soundex(SoundexAlgorithm::Lk82)
537    ///     .build();
538    /// let tokens = fts.segment_for_fts("กิน");
539    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
540    /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
541    /// assert!(!t.synonyms.is_empty());
542    /// ```
543    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
544        let normalized = self.tokenizer.normalize(text);
545        // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
546        // dot-containing patterns are replaced as single units.
547        let expanded = match self.abbrev_map.as_ref() {
548            Some(am) => am.expand_text(&normalized),
549            None => normalized,
550        };
551        let raw_tokens = self
552            .ne_tagger
553            .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
554
555        let mut result = Vec::with_capacity(raw_tokens.len());
556        let mut position = 0usize;
557
558        for token in &raw_tokens {
559            if token.kind == TokenKind::Whitespace {
560                continue;
561            }
562
563            let is_stop = self.stopwords.contains(token.text);
564            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
565            let mut synonyms = self
566                .synonyms
567                .expand(token.text)
568                .map(|s| s.to_vec())
569                .unwrap_or_default();
570            if is_thai_or_named {
571                if let Some(ref rom) = self.romanization {
572                    if let Some(rtgs) = rom.romanize(token.text) {
573                        synonyms.push(String::from(rtgs));
574                    }
575                }
576                if let Some(algo) = self.soundex {
577                    let code = soundex(token.text, algo);
578                    if !code.chars().all(|c| c == '0') {
579                        synonyms.push(code);
580                    }
581                }
582            }
583            if self.number_normalize {
584                match token.kind {
585                    // Number token with Thai digits → add ASCII form as synonym.
586                    TokenKind::Number => {
587                        let ascii = thai_digits_to_ascii(token.text);
588                        if ascii != token.text {
589                            synonyms.push(ascii);
590                        }
591                    }
592                    // Thai token that is a recognised number word → add decimal string.
593                    TokenKind::Thai => {
594                        if let Some(decimal) = thai_word_to_decimal(token.text) {
595                            synonyms.push(decimal);
596                        }
597                    }
598                    _ => {}
599                }
600            }
601            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
602                char_ngrams(token.text, self.ngram_size)
603                    .map(String::from)
604                    .collect()
605            } else {
606                Vec::new()
607            };
608            let ne = if let TokenKind::Named(k) = token.kind {
609                Some(k)
610            } else {
611                None
612            };
613            let pos = if token.kind == TokenKind::Thai {
614                self.pos_tagger.tag(token.text)
615            } else {
616                None
617            };
618
619            result.push(FtsToken {
620                text: String::from(token.text),
621                position,
622                kind: token.kind,
623                is_stop,
624                synonyms,
625                trigrams,
626                pos,
627                ne,
628                confidence: token.confidence,
629            });
630
631            position += 1;
632        }
633
634        result
635    }
636
637    /// Return only the tokens to be written into a search index.
638    ///
639    /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
640    /// its original `position` so phrase-distance scoring remains correct.
641    ///
642    /// # Example
643    ///
644    /// ```rust
645    /// use kham_core::fts::FtsTokenizer;
646    ///
647    /// let fts = FtsTokenizer::new();
648    /// let tokens = fts.index_tokens("กินข้าวกับปลา");
649    /// // No stopwords in the index
650    /// assert!(tokens.iter().all(|t| !t.is_stop));
651    /// // Positions are preserved from the full sequence for phrase scoring
652    /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
653    /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
654    /// ```
655    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
656        self.segment_for_fts(text)
657            .into_iter()
658            .filter(|t| !t.is_stop)
659            .collect()
660    }
661
662    /// Return a streaming iterator over the FTS tokens for `text`.
663    ///
664    /// Equivalent to [`segment_for_fts`] but wraps the result in an
665    /// [`FtsTokenStream`] so callers can consume tokens one at a time.
666    /// Use [`FtsTokenStream::next_index_token`] to skip stopwords automatically.
667    ///
668    /// The full token list is materialised internally because the NE tagger
669    /// requires multi-token context; this is a streaming *consumer*, not a
670    /// lazy producer.
671    ///
672    /// # Example
673    ///
674    /// ```rust
675    /// use kham_core::fts::FtsTokenizer;
676    ///
677    /// let fts = FtsTokenizer::new();
678    /// let mut stream = fts.segment_stream("กินข้าวกับปลา");
679    /// let mut index_texts: Vec<String> = Vec::new();
680    /// while let Some(tok) = stream.next_index_token() {
681    ///     index_texts.push(tok.text);
682    /// }
683    /// // กับ is a stopword — it should not appear in index_texts
684    /// assert!(!index_texts.contains(&String::from("กับ")));
685    /// assert!(index_texts.iter().any(|t| t == "กิน" || t == "ปลา"));
686    /// ```
687    ///
688    /// [`segment_for_fts`]: FtsTokenizer::segment_for_fts
689    pub fn segment_stream(&self, text: &str) -> FtsTokenStream {
690        FtsTokenStream {
691            inner: self.segment_for_fts(text).into_iter(),
692        }
693    }
694
695    /// Collect all lexeme strings to be stored in a `tsvector`.
696    ///
697    /// Returns one string per non-stop token, plus synonym expansions and
698    /// trigrams for unknown tokens. Duplicates are not removed (the caller or
699    /// PostgreSQL handles deduplication).
700    ///
701    /// # Example
702    ///
703    /// ```rust
704    /// use kham_core::fts::FtsTokenizer;
705    ///
706    /// let fts = FtsTokenizer::new();
707    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
708    /// // Content words are present; stopword กับ is absent
709    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
710    /// assert!(!lexemes.contains(&String::from("กับ")));
711    /// ```
712    ///
713    /// With Thai digit normalization (enabled by default), both scripts match:
714    ///
715    /// ```rust
716    /// use kham_core::fts::FtsTokenizer;
717    ///
718    /// let fts = FtsTokenizer::new();
719    /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
720    /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
721    /// assert!(lexemes.contains(&String::from("100")));
722    /// ```
723    pub fn lexemes(&self, text: &str) -> Vec<String> {
724        let tokens = self.index_tokens(text);
725        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
726        for t in tokens {
727            out.push(t.text.clone());
728            out.extend(t.synonyms);
729            out.extend(t.trigrams);
730        }
731        out
732    }
733}
734
735impl Default for FtsTokenizer {
736    fn default() -> Self {
737        Self::new()
738    }
739}
740
741// ---------------------------------------------------------------------------
742// Tests
743// ---------------------------------------------------------------------------
744
745#[cfg(test)]
746mod tests {
747    use super::*;
748    use crate::stopwords::StopwordSet;
749    use crate::synonym::SynonymMap;
750
751    fn fts() -> FtsTokenizer {
752        FtsTokenizer::new()
753    }
754
755    // ── segment_for_fts ───────────────────────────────────────────────────────
756
757    #[test]
758    fn empty_input_returns_empty() {
759        assert!(fts().segment_for_fts("").is_empty());
760    }
761
762    #[test]
763    fn whitespace_tokens_excluded() {
764        let tokens = fts().segment_for_fts("กิน ข้าว");
765        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
766    }
767
768    #[test]
769    fn positions_are_sequential() {
770        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
771        for (i, t) in tokens.iter().enumerate() {
772            assert_eq!(t.position, i, "position mismatch at index {i}");
773        }
774    }
775
776    #[test]
777    fn known_stopword_is_tagged() {
778        // "กับ" is a common conjunction and should be in the built-in stopword list
779        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
780        let kap = tokens.iter().find(|t| t.text == "กับ");
781        assert!(kap.is_some(), "expected 'กับ' token");
782        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
783    }
784
785    #[test]
786    fn content_words_not_tagged_as_stop() {
787        let tokens = fts().segment_for_fts("โรงพยาบาล");
788        // May be OOV but should not be a stopword
789        for t in &tokens {
790            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
791        }
792    }
793
794    #[test]
795    fn text_is_reconstructable() {
796        // All tokens joined == normalised input (whitespace dropped)
797        let fts = fts();
798        let text = "กินข้าวกับปลา";
799        let normalized = fts.tokenizer.normalize(text);
800        let tokens = fts.segment_for_fts(text);
801        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
802        assert_eq!(rebuilt, normalized);
803    }
804
805    // ── synonym expansion ─────────────────────────────────────────────────────
806
807    #[test]
808    fn synonym_expansion_attached() {
809        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
810        let fts = FtsTokenizer::builder()
811            .synonyms(synonyms)
812            .stopwords(StopwordSet::from_text(""))
813            .build();
814        // Segment a text containing "คอม" — need it in dict or it lands as Unknown
815        // Use builder with custom word so the segmenter recognises it
816        let tokens = fts.segment_for_fts("คอม");
817        let t = tokens.iter().find(|t| t.text == "คอม");
818        if let Some(tok) = t {
819            assert!(
820                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
821                "expected synonym expansion, got {:?}",
822                tok.synonyms
823            );
824        }
825    }
826
827    #[test]
828    fn no_synonyms_when_map_empty() {
829        let tokens = fts().segment_for_fts("กินข้าว");
830        for t in &tokens {
831            assert!(t.synonyms.is_empty());
832        }
833    }
834
835    // ── unknown token trigrams ────────────────────────────────────────────────
836
837    #[test]
838    fn unknown_token_gets_trigrams() {
839        // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
840        // With ngram_size=2 the token should yield one bigram ("กิ").
841        // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
842        // (like "กิ") are the shortest unit that can produce n-grams.
843        let fts = FtsTokenizer::builder()
844            .ngram_size(2)
845            .stopwords(StopwordSet::from_text(""))
846            .build();
847        let tokens = fts.segment_for_fts("กิ");
848        let unknown: Vec<_> = tokens
849            .iter()
850            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
851            .collect();
852        assert!(
853            !unknown.is_empty(),
854            "expected at least one multi-char Unknown token for 'กิ'"
855        );
856        for u in &unknown {
857            assert!(
858                !u.trigrams.is_empty(),
859                "unknown token '{}' ({} chars) should have bigrams",
860                u.text,
861                u.text.chars().count()
862            );
863        }
864    }
865
866    #[test]
867    fn known_thai_token_has_no_trigrams() {
868        let tokens = fts().segment_for_fts("กิน");
869        for t in &tokens {
870            if t.kind == TokenKind::Thai {
871                assert!(
872                    t.trigrams.is_empty(),
873                    "known Thai token '{}' should not have trigrams",
874                    t.text
875                );
876            }
877        }
878    }
879
880    #[test]
881    fn ngram_size_zero_disables_trigrams() {
882        let fts = FtsTokenizer::builder()
883            .ngram_size(0)
884            .stopwords(StopwordSet::from_text(""))
885            .build();
886        let tokens = fts.segment_for_fts("กขคง");
887        for t in &tokens {
888            assert!(t.trigrams.is_empty());
889        }
890    }
891
892    // ── index_tokens ──────────────────────────────────────────────────────────
893
894    #[test]
895    fn index_tokens_excludes_stopwords() {
896        let tokens = fts().index_tokens("กินข้าวกับปลา");
897        assert!(tokens.iter().all(|t| !t.is_stop));
898    }
899
900    #[test]
901    fn index_tokens_preserves_positions() {
902        // Positions in index_tokens must be a subset of segment_for_fts positions
903        let all = fts().segment_for_fts("กินข้าวกับปลา");
904        let indexed = fts().index_tokens("กินข้าวกับปลา");
905        for t in &indexed {
906            assert!(
907                all.iter().any(|a| a.position == t.position),
908                "indexed token at position {} not found in full token list",
909                t.position
910            );
911        }
912    }
913
914    // ── lexemes ───────────────────────────────────────────────────────────────
915
916    #[test]
917    fn lexemes_returns_non_stop_texts() {
918        let lexemes = fts().lexemes("กินข้าวกับปลา");
919        // "กับ" is a stopword — should not appear
920        assert!(!lexemes.contains(&String::from("กับ")));
921        // Content words should appear
922        assert!(
923            lexemes
924                .iter()
925                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
926            "expected content words in lexemes: {lexemes:?}"
927        );
928    }
929
930    #[test]
931    fn lexemes_empty_input_is_empty() {
932        assert!(fts().lexemes("").is_empty());
933    }
934
935    // ── multi-token NE ────────────────────────────────────────────────────────
936
937    #[test]
938    fn multi_token_ne_merged_in_pipeline() {
939        // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
940        // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
941        let fts = FtsTokenizer::new();
942        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
943        let named: Vec<_> = tokens
944            .iter()
945            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
946            .collect();
947        assert!(
948            named.iter().any(|t| t.text == "กรุงเทพ"),
949            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
950            tokens
951                .iter()
952                .map(|t| (&t.text, &t.kind))
953                .collect::<alloc::vec::Vec<_>>()
954        );
955    }
956
957    #[test]
958    fn multi_token_ne_reconstructable() {
959        // Texts of all non-whitespace tokens must still reconstruct the normalized input.
960        let fts = FtsTokenizer::new();
961        let text = "ไปกรุงเทพ";
962        let normalized = fts.tokenizer.normalize(text);
963        let tokens = fts.segment_for_fts(text);
964        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
965        assert_eq!(rebuilt, normalized);
966    }
967
968    // ── builder ───────────────────────────────────────────────────────────────
969
970    #[test]
971    fn builder_custom_stopwords() {
972        let stops = StopwordSet::from_text("กิน\n");
973        let fts = FtsTokenizer::builder().stopwords(stops).build();
974        let tokens = fts.segment_for_fts("กินข้าว");
975        let gin = tokens.iter().find(|t| t.text == "กิน");
976        if let Some(t) = gin {
977            assert!(t.is_stop, "'กิน' should be stop with custom list");
978        }
979    }
980
981    #[test]
982    fn builder_default_equals_new() {
983        // Both paths should produce the same result for a simple input
984        let a = FtsTokenizer::new().lexemes("กินข้าว");
985        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
986        assert_eq!(a, b);
987    }
988
989    // ── number normalization ──────────────────────────────────────────────────
990
991    #[test]
992    fn thai_digit_token_gets_ascii_synonym() {
993        let fts = FtsTokenizer::new();
994        let tokens = fts.segment_for_fts("๑๒๓");
995        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
996        assert!(num.is_some(), "expected a Number token");
997        let t = num.unwrap();
998        assert!(
999            t.synonyms.contains(&String::from("123")),
1000            "Thai digit token should have ASCII synonym, got {:?}",
1001            t.synonyms
1002        );
1003    }
1004
1005    #[test]
1006    fn ascii_digit_token_has_no_extra_synonym() {
1007        // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
1008        let fts = FtsTokenizer::new();
1009        let tokens = fts.segment_for_fts("123");
1010        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1011        assert!(num.is_some(), "expected a Number token");
1012        assert!(
1013            !num.unwrap().synonyms.contains(&String::from("123")),
1014            "ASCII digit token should not duplicate itself as a synonym"
1015        );
1016    }
1017
1018    #[test]
1019    fn thai_number_word_gets_decimal_synonym() {
1020        // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
1021        // on the dictionary. We check that at least one token carries "100" in synonyms.
1022        let fts = FtsTokenizer::new();
1023        let tokens = fts.segment_for_fts("หนึ่งร้อย");
1024        let has_hundred = tokens
1025            .iter()
1026            .any(|t| t.synonyms.contains(&String::from("100")));
1027        // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
1028        assert!(
1029            has_hundred,
1030            "expected a token with decimal synonym '100', tokens: {:?}",
1031            tokens
1032                .iter()
1033                .map(|t| (&t.text, &t.synonyms))
1034                .collect::<alloc::vec::Vec<_>>()
1035        );
1036    }
1037
1038    #[test]
1039    fn number_normalize_false_disables_conversion() {
1040        let fts = FtsTokenizer::builder()
1041            .number_normalize(false)
1042            .stopwords(StopwordSet::from_text(""))
1043            .build();
1044        let tokens = fts.segment_for_fts("๑๒๓");
1045        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1046        assert!(num.is_some());
1047        assert!(
1048            !num.unwrap().synonyms.contains(&String::from("123")),
1049            "number_normalize=false should suppress ASCII synonym"
1050        );
1051    }
1052
1053    #[test]
1054    fn mixed_thai_digit_in_context() {
1055        // "ธนาคาร๑๐๐แห่ง" — the ๑๐๐ part should be a Number token with synonym "100"
1056        let fts = FtsTokenizer::new();
1057        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
1058        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1059        assert!(num.is_some(), "expected Number token in mixed string");
1060        assert!(
1061            num.unwrap().synonyms.contains(&String::from("100")),
1062            "expected ASCII synonym '100' for ๑๐๐"
1063        );
1064    }
1065
1066    // ── abbreviation expansion ────────────────────────────────────────────────
1067
1068    #[test]
1069    fn abbrev_map_expands_before_segmentation() {
1070        use crate::abbrev::AbbrevMap;
1071        let fts = FtsTokenizer::builder()
1072            .abbrevs(AbbrevMap::builtin())
1073            .stopwords(StopwordSet::from_text(""))
1074            .build();
1075        // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
1076        // expansion further (กรกฎา + คม) — what matters is that dots are gone
1077        // and the Thai characters of กรกฎาคม are present.
1078        let tokens = fts.segment_for_fts("ก.ค.");
1079        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1080        let joined: String = texts.concat();
1081        assert!(
1082            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
1083            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
1084        );
1085        assert!(
1086            !texts.contains(&"."),
1087            "dots should be consumed by abbrev expansion, got: {texts:?}"
1088        );
1089    }
1090
1091    // ── segment_stream / FtsTokenStream ──────────────────────────────────────
1092
1093    #[test]
1094    fn segment_stream_yields_all_non_whitespace_tokens() {
1095        let fts = fts();
1096        let via_vec = fts.segment_for_fts("กินข้าวกับปลา");
1097        let via_stream: Vec<FtsToken> = fts.segment_stream("กินข้าวกับปลา").collect();
1098        assert_eq!(via_vec, via_stream);
1099    }
1100
1101    #[test]
1102    fn segment_stream_empty_input() {
1103        let mut stream = fts().segment_stream("");
1104        assert!(stream.next().is_none());
1105    }
1106
1107    #[test]
1108    fn next_index_token_skips_stopwords() {
1109        let fts = fts();
1110        let mut stream = fts.segment_stream("กินข้าวกับปลา");
1111        let mut texts = Vec::new();
1112        while let Some(tok) = stream.next_index_token() {
1113            texts.push(tok.text);
1114        }
1115        assert!(
1116            !texts.contains(&String::from("กับ")),
1117            "stopword กับ must be skipped"
1118        );
1119        assert!(
1120            texts.iter().any(|t| t == "กิน" || t == "ปลา"),
1121            "content words must be yielded"
1122        );
1123    }
1124
1125    #[test]
1126    fn next_index_token_matches_index_tokens() {
1127        let fts = fts();
1128        let text = "กินข้าวกับปลา";
1129        let via_index: Vec<_> = fts.index_tokens(text);
1130        let mut stream = fts.segment_stream(text);
1131        let mut via_stream = Vec::new();
1132        while let Some(tok) = stream.next_index_token() {
1133            via_stream.push(tok);
1134        }
1135        assert_eq!(via_index, via_stream);
1136    }
1137
1138    #[test]
1139    fn stream_size_hint_is_correct() {
1140        let fts = fts();
1141        let via_vec = fts.segment_for_fts("กินข้าวกับปลา");
1142        let n = via_vec.len();
1143        let stream = fts.segment_stream("กินข้าวกับปลา");
1144        assert_eq!(stream.size_hint(), (n, Some(n)));
1145    }
1146
1147    #[test]
1148    fn abbrev_expansion_disabled_by_default() {
1149        // FtsTokenizer::new() has no abbrev_map — ก.ค. stays as individual tokens.
1150        let fts = FtsTokenizer::new();
1151        let tokens = fts.segment_for_fts("ก.ค.");
1152        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1153        // Without expansion the dot(s) must still be present as punctuation tokens.
1154        assert!(
1155            texts.contains(&"."),
1156            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
1157        );
1158    }
1159
1160    // ── soundex synonyms ──────────────────────────────────────────────────────
1161
1162    #[test]
1163    fn soundex_lk82_appended_to_thai_synonyms() {
1164        use crate::soundex::lk82;
1165        let fts = FtsTokenizer::builder()
1166            .soundex(SoundexAlgorithm::Lk82)
1167            .stopwords(StopwordSet::from_text(""))
1168            .build();
1169        let tokens = fts.segment_for_fts("กิน");
1170        let t = tokens.iter().find(|t| t.text == "กิน");
1171        assert!(t.is_some(), "expected token 'กิน'");
1172        let expected_code = lk82("กิน");
1173        assert!(
1174            t.unwrap().synonyms.contains(&expected_code),
1175            "expected lk82 code '{expected_code}' in synonyms, got {:?}",
1176            t.unwrap().synonyms
1177        );
1178    }
1179
1180    #[test]
1181    fn soundex_not_emitted_by_default() {
1182        // Without .soundex() in the builder, no soundex codes should appear.
1183        let fts = FtsTokenizer::new();
1184        let tokens = fts.segment_for_fts("กินข้าว");
1185        for t in &tokens {
1186            // A soundex code is 4 ASCII chars (lk82/udom83); no synonym should look like one.
1187            for syn in &t.synonyms {
1188                let looks_like_soundex =
1189                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1190                assert!(
1191                    !looks_like_soundex,
1192                    "unexpected soundex-like synonym '{}' on token '{}'",
1193                    syn, t.text
1194                );
1195            }
1196        }
1197    }
1198
1199    #[test]
1200    fn soundex_same_sounding_words_share_code_in_index() {
1201        // กาน and ขาน share lk82 code "1600"; both should carry it as a synonym.
1202        use crate::soundex::lk82;
1203        let fts = FtsTokenizer::builder()
1204            .soundex(SoundexAlgorithm::Lk82)
1205            .stopwords(StopwordSet::from_text(""))
1206            .build();
1207        let code = lk82("กาน");
1208        for word in &["กาน", "ขาน", "คาน"] {
1209            let tokens = fts.segment_for_fts(word);
1210            let t = tokens.first().expect("expected at least one token");
1211            assert!(
1212                t.synonyms.contains(&code),
1213                "'{word}' should carry lk82 code '{code}', got {:?}",
1214                t.synonyms
1215            );
1216        }
1217    }
1218
1219    #[test]
1220    fn soundex_not_emitted_for_non_thai_tokens() {
1221        let fts = FtsTokenizer::builder()
1222            .soundex(SoundexAlgorithm::Lk82)
1223            .stopwords(StopwordSet::from_text(""))
1224            .build();
1225        let tokens = fts.segment_for_fts("hello 123");
1226        for t in &tokens {
1227            for syn in &t.synonyms {
1228                let looks_like_soundex =
1229                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1230                assert!(
1231                    !looks_like_soundex,
1232                    "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
1233                    t.text
1234                );
1235            }
1236        }
1237    }
1238
1239    #[test]
1240    fn soundex_udom83_appended() {
1241        use crate::soundex::udom83;
1242        let fts = FtsTokenizer::builder()
1243            .soundex(SoundexAlgorithm::Udom83)
1244            .stopwords(StopwordSet::from_text(""))
1245            .build();
1246        let tokens = fts.segment_for_fts("กิน");
1247        let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
1248        let expected = udom83("กิน");
1249        assert!(
1250            t.synonyms.contains(&expected),
1251            "expected udom83 code '{expected}' in synonyms, got {:?}",
1252            t.synonyms
1253        );
1254    }
1255
1256    #[test]
1257    fn abbrev_expansion_date_sentence() {
1258        use crate::abbrev::AbbrevMap;
1259        let fts = FtsTokenizer::builder()
1260            .abbrevs(AbbrevMap::builtin())
1261            .stopwords(StopwordSet::from_text(""))
1262            .build();
1263        // พ.ศ. → พุทธศักราช; the segmenter may split it further — verify the
1264        // chars are present and dots are gone.
1265        let tokens = fts.segment_for_fts("พ.ศ.2567");
1266        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1267        let joined: String = texts.concat();
1268        assert!(
1269            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
1270            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
1271        );
1272        assert!(
1273            !texts.contains(&"."),
1274            "dots should be consumed by expansion, got: {texts:?}"
1275        );
1276    }
1277}