kham_core/
fts.rs

1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A streaming iterator over [`FtsToken`]s produced by the FTS pipeline.
43///
44/// Returned by [`FtsTokenizer::segment_stream`]. Internally holds the full
45/// `Vec<FtsToken>` as an [`alloc::vec::IntoIter`]; the streaming API is provided
46/// so callers can consume tokens one at a time without materialising a second
47/// collection.
48///
49/// # Example
50///
51/// ```rust
52/// use kham_core::fts::FtsTokenizer;
53///
54/// let fts = FtsTokenizer::new();
55/// let mut stream = fts.segment_stream("กินข้าวกับปลา");
56/// // next_index_token() skips stopwords — กับ is a stopword and is skipped.
57/// while let Some(tok) = stream.next_index_token() {
58///     println!("{} pos={}", tok.text, tok.position);
59/// }
60/// ```
61pub struct FtsTokenStream {
62    inner: alloc::vec::IntoIter<FtsToken>,
63}
64
65impl FtsTokenStream {
66    /// Advance to the next token that should be written into the search index,
67    /// skipping stopwords.
68    ///
69    /// Equivalent to calling [`Iterator::next`] in a loop until a token with
70    /// `is_stop == false` is found, or the stream is exhausted.
71    pub fn next_index_token(&mut self) -> Option<FtsToken> {
72        self.inner.by_ref().find(|t| !t.is_stop)
73    }
74}
75
76impl Iterator for FtsTokenStream {
77    type Item = FtsToken;
78
79    #[inline]
80    fn next(&mut self) -> Option<FtsToken> {
81        self.inner.next()
82    }
83
84    #[inline]
85    fn size_hint(&self) -> (usize, Option<usize>) {
86        self.inner.size_hint()
87    }
88}
89
/// A token produced by the FTS pipeline, ready for lexeme indexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position among the non-whitespace tokens (0-based, contiguous).
    /// Whitespace tokens are dropped without consuming a position; stopwords
    /// keep theirs.
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Alternative lexemes for this token, in this order: synonym-map
    /// expansions, then (for Thai / Named tokens) the romanization and the
    /// soundex code when those options are enabled, then the normalised
    /// number form when number normalization applies. Empty if nothing matched.
    pub synonyms: Vec<String>,
    /// Character n-grams — populated only for [`TokenKind::Unknown`] tokens.
    /// The size comes from [`FtsTokenizerBuilder::ngram_size`] (default 3,
    /// i.e. trigrams); always empty when the size is 0.
    pub trigrams: Vec<String>,
    /// Primary part-of-speech tag from the lookup table, or `None` if the word
    /// is not in the table (OOV) or is not a Thai token. Only
    /// [`TokenKind::Thai`] tokens are looked up.
    pub pos: Option<PosTag>,
    /// Named entity category, or `None` if the token is not in the NE
    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
    pub ne: Option<NamedEntityKind>,
}
112
/// Builder for [`FtsTokenizer`].
///
/// Every field starts as `None`; [`FtsTokenizerBuilder::build`] substitutes
/// the default noted on each field.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    /// `None` means "use the built-in stopword list".
    stopwords: Option<StopwordSet>,
    /// `None` means "use an empty synonym map".
    synonyms: Option<SynonymMap>,
    /// `None` means "use 3 (trigrams)"; `Some(0)` disables n-grams.
    ngram_size: Option<usize>,
    /// `None` means "use the built-in POS table".
    pos_tagger: Option<PosTagger>,
    /// `None` means "use the built-in NE gazetteer".
    ne_tagger: Option<NeTagger>,
    /// `None` leaves romanization synonyms disabled.
    romanization: Option<RomanizationMap>,
    /// `None` leaves abbreviation expansion disabled.
    abbrev_map: Option<AbbrevMap>,
    /// `None` means "use default (true)".
    number_normalize: Option<bool>,
    /// `None` leaves soundex synonyms disabled.
    soundex: Option<SoundexAlgorithm>,
    /// Extra words to overlay on top of the built-in dictionary (fast path).
    dict_merge: Option<String>,
}
129
130impl FtsTokenizerBuilder {
131    /// Use a custom stopword set instead of the built-in list.
132    ///
133    /// # Example
134    ///
135    /// ```rust
136    /// use kham_core::fts::FtsTokenizer;
137    /// use kham_core::stopwords::StopwordSet;
138    ///
139    /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
140    /// let fts = FtsTokenizer::builder().stopwords(stops).build();
141    /// let tokens = fts.segment_for_fts("กินข้าว");
142    /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
143    /// ```
144    pub fn stopwords(mut self, s: StopwordSet) -> Self {
145        self.stopwords = Some(s);
146        self
147    }
148
149    /// Attach a synonym map for expansion.
150    ///
151    /// # Example
152    ///
153    /// ```rust
154    /// use kham_core::fts::FtsTokenizer;
155    /// use kham_core::synonym::SynonymMap;
156    ///
157    /// // TSV: canonical TAB synonym1 TAB synonym2 …
158    /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
159    /// let fts = FtsTokenizer::builder().synonyms(syns).build();
160    /// let tokens = fts.segment_for_fts("รถ");
161    /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
162    /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
163    /// ```
164    pub fn synonyms(mut self, m: SynonymMap) -> Self {
165        self.synonyms = Some(m);
166        self
167    }
168
169    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
170    ///
171    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
172    ///
173    /// # Example
174    ///
175    /// ```rust
176    /// use kham_core::fts::FtsTokenizer;
177    /// use kham_core::stopwords::StopwordSet;
178    ///
179    /// // Disable n-grams entirely — useful when index size must be small
180    /// let fts = FtsTokenizer::builder()
181    ///     .ngram_size(0)
182    ///     .stopwords(StopwordSet::from_text(""))
183    ///     .build();
184    /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
185    /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
186    /// ```
187    pub fn ngram_size(mut self, n: usize) -> Self {
188        self.ngram_size = Some(n);
189        self
190    }
191
192    /// Use a custom POS tagger instead of the built-in table.
193    ///
194    /// # Example
195    ///
196    /// ```rust
197    /// use kham_core::fts::FtsTokenizer;
198    /// use kham_core::pos::{PosTag, PosTagger};
199    ///
200    /// // Custom TSV: word TAB POS_TAG
201    /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
202    /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
203    /// // Segment กิน alone so it is not merged into a compound
204    /// let tokens = fts.segment_for_fts("กิน");
205    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
206    /// assert_eq!(t.pos, Some(PosTag::Verb));
207    /// ```
208    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
209        self.pos_tagger = Some(t);
210        self
211    }
212
213    /// Use a custom NE gazetteer instead of the built-in table.
214    ///
215    /// # Example
216    ///
217    /// ```rust
218    /// use kham_core::fts::FtsTokenizer;
219    /// use kham_core::ne::NeTagger;
220    /// use kham_core::TokenKind;
221    ///
222    /// // Domain-specific NE list: word TAB NE_TAG
223    /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
224    /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
225    /// let tokens = fts.segment_for_fts("เซเรน่า");
226    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
227    /// ```
228    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
229        self.ne_tagger = Some(t);
230        self
231    }
232
233    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
234    ///
235    /// When set, each Thai and Named token whose text is found in the map gets its
236    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
237    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
238    ///
239    /// Disabled by default — call this method to opt in.
240    ///
241    /// # Example
242    ///
243    /// ```rust
244    /// use kham_core::fts::FtsTokenizer;
245    /// use kham_core::romanizer::RomanizationMap;
246    ///
247    /// // TSV: Thai word TAB RTGS romanization
248    /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
249    /// let fts = FtsTokenizer::builder().romanization(rom).build();
250    /// let tokens = fts.segment_for_fts("กิน");
251    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
252    /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
253    /// assert!(t.synonyms.contains(&String::from("kin")));
254    /// ```
255    pub fn romanization(mut self, m: RomanizationMap) -> Self {
256        self.romanization = Some(m);
257        self
258    }
259
260    /// Attach an abbreviation map for pre-tokenisation expansion.
261    ///
262    /// When set, [`FtsTokenizer::segment_for_fts`] calls
263    /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
264    /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
265    /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
266    ///
267    /// Disabled by default — call this method to opt in.
268    ///
269    /// # Example
270    ///
271    /// ```rust
272    /// use kham_core::fts::FtsTokenizer;
273    /// use kham_core::abbrev::AbbrevMap;
274    /// use kham_core::stopwords::StopwordSet;
275    ///
276    /// let fts = FtsTokenizer::builder()
277    ///     .abbrevs(AbbrevMap::builtin())
278    ///     .stopwords(StopwordSet::from_text(""))
279    ///     .build();
280    /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
281    /// let tokens = fts.segment_for_fts("ก.ค.");
282    /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
283    /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
284    /// ```
285    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
286        self.abbrev_map = Some(m);
287        self
288    }
289
290    /// Enable or disable number normalization (default: `true`).
291    ///
292    /// When enabled:
293    /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
294    ///   ASCII digit string added to their [`FtsToken::synonyms`]
295    ///   (e.g. `๑๒๓` → synonym `"123"`).
296    /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
297    ///   words get their decimal value added to `synonyms`
298    ///   (e.g. `หนึ่งร้อย` → synonym `"100"`).
299    ///
300    /// This lets queries using either script match documents written in the
301    /// other. Set to `false` to opt out.
302    ///
303    /// # Example
304    ///
305    /// ```rust
306    /// use kham_core::fts::FtsTokenizer;
307    /// use kham_core::TokenKind;
308    ///
309    /// // Default (true): ๑๒๓ gets ASCII synonym "123"
310    /// let fts = FtsTokenizer::new();
311    /// let tokens = fts.segment_for_fts("๑๒๓");
312    /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
313    /// assert!(num.synonyms.contains(&String::from("123")));
314    ///
315    /// // Opt out: no conversion performed
316    /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
317    /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
318    /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
319    /// assert!(!num_off.synonyms.contains(&String::from("123")));
320    /// ```
321    pub fn number_normalize(mut self, v: bool) -> Self {
322        self.number_normalize = Some(v);
323        self
324    }
325
326    /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
327    ///
328    /// When set, each Thai and Named token whose text contains Thai consonants gets its
329    /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
330    /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
331    ///
332    /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
333    /// 4-character codes and are the recommended choices for FTS indexing.
334    /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
335    /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
336    ///
337    /// Disabled by default — call this method to opt in.
338    ///
339    /// # Example
340    ///
341    /// ```rust
342    /// use kham_core::fts::FtsTokenizer;
343    /// use kham_core::soundex::{lk82, SoundexAlgorithm};
344    /// use kham_core::stopwords::StopwordSet;
345    ///
346    /// let fts = FtsTokenizer::builder()
347    ///     .soundex(SoundexAlgorithm::Lk82)
348    ///     .stopwords(StopwordSet::from_text(""))
349    ///     .build();
350    /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
351    /// for word in &["กาน", "ขาน", "คาน"] {
352    ///     let tokens = fts.segment_for_fts(word);
353    ///     let t = tokens.first().unwrap();
354    ///     assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
355    /// }
356    /// ```
357    pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
358        self.soundex = Some(algo);
359        self
360    }
361
362    /// Overlay extra words on the built-in dictionary without a full trie rebuild.
363    ///
364    /// Words are stored in a sorted list alongside the pre-compiled trie.
365    /// Prefer this over a full rebuild when adding a small domain-specific
366    /// vocabulary (e.g. product names, technical terms).
367    ///
368    /// Newline-separated; `#` lines are ignored.
369    ///
370    /// # Example
371    ///
372    /// ```rust
373    /// use kham_core::fts::FtsTokenizer;
374    /// use kham_core::TokenKind;
375    ///
376    /// let fts = FtsTokenizer::builder()
377    ///     .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
378    ///     .build();
379    /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
380    /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
381    /// ```
382    pub fn dict_merge(mut self, words: &str) -> Self {
383        self.dict_merge = Some(String::from(words));
384        self
385    }
386
387    /// Consume the builder and return a configured [`FtsTokenizer`].
388    ///
389    /// # Example
390    ///
391    /// ```rust
392    /// use kham_core::fts::FtsTokenizer;
393    /// use kham_core::soundex::SoundexAlgorithm;
394    /// use kham_core::stopwords::StopwordSet;
395    ///
396    /// let fts = FtsTokenizer::builder()
397    ///     .soundex(SoundexAlgorithm::Lk82)
398    ///     .stopwords(StopwordSet::from_text(""))
399    ///     .build();
400    /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
401    /// ```
402    pub fn build(self) -> FtsTokenizer {
403        let tokenizer = if let Some(ref words) = self.dict_merge {
404            Tokenizer::builder().dict_merge(words).build()
405        } else {
406            Tokenizer::new()
407        };
408        FtsTokenizer {
409            tokenizer,
410            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
411            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
412            ngram_size: self.ngram_size.unwrap_or(3),
413            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
414            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
415            romanization: self.romanization,
416            abbrev_map: self.abbrev_map,
417            number_normalize: self.number_normalize.unwrap_or(true),
418            soundex: self.soundex,
419        }
420    }
421}
422
423/// Full-text search tokenizer for Thai text.
424///
425/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
426/// generation for out-of-vocabulary tokens.
427///
428/// Construct once and reuse:
429///
430/// ```rust
431/// use kham_core::fts::FtsTokenizer;
432///
433/// let fts = FtsTokenizer::new();
434/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
435/// assert!(!tokens.is_empty());
436/// ```
pub struct FtsTokenizer {
    /// Underlying tokenizer used for normalisation and segmentation.
    tokenizer: Tokenizer,
    /// Lookup used to set [`FtsToken::is_stop`].
    stopwords: StopwordSet,
    /// Synonym map consulted for every non-whitespace token.
    synonyms: SynonymMap,
    /// N-gram size for [`TokenKind::Unknown`] tokens; 0 disables n-grams.
    ngram_size: usize,
    /// POS lookup, applied to [`TokenKind::Thai`] tokens only.
    pos_tagger: PosTagger,
    /// NE tagger run over the segmenter output before tokens are annotated.
    ne_tagger: NeTagger,
    /// When present, romanized forms are appended to the synonyms of
    /// Thai / Named tokens found in the map.
    romanization: Option<RomanizationMap>,
    /// When present, applied to the normalised text before segmentation.
    abbrev_map: Option<AbbrevMap>,
    /// Whether Number / Thai number-word tokens get a converted synonym.
    number_normalize: bool,
    /// When present, a soundex code is appended to the synonyms of
    /// Thai / Named tokens (codes consisting only of `'0'` are skipped).
    soundex: Option<SoundexAlgorithm>,
}
449
450impl FtsTokenizer {
451    /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
452    ///
453    /// # Example
454    ///
455    /// ```rust
456    /// use kham_core::fts::FtsTokenizer;
457    ///
458    /// let fts = FtsTokenizer::new();
459    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
460    /// // Built-in stopword กับ is excluded; content words are present
461    /// assert!(!lexemes.contains(&String::from("กับ")));
462    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
463    /// ```
464    pub fn new() -> Self {
465        FtsTokenizerBuilder::default().build()
466    }
467
468    /// Return a [`FtsTokenizerBuilder`] for custom configuration.
469    ///
470    /// # Example
471    ///
472    /// ```rust
473    /// use kham_core::fts::FtsTokenizer;
474    /// use kham_core::soundex::SoundexAlgorithm;
475    /// use kham_core::synonym::SynonymMap;
476    ///
477    /// let fts = FtsTokenizer::builder()
478    ///     .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
479    ///     .soundex(SoundexAlgorithm::Lk82)
480    ///     .build();
481    /// assert!(!fts.segment_for_fts("รถ").is_empty());
482    /// ```
483    pub fn builder() -> FtsTokenizerBuilder {
484        FtsTokenizerBuilder::default()
485    }
486
487    /// Segment `text` and annotate each token for FTS indexing.
488    ///
489    /// Normalises the input text before segmentation so that สระลอย and stacked
490    /// tone marks are handled correctly. Whitespace tokens are excluded.
491    ///
492    /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
493    /// [`index_tokens`] instead when you only need the tokens to be indexed
494    /// (stopwords excluded).
495    ///
496    /// [`index_tokens`]: FtsTokenizer::index_tokens
497    ///
498    /// # Examples
499    ///
500    /// ```rust
501    /// use kham_core::fts::FtsTokenizer;
502    ///
503    /// let fts = FtsTokenizer::new();
504    /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
505    /// // Positions are 0-based and sequential across non-whitespace tokens
506    /// for (i, t) in tokens.iter().enumerate() {
507    ///     assert_eq!(t.position, i);
508    /// }
509    /// // กับ is a common conjunction — marked as a stopword
510    /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
511    /// assert!(kap.is_stop);
512    /// ```
513    ///
514    /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
515    ///
516    /// ```rust
517    /// use kham_core::fts::FtsTokenizer;
518    /// use kham_core::TokenKind;
519    ///
520    /// let fts = FtsTokenizer::new();
521    /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
522    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
523    /// ```
524    ///
525    /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
526    ///
527    /// ```rust
528    /// use kham_core::fts::FtsTokenizer;
529    /// use kham_core::soundex::SoundexAlgorithm;
530    ///
531    /// let fts = FtsTokenizer::builder()
532    ///     .soundex(SoundexAlgorithm::Lk82)
533    ///     .build();
534    /// let tokens = fts.segment_for_fts("กิน");
535    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
536    /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
537    /// assert!(!t.synonyms.is_empty());
538    /// ```
539    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
540        let normalized = self.tokenizer.normalize(text);
541        // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
542        // dot-containing patterns are replaced as single units.
543        let expanded = match self.abbrev_map.as_ref() {
544            Some(am) => am.expand_text(&normalized),
545            None => normalized,
546        };
547        let raw_tokens = self
548            .ne_tagger
549            .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
550
551        let mut result = Vec::with_capacity(raw_tokens.len());
552        let mut position = 0usize;
553
554        for token in &raw_tokens {
555            if token.kind == TokenKind::Whitespace {
556                continue;
557            }
558
559            let is_stop = self.stopwords.contains(token.text);
560            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
561            let mut synonyms = self
562                .synonyms
563                .expand(token.text)
564                .map(|s| s.to_vec())
565                .unwrap_or_default();
566            if is_thai_or_named {
567                if let Some(ref rom) = self.romanization {
568                    if let Some(rtgs) = rom.romanize(token.text) {
569                        synonyms.push(String::from(rtgs));
570                    }
571                }
572                if let Some(algo) = self.soundex {
573                    let code = soundex(token.text, algo);
574                    if !code.chars().all(|c| c == '0') {
575                        synonyms.push(code);
576                    }
577                }
578            }
579            if self.number_normalize {
580                match token.kind {
581                    // Number token with Thai digits → add ASCII form as synonym.
582                    TokenKind::Number => {
583                        let ascii = thai_digits_to_ascii(token.text);
584                        if ascii != token.text {
585                            synonyms.push(ascii);
586                        }
587                    }
588                    // Thai token that is a recognised number word → add decimal string.
589                    TokenKind::Thai => {
590                        if let Some(decimal) = thai_word_to_decimal(token.text) {
591                            synonyms.push(decimal);
592                        }
593                    }
594                    _ => {}
595                }
596            }
597            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
598                char_ngrams(token.text, self.ngram_size)
599                    .map(String::from)
600                    .collect()
601            } else {
602                Vec::new()
603            };
604            let ne = if let TokenKind::Named(k) = token.kind {
605                Some(k)
606            } else {
607                None
608            };
609            let pos = if token.kind == TokenKind::Thai {
610                self.pos_tagger.tag(token.text)
611            } else {
612                None
613            };
614
615            result.push(FtsToken {
616                text: String::from(token.text),
617                position,
618                kind: token.kind,
619                is_stop,
620                synonyms,
621                trigrams,
622                pos,
623                ne,
624            });
625
626            position += 1;
627        }
628
629        result
630    }
631
632    /// Return only the tokens to be written into a search index.
633    ///
634    /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
635    /// its original `position` so phrase-distance scoring remains correct.
636    ///
637    /// # Example
638    ///
639    /// ```rust
640    /// use kham_core::fts::FtsTokenizer;
641    ///
642    /// let fts = FtsTokenizer::new();
643    /// let tokens = fts.index_tokens("กินข้าวกับปลา");
644    /// // No stopwords in the index
645    /// assert!(tokens.iter().all(|t| !t.is_stop));
646    /// // Positions are preserved from the full sequence for phrase scoring
647    /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
648    /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
649    /// ```
650    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
651        self.segment_for_fts(text)
652            .into_iter()
653            .filter(|t| !t.is_stop)
654            .collect()
655    }
656
657    /// Return a streaming iterator over the FTS tokens for `text`.
658    ///
659    /// Equivalent to [`segment_for_fts`] but wraps the result in an
660    /// [`FtsTokenStream`] so callers can consume tokens one at a time.
661    /// Use [`FtsTokenStream::next_index_token`] to skip stopwords automatically.
662    ///
663    /// The full token list is materialised internally because the NE tagger
664    /// requires multi-token context; this is a streaming *consumer*, not a
665    /// lazy producer.
666    ///
667    /// # Example
668    ///
669    /// ```rust
670    /// use kham_core::fts::FtsTokenizer;
671    ///
672    /// let fts = FtsTokenizer::new();
673    /// let mut stream = fts.segment_stream("กินข้าวกับปลา");
674    /// let mut index_texts: Vec<String> = Vec::new();
675    /// while let Some(tok) = stream.next_index_token() {
676    ///     index_texts.push(tok.text);
677    /// }
678    /// // กับ is a stopword — it should not appear in index_texts
679    /// assert!(!index_texts.contains(&String::from("กับ")));
680    /// assert!(index_texts.iter().any(|t| t == "กิน" || t == "ปลา"));
681    /// ```
682    ///
683    /// [`segment_for_fts`]: FtsTokenizer::segment_for_fts
684    pub fn segment_stream(&self, text: &str) -> FtsTokenStream {
685        FtsTokenStream {
686            inner: self.segment_for_fts(text).into_iter(),
687        }
688    }
689
690    /// Collect all lexeme strings to be stored in a `tsvector`.
691    ///
692    /// Returns one string per non-stop token, plus synonym expansions and
693    /// trigrams for unknown tokens. Duplicates are not removed (the caller or
694    /// PostgreSQL handles deduplication).
695    ///
696    /// # Example
697    ///
698    /// ```rust
699    /// use kham_core::fts::FtsTokenizer;
700    ///
701    /// let fts = FtsTokenizer::new();
702    /// let lexemes = fts.lexemes("กินข้าวกับปลา");
703    /// // Content words are present; stopword กับ is absent
704    /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
705    /// assert!(!lexemes.contains(&String::from("กับ")));
706    /// ```
707    ///
708    /// With Thai digit normalization (enabled by default), both scripts match:
709    ///
710    /// ```rust
711    /// use kham_core::fts::FtsTokenizer;
712    ///
713    /// let fts = FtsTokenizer::new();
714    /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
715    /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
716    /// assert!(lexemes.contains(&String::from("100")));
717    /// ```
718    pub fn lexemes(&self, text: &str) -> Vec<String> {
719        let tokens = self.index_tokens(text);
720        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
721        for t in tokens {
722            out.push(t.text.clone());
723            out.extend(t.synonyms);
724            out.extend(t.trigrams);
725        }
726        out
727    }
728}
729
730impl Default for FtsTokenizer {
731    fn default() -> Self {
732        Self::new()
733    }
734}
735
736// ---------------------------------------------------------------------------
737// Tests
738// ---------------------------------------------------------------------------
739
740#[cfg(test)]
741mod tests {
742    use super::*;
743    use crate::stopwords::StopwordSet;
744    use crate::synonym::SynonymMap;
745
746    fn fts() -> FtsTokenizer {
747        FtsTokenizer::new()
748    }
749
750    // ── segment_for_fts ───────────────────────────────────────────────────────
751
752    #[test]
753    fn empty_input_returns_empty() {
754        assert!(fts().segment_for_fts("").is_empty());
755    }
756
757    #[test]
758    fn whitespace_tokens_excluded() {
759        let tokens = fts().segment_for_fts("กิน ข้าว");
760        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
761    }
762
763    #[test]
764    fn positions_are_sequential() {
765        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
766        for (i, t) in tokens.iter().enumerate() {
767            assert_eq!(t.position, i, "position mismatch at index {i}");
768        }
769    }
770
771    #[test]
772    fn known_stopword_is_tagged() {
773        // "กับ" is a common conjunction and should be in the built-in stopword list
774        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
775        let kap = tokens.iter().find(|t| t.text == "กับ");
776        assert!(kap.is_some(), "expected 'กับ' token");
777        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
778    }
779
780    #[test]
781    fn content_words_not_tagged_as_stop() {
782        let tokens = fts().segment_for_fts("โรงพยาบาล");
783        // May be OOV but should not be a stopword
784        for t in &tokens {
785            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
786        }
787    }
788
789    #[test]
790    fn text_is_reconstructable() {
791        // All tokens joined == normalised input (whitespace dropped)
792        let fts = fts();
793        let text = "กินข้าวกับปลา";
794        let normalized = fts.tokenizer.normalize(text);
795        let tokens = fts.segment_for_fts(text);
796        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
797        assert_eq!(rebuilt, normalized);
798    }
799
800    // ── synonym expansion ─────────────────────────────────────────────────────
801
802    #[test]
803    fn synonym_expansion_attached() {
804        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
805        let fts = FtsTokenizer::builder()
806            .synonyms(synonyms)
807            .stopwords(StopwordSet::from_text(""))
808            .build();
809        // Segment a text containing "คอม" — need it in dict or it lands as Unknown
810        // Use builder with custom word so the segmenter recognises it
811        let tokens = fts.segment_for_fts("คอม");
812        let t = tokens.iter().find(|t| t.text == "คอม");
813        if let Some(tok) = t {
814            assert!(
815                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
816                "expected synonym expansion, got {:?}",
817                tok.synonyms
818            );
819        }
820    }
821
822    #[test]
823    fn no_synonyms_when_map_empty() {
824        let tokens = fts().segment_for_fts("กินข้าว");
825        for t in &tokens {
826            assert!(t.synonyms.is_empty());
827        }
828    }
829
830    // ── unknown token trigrams ────────────────────────────────────────────────
831
832    #[test]
833    fn unknown_token_gets_trigrams() {
834        // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
835        // With ngram_size=2 the token should yield one bigram ("กิ").
836        // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
837        // (like "กิ") are the shortest unit that can produce n-grams.
838        let fts = FtsTokenizer::builder()
839            .ngram_size(2)
840            .stopwords(StopwordSet::from_text(""))
841            .build();
842        let tokens = fts.segment_for_fts("กิ");
843        let unknown: Vec<_> = tokens
844            .iter()
845            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
846            .collect();
847        assert!(
848            !unknown.is_empty(),
849            "expected at least one multi-char Unknown token for 'กิ'"
850        );
851        for u in &unknown {
852            assert!(
853                !u.trigrams.is_empty(),
854                "unknown token '{}' ({} chars) should have bigrams",
855                u.text,
856                u.text.chars().count()
857            );
858        }
859    }
860
861    #[test]
862    fn known_thai_token_has_no_trigrams() {
863        let tokens = fts().segment_for_fts("กิน");
864        for t in &tokens {
865            if t.kind == TokenKind::Thai {
866                assert!(
867                    t.trigrams.is_empty(),
868                    "known Thai token '{}' should not have trigrams",
869                    t.text
870                );
871            }
872        }
873    }
874
875    #[test]
876    fn ngram_size_zero_disables_trigrams() {
877        let fts = FtsTokenizer::builder()
878            .ngram_size(0)
879            .stopwords(StopwordSet::from_text(""))
880            .build();
881        let tokens = fts.segment_for_fts("กขคง");
882        for t in &tokens {
883            assert!(t.trigrams.is_empty());
884        }
885    }
886
887    // ── index_tokens ──────────────────────────────────────────────────────────
888
889    #[test]
890    fn index_tokens_excludes_stopwords() {
891        let tokens = fts().index_tokens("กินข้าวกับปลา");
892        assert!(tokens.iter().all(|t| !t.is_stop));
893    }
894
895    #[test]
896    fn index_tokens_preserves_positions() {
897        // Positions in index_tokens must be a subset of segment_for_fts positions
898        let all = fts().segment_for_fts("กินข้าวกับปลา");
899        let indexed = fts().index_tokens("กินข้าวกับปลา");
900        for t in &indexed {
901            assert!(
902                all.iter().any(|a| a.position == t.position),
903                "indexed token at position {} not found in full token list",
904                t.position
905            );
906        }
907    }
908
909    // ── lexemes ───────────────────────────────────────────────────────────────
910
911    #[test]
912    fn lexemes_returns_non_stop_texts() {
913        let lexemes = fts().lexemes("กินข้าวกับปลา");
914        // "กับ" is a stopword — should not appear
915        assert!(!lexemes.contains(&String::from("กับ")));
916        // Content words should appear
917        assert!(
918            lexemes
919                .iter()
920                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
921            "expected content words in lexemes: {lexemes:?}"
922        );
923    }
924
925    #[test]
926    fn lexemes_empty_input_is_empty() {
927        assert!(fts().lexemes("").is_empty());
928    }
929
930    // ── multi-token NE ────────────────────────────────────────────────────────
931
932    #[test]
933    fn multi_token_ne_merged_in_pipeline() {
934        // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
935        // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
936        let fts = FtsTokenizer::new();
937        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
938        let named: Vec<_> = tokens
939            .iter()
940            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
941            .collect();
942        assert!(
943            named.iter().any(|t| t.text == "กรุงเทพ"),
944            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
945            tokens
946                .iter()
947                .map(|t| (&t.text, &t.kind))
948                .collect::<alloc::vec::Vec<_>>()
949        );
950    }
951
952    #[test]
953    fn multi_token_ne_reconstructable() {
954        // Texts of all non-whitespace tokens must still reconstruct the normalized input.
955        let fts = FtsTokenizer::new();
956        let text = "ไปกรุงเทพ";
957        let normalized = fts.tokenizer.normalize(text);
958        let tokens = fts.segment_for_fts(text);
959        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
960        assert_eq!(rebuilt, normalized);
961    }
962
963    // ── builder ───────────────────────────────────────────────────────────────
964
965    #[test]
966    fn builder_custom_stopwords() {
967        let stops = StopwordSet::from_text("กิน\n");
968        let fts = FtsTokenizer::builder().stopwords(stops).build();
969        let tokens = fts.segment_for_fts("กินข้าว");
970        let gin = tokens.iter().find(|t| t.text == "กิน");
971        if let Some(t) = gin {
972            assert!(t.is_stop, "'กิน' should be stop with custom list");
973        }
974    }
975
976    #[test]
977    fn builder_default_equals_new() {
978        // Both paths should produce the same result for a simple input
979        let a = FtsTokenizer::new().lexemes("กินข้าว");
980        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
981        assert_eq!(a, b);
982    }
983
984    // ── number normalization ──────────────────────────────────────────────────
985
986    #[test]
987    fn thai_digit_token_gets_ascii_synonym() {
988        let fts = FtsTokenizer::new();
989        let tokens = fts.segment_for_fts("๑๒๓");
990        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
991        assert!(num.is_some(), "expected a Number token");
992        let t = num.unwrap();
993        assert!(
994            t.synonyms.contains(&String::from("123")),
995            "Thai digit token should have ASCII synonym, got {:?}",
996            t.synonyms
997        );
998    }
999
1000    #[test]
1001    fn ascii_digit_token_has_no_extra_synonym() {
1002        // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
1003        let fts = FtsTokenizer::new();
1004        let tokens = fts.segment_for_fts("123");
1005        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1006        assert!(num.is_some(), "expected a Number token");
1007        assert!(
1008            !num.unwrap().synonyms.contains(&String::from("123")),
1009            "ASCII digit token should not duplicate itself as a synonym"
1010        );
1011    }
1012
1013    #[test]
1014    fn thai_number_word_gets_decimal_synonym() {
1015        // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
1016        // on the dictionary. We check that at least one token carries "100" in synonyms.
1017        let fts = FtsTokenizer::new();
1018        let tokens = fts.segment_for_fts("หนึ่งร้อย");
1019        let has_hundred = tokens
1020            .iter()
1021            .any(|t| t.synonyms.contains(&String::from("100")));
1022        // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
1023        assert!(
1024            has_hundred,
1025            "expected a token with decimal synonym '100', tokens: {:?}",
1026            tokens
1027                .iter()
1028                .map(|t| (&t.text, &t.synonyms))
1029                .collect::<alloc::vec::Vec<_>>()
1030        );
1031    }
1032
1033    #[test]
1034    fn number_normalize_false_disables_conversion() {
1035        let fts = FtsTokenizer::builder()
1036            .number_normalize(false)
1037            .stopwords(StopwordSet::from_text(""))
1038            .build();
1039        let tokens = fts.segment_for_fts("๑๒๓");
1040        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1041        assert!(num.is_some());
1042        assert!(
1043            !num.unwrap().synonyms.contains(&String::from("123")),
1044            "number_normalize=false should suppress ASCII synonym"
1045        );
1046    }
1047
1048    #[test]
1049    fn mixed_thai_digit_in_context() {
1050        // "ธนาคาร๑๐๐แห่ง" — the ๑๐๐ part should be a Number token with synonym "100"
1051        let fts = FtsTokenizer::new();
1052        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
1053        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
1054        assert!(num.is_some(), "expected Number token in mixed string");
1055        assert!(
1056            num.unwrap().synonyms.contains(&String::from("100")),
1057            "expected ASCII synonym '100' for ๑๐๐"
1058        );
1059    }
1060
1061    // ── abbreviation expansion ────────────────────────────────────────────────
1062
1063    #[test]
1064    fn abbrev_map_expands_before_segmentation() {
1065        use crate::abbrev::AbbrevMap;
1066        let fts = FtsTokenizer::builder()
1067            .abbrevs(AbbrevMap::builtin())
1068            .stopwords(StopwordSet::from_text(""))
1069            .build();
1070        // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
1071        // expansion further (กรกฎา + คม) — what matters is that dots are gone
1072        // and the Thai characters of กรกฎาคม are present.
1073        let tokens = fts.segment_for_fts("ก.ค.");
1074        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1075        let joined: String = texts.concat();
1076        assert!(
1077            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
1078            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
1079        );
1080        assert!(
1081            !texts.contains(&"."),
1082            "dots should be consumed by abbrev expansion, got: {texts:?}"
1083        );
1084    }
1085
1086    // ── segment_stream / FtsTokenStream ──────────────────────────────────────
1087
1088    #[test]
1089    fn segment_stream_yields_all_non_whitespace_tokens() {
1090        let fts = fts();
1091        let via_vec = fts.segment_for_fts("กินข้าวกับปลา");
1092        let via_stream: Vec<FtsToken> = fts.segment_stream("กินข้าวกับปลา").collect();
1093        assert_eq!(via_vec, via_stream);
1094    }
1095
1096    #[test]
1097    fn segment_stream_empty_input() {
1098        let mut stream = fts().segment_stream("");
1099        assert!(stream.next().is_none());
1100    }
1101
1102    #[test]
1103    fn next_index_token_skips_stopwords() {
1104        let fts = fts();
1105        let mut stream = fts.segment_stream("กินข้าวกับปลา");
1106        let mut texts = Vec::new();
1107        while let Some(tok) = stream.next_index_token() {
1108            texts.push(tok.text);
1109        }
1110        assert!(
1111            !texts.contains(&String::from("กับ")),
1112            "stopword กับ must be skipped"
1113        );
1114        assert!(
1115            texts.iter().any(|t| t == "กิน" || t == "ปลา"),
1116            "content words must be yielded"
1117        );
1118    }
1119
1120    #[test]
1121    fn next_index_token_matches_index_tokens() {
1122        let fts = fts();
1123        let text = "กินข้าวกับปลา";
1124        let via_index: Vec<_> = fts.index_tokens(text);
1125        let mut stream = fts.segment_stream(text);
1126        let mut via_stream = Vec::new();
1127        while let Some(tok) = stream.next_index_token() {
1128            via_stream.push(tok);
1129        }
1130        assert_eq!(via_index, via_stream);
1131    }
1132
1133    #[test]
1134    fn stream_size_hint_is_correct() {
1135        let fts = fts();
1136        let via_vec = fts.segment_for_fts("กินข้าวกับปลา");
1137        let n = via_vec.len();
1138        let stream = fts.segment_stream("กินข้าวกับปลา");
1139        assert_eq!(stream.size_hint(), (n, Some(n)));
1140    }
1141
1142    #[test]
1143    fn abbrev_expansion_disabled_by_default() {
1144        // FtsTokenizer::new() has no abbrev_map — ก.ค. stays as individual tokens.
1145        let fts = FtsTokenizer::new();
1146        let tokens = fts.segment_for_fts("ก.ค.");
1147        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1148        // Without expansion the dot(s) must still be present as punctuation tokens.
1149        assert!(
1150            texts.contains(&"."),
1151            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
1152        );
1153    }
1154
1155    // ── soundex synonyms ──────────────────────────────────────────────────────
1156
1157    #[test]
1158    fn soundex_lk82_appended_to_thai_synonyms() {
1159        use crate::soundex::lk82;
1160        let fts = FtsTokenizer::builder()
1161            .soundex(SoundexAlgorithm::Lk82)
1162            .stopwords(StopwordSet::from_text(""))
1163            .build();
1164        let tokens = fts.segment_for_fts("กิน");
1165        let t = tokens.iter().find(|t| t.text == "กิน");
1166        assert!(t.is_some(), "expected token 'กิน'");
1167        let expected_code = lk82("กิน");
1168        assert!(
1169            t.unwrap().synonyms.contains(&expected_code),
1170            "expected lk82 code '{expected_code}' in synonyms, got {:?}",
1171            t.unwrap().synonyms
1172        );
1173    }
1174
1175    #[test]
1176    fn soundex_not_emitted_by_default() {
1177        // Without .soundex() in the builder, no soundex codes should appear.
1178        let fts = FtsTokenizer::new();
1179        let tokens = fts.segment_for_fts("กินข้าว");
1180        for t in &tokens {
1181            // A soundex code is 4 ASCII chars (lk82/udom83); no synonym should look like one.
1182            for syn in &t.synonyms {
1183                let looks_like_soundex =
1184                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1185                assert!(
1186                    !looks_like_soundex,
1187                    "unexpected soundex-like synonym '{}' on token '{}'",
1188                    syn, t.text
1189                );
1190            }
1191        }
1192    }
1193
1194    #[test]
1195    fn soundex_same_sounding_words_share_code_in_index() {
1196        // กาน and ขาน share lk82 code "1600"; both should carry it as a synonym.
1197        use crate::soundex::lk82;
1198        let fts = FtsTokenizer::builder()
1199            .soundex(SoundexAlgorithm::Lk82)
1200            .stopwords(StopwordSet::from_text(""))
1201            .build();
1202        let code = lk82("กาน");
1203        for word in &["กาน", "ขาน", "คาน"] {
1204            let tokens = fts.segment_for_fts(word);
1205            let t = tokens.first().expect("expected at least one token");
1206            assert!(
1207                t.synonyms.contains(&code),
1208                "'{word}' should carry lk82 code '{code}', got {:?}",
1209                t.synonyms
1210            );
1211        }
1212    }
1213
1214    #[test]
1215    fn soundex_not_emitted_for_non_thai_tokens() {
1216        let fts = FtsTokenizer::builder()
1217            .soundex(SoundexAlgorithm::Lk82)
1218            .stopwords(StopwordSet::from_text(""))
1219            .build();
1220        let tokens = fts.segment_for_fts("hello 123");
1221        for t in &tokens {
1222            for syn in &t.synonyms {
1223                let looks_like_soundex =
1224                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1225                assert!(
1226                    !looks_like_soundex,
1227                    "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
1228                    t.text
1229                );
1230            }
1231        }
1232    }
1233
1234    #[test]
1235    fn soundex_udom83_appended() {
1236        use crate::soundex::udom83;
1237        let fts = FtsTokenizer::builder()
1238            .soundex(SoundexAlgorithm::Udom83)
1239            .stopwords(StopwordSet::from_text(""))
1240            .build();
1241        let tokens = fts.segment_for_fts("กิน");
1242        let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
1243        let expected = udom83("กิน");
1244        assert!(
1245            t.synonyms.contains(&expected),
1246            "expected udom83 code '{expected}' in synonyms, got {:?}",
1247            t.synonyms
1248        );
1249    }
1250
1251    #[test]
1252    fn abbrev_expansion_date_sentence() {
1253        use crate::abbrev::AbbrevMap;
1254        let fts = FtsTokenizer::builder()
1255            .abbrevs(AbbrevMap::builtin())
1256            .stopwords(StopwordSet::from_text(""))
1257            .build();
1258        // พ.ศ. → พุทธศักราช; the segmenter may split it further — verify the
1259        // chars are present and dots are gone.
1260        let tokens = fts.segment_for_fts("พ.ศ.2567");
1261        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1262        let joined: String = texts.concat();
1263        assert!(
1264            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
1265            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
1266        );
1267        assert!(
1268            !texts.contains(&"."),
1269            "dots should be consumed by expansion, got: {texts:?}"
1270        );
1271    }
1272}
kham_core/fts.rs

kham_core/
fts.rs