kham_core/fts.rs
1//! Full-text search pipeline for Thai text.
2//!
//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
//! normalise → expand abbreviations (optional) → segment → tag named entities
//! → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//! println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A streaming iterator over [`FtsToken`]s produced by the FTS pipeline.
43///
44/// Returned by [`FtsTokenizer::segment_stream`]. Internally holds the full
45/// `Vec<FtsToken>` as an [`alloc::vec::IntoIter`]; the streaming API is provided
46/// so callers can consume tokens one at a time without materialising a second
47/// collection.
48///
49/// # Example
50///
51/// ```rust
52/// use kham_core::fts::FtsTokenizer;
53///
54/// let fts = FtsTokenizer::new();
55/// let mut stream = fts.segment_stream("กินข้าวกับปลา");
56/// // next_index_token() skips stopwords — กับ is a stopword and is skipped.
57/// while let Some(tok) = stream.next_index_token() {
58/// println!("{} pos={}", tok.text, tok.position);
59/// }
60/// ```
61pub struct FtsTokenStream {
62 inner: alloc::vec::IntoIter<FtsToken>,
63}
64
65impl FtsTokenStream {
66 /// Advance to the next token that should be written into the search index,
67 /// skipping stopwords.
68 ///
69 /// Equivalent to calling [`Iterator::next`] in a loop until a token with
70 /// `is_stop == false` is found, or the stream is exhausted.
71 pub fn next_index_token(&mut self) -> Option<FtsToken> {
72 self.inner.by_ref().find(|t| !t.is_stop)
73 }
74}
75
// Plain iteration yields every remaining token in order, stopwords included;
// use `next_index_token` to skip stopwords instead.
impl Iterator for FtsTokenStream {
    type Item = FtsToken;

    /// Return the next token from the underlying list, whether or not it is
    /// a stopword.
    #[inline]
    fn next(&mut self) -> Option<FtsToken> {
        self.inner.next()
    }

    /// Forward the size hint of the underlying `Vec` iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
89
/// A token produced by the FTS pipeline, ready for lexeme indexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position in the non-whitespace token sequence (0-based).
    /// Whitespace tokens are dropped before numbering, so they leave no gaps;
    /// stopwords do occupy a position.
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Synonym expansions (empty if none configured or no match). Depending on
    /// the tokenizer configuration this also receives romanization, soundex
    /// and number-normalisation forms.
    pub synonyms: Vec<String>,
    /// Character n-grams (trigrams with the default n-gram size of 3) —
    /// populated only for [`TokenKind::Unknown`] tokens, and only when the
    /// configured n-gram size is greater than zero.
    pub trigrams: Vec<String>,
    /// Primary part-of-speech tag from the lookup table, or `None` if the word
    /// is not in the table (OOV) or the token's kind is not
    /// [`TokenKind::Thai`] (named-entity tokens are not POS-tagged).
    pub pos: Option<PosTag>,
    /// Named entity category, or `None` if the token is not in the NE
    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
    pub ne: Option<NamedEntityKind>,
}
112
/// Builder for [`FtsTokenizer`].
///
/// Every field starts as `None`; [`FtsTokenizerBuilder::build`] substitutes
/// the default noted on each field for anything left unset.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    /// Custom stopword set; `None` means the built-in list.
    stopwords: Option<StopwordSet>,
    /// Synonym map; `None` means an empty map (no expansion).
    synonyms: Option<SynonymMap>,
    /// N-gram size for unknown tokens; `None` means 3, and 0 disables n-grams.
    ngram_size: Option<usize>,
    /// Custom POS tagger; `None` means the built-in table.
    pos_tagger: Option<PosTagger>,
    /// Custom NE gazetteer; `None` means the built-in table.
    ne_tagger: Option<NeTagger>,
    /// Romanization map; `None` leaves romanized synonyms disabled.
    romanization: Option<RomanizationMap>,
    /// Abbreviation map; `None` leaves pre-segmentation expansion disabled.
    abbrev_map: Option<AbbrevMap>,
    /// `None` means "use default (true)".
    number_normalize: Option<bool>,
    /// Soundex algorithm; `None` leaves phonetic synonyms disabled.
    soundex: Option<SoundexAlgorithm>,
    /// Extra words to overlay on top of the built-in dictionary (fast path).
    dict_merge: Option<String>,
}
129
130impl FtsTokenizerBuilder {
131 /// Use a custom stopword set instead of the built-in list.
132 ///
133 /// # Example
134 ///
135 /// ```rust
136 /// use kham_core::fts::FtsTokenizer;
137 /// use kham_core::stopwords::StopwordSet;
138 ///
139 /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
140 /// let fts = FtsTokenizer::builder().stopwords(stops).build();
141 /// let tokens = fts.segment_for_fts("กินข้าว");
142 /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
143 /// ```
144 pub fn stopwords(mut self, s: StopwordSet) -> Self {
145 self.stopwords = Some(s);
146 self
147 }
148
149 /// Attach a synonym map for expansion.
150 ///
151 /// # Example
152 ///
153 /// ```rust
154 /// use kham_core::fts::FtsTokenizer;
155 /// use kham_core::synonym::SynonymMap;
156 ///
157 /// // TSV: canonical TAB synonym1 TAB synonym2 …
158 /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
159 /// let fts = FtsTokenizer::builder().synonyms(syns).build();
160 /// let tokens = fts.segment_for_fts("รถ");
161 /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
162 /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
163 /// ```
164 pub fn synonyms(mut self, m: SynonymMap) -> Self {
165 self.synonyms = Some(m);
166 self
167 }
168
169 /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
170 ///
171 /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
172 ///
173 /// # Example
174 ///
175 /// ```rust
176 /// use kham_core::fts::FtsTokenizer;
177 /// use kham_core::stopwords::StopwordSet;
178 ///
179 /// // Disable n-grams entirely — useful when index size must be small
180 /// let fts = FtsTokenizer::builder()
181 /// .ngram_size(0)
182 /// .stopwords(StopwordSet::from_text(""))
183 /// .build();
184 /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
185 /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
186 /// ```
187 pub fn ngram_size(mut self, n: usize) -> Self {
188 self.ngram_size = Some(n);
189 self
190 }
191
192 /// Use a custom POS tagger instead of the built-in table.
193 ///
194 /// # Example
195 ///
196 /// ```rust
197 /// use kham_core::fts::FtsTokenizer;
198 /// use kham_core::pos::{PosTag, PosTagger};
199 ///
200 /// // Custom TSV: word TAB POS_TAG
201 /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
202 /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
203 /// // Segment กิน alone so it is not merged into a compound
204 /// let tokens = fts.segment_for_fts("กิน");
205 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
206 /// assert_eq!(t.pos, Some(PosTag::Verb));
207 /// ```
208 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
209 self.pos_tagger = Some(t);
210 self
211 }
212
213 /// Use a custom NE gazetteer instead of the built-in table.
214 ///
215 /// # Example
216 ///
217 /// ```rust
218 /// use kham_core::fts::FtsTokenizer;
219 /// use kham_core::ne::NeTagger;
220 /// use kham_core::TokenKind;
221 ///
222 /// // Domain-specific NE list: word TAB NE_TAG
223 /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
224 /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
225 /// let tokens = fts.segment_for_fts("เซเรน่า");
226 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
227 /// ```
228 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
229 self.ne_tagger = Some(t);
230 self
231 }
232
233 /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
234 ///
235 /// When set, each Thai and Named token whose text is found in the map gets its
236 /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
237 /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
238 ///
239 /// Disabled by default — call this method to opt in.
240 ///
241 /// # Example
242 ///
243 /// ```rust
244 /// use kham_core::fts::FtsTokenizer;
245 /// use kham_core::romanizer::RomanizationMap;
246 ///
247 /// // TSV: Thai word TAB RTGS romanization
248 /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
249 /// let fts = FtsTokenizer::builder().romanization(rom).build();
250 /// let tokens = fts.segment_for_fts("กิน");
251 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
252 /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
253 /// assert!(t.synonyms.contains(&String::from("kin")));
254 /// ```
255 pub fn romanization(mut self, m: RomanizationMap) -> Self {
256 self.romanization = Some(m);
257 self
258 }
259
260 /// Attach an abbreviation map for pre-tokenisation expansion.
261 ///
262 /// When set, [`FtsTokenizer::segment_for_fts`] calls
263 /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
264 /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
265 /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
266 ///
267 /// Disabled by default — call this method to opt in.
268 ///
269 /// # Example
270 ///
271 /// ```rust
272 /// use kham_core::fts::FtsTokenizer;
273 /// use kham_core::abbrev::AbbrevMap;
274 /// use kham_core::stopwords::StopwordSet;
275 ///
276 /// let fts = FtsTokenizer::builder()
277 /// .abbrevs(AbbrevMap::builtin())
278 /// .stopwords(StopwordSet::from_text(""))
279 /// .build();
280 /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
281 /// let tokens = fts.segment_for_fts("ก.ค.");
282 /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
283 /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
284 /// ```
285 pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
286 self.abbrev_map = Some(m);
287 self
288 }
289
290 /// Enable or disable number normalization (default: `true`).
291 ///
292 /// When enabled:
293 /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
294 /// ASCII digit string added to their [`FtsToken::synonyms`]
295 /// (e.g. `๑๒๓` → synonym `"123"`).
296 /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
297 /// words get their decimal value added to `synonyms`
298 /// (e.g. `หนึ่งร้อย` → synonym `"100"`).
299 ///
300 /// This lets queries using either script match documents written in the
301 /// other. Set to `false` to opt out.
302 ///
303 /// # Example
304 ///
305 /// ```rust
306 /// use kham_core::fts::FtsTokenizer;
307 /// use kham_core::TokenKind;
308 ///
309 /// // Default (true): ๑๒๓ gets ASCII synonym "123"
310 /// let fts = FtsTokenizer::new();
311 /// let tokens = fts.segment_for_fts("๑๒๓");
312 /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
313 /// assert!(num.synonyms.contains(&String::from("123")));
314 ///
315 /// // Opt out: no conversion performed
316 /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
317 /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
318 /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
319 /// assert!(!num_off.synonyms.contains(&String::from("123")));
320 /// ```
321 pub fn number_normalize(mut self, v: bool) -> Self {
322 self.number_normalize = Some(v);
323 self
324 }
325
326 /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
327 ///
328 /// When set, each Thai and Named token whose text contains Thai consonants gets its
329 /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
330 /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
331 ///
332 /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
333 /// 4-character codes and are the recommended choices for FTS indexing.
334 /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
335 /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
336 ///
337 /// Disabled by default — call this method to opt in.
338 ///
339 /// # Example
340 ///
341 /// ```rust
342 /// use kham_core::fts::FtsTokenizer;
343 /// use kham_core::soundex::{lk82, SoundexAlgorithm};
344 /// use kham_core::stopwords::StopwordSet;
345 ///
346 /// let fts = FtsTokenizer::builder()
347 /// .soundex(SoundexAlgorithm::Lk82)
348 /// .stopwords(StopwordSet::from_text(""))
349 /// .build();
350 /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
351 /// for word in &["กาน", "ขาน", "คาน"] {
352 /// let tokens = fts.segment_for_fts(word);
353 /// let t = tokens.first().unwrap();
354 /// assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
355 /// }
356 /// ```
357 pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
358 self.soundex = Some(algo);
359 self
360 }
361
362 /// Overlay extra words on the built-in dictionary without a full trie rebuild.
363 ///
364 /// Words are stored in a sorted list alongside the pre-compiled trie.
365 /// Prefer this over a full rebuild when adding a small domain-specific
366 /// vocabulary (e.g. product names, technical terms).
367 ///
368 /// Newline-separated; `#` lines are ignored.
369 ///
370 /// # Example
371 ///
372 /// ```rust
373 /// use kham_core::fts::FtsTokenizer;
374 /// use kham_core::TokenKind;
375 ///
376 /// let fts = FtsTokenizer::builder()
377 /// .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
378 /// .build();
379 /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
380 /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
381 /// ```
382 pub fn dict_merge(mut self, words: &str) -> Self {
383 self.dict_merge = Some(String::from(words));
384 self
385 }
386
387 /// Consume the builder and return a configured [`FtsTokenizer`].
388 ///
389 /// # Example
390 ///
391 /// ```rust
392 /// use kham_core::fts::FtsTokenizer;
393 /// use kham_core::soundex::SoundexAlgorithm;
394 /// use kham_core::stopwords::StopwordSet;
395 ///
396 /// let fts = FtsTokenizer::builder()
397 /// .soundex(SoundexAlgorithm::Lk82)
398 /// .stopwords(StopwordSet::from_text(""))
399 /// .build();
400 /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
401 /// ```
402 pub fn build(self) -> FtsTokenizer {
403 let tokenizer = if let Some(ref words) = self.dict_merge {
404 Tokenizer::builder().dict_merge(words).build()
405 } else {
406 Tokenizer::new()
407 };
408 FtsTokenizer {
409 tokenizer,
410 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
411 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
412 ngram_size: self.ngram_size.unwrap_or(3),
413 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
414 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
415 romanization: self.romanization,
416 abbrev_map: self.abbrev_map,
417 number_normalize: self.number_normalize.unwrap_or(true),
418 soundex: self.soundex,
419 }
420 }
421}
422
423/// Full-text search tokenizer for Thai text.
424///
425/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
426/// generation for out-of-vocabulary tokens.
427///
428/// Construct once and reuse:
429///
430/// ```rust
431/// use kham_core::fts::FtsTokenizer;
432///
433/// let fts = FtsTokenizer::new();
434/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
435/// assert!(!tokens.is_empty());
436/// ```
pub struct FtsTokenizer {
    /// Core tokenizer used to normalise and segment the input text.
    tokenizer: Tokenizer,
    /// Set consulted to flag each token's `is_stop`.
    stopwords: StopwordSet,
    /// Map consulted for each token's synonym expansions.
    synonyms: SynonymMap,
    /// N-gram size for `TokenKind::Unknown` tokens; 0 disables n-grams.
    ngram_size: usize,
    /// Tagger consulted for tokens whose kind is `TokenKind::Thai`.
    pos_tagger: PosTagger,
    /// Gazetteer applied to the segmented tokens before annotation.
    ne_tagger: NeTagger,
    /// When present, romanized forms are appended to Thai/Named synonyms.
    romanization: Option<RomanizationMap>,
    /// When present, abbreviations are expanded before segmentation.
    abbrev_map: Option<AbbrevMap>,
    /// Whether Thai-digit and Thai number-word synonyms are generated.
    number_normalize: bool,
    /// When present, a soundex code is appended to Thai/Named synonyms.
    soundex: Option<SoundexAlgorithm>,
}
449
450impl FtsTokenizer {
451 /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
452 ///
453 /// # Example
454 ///
455 /// ```rust
456 /// use kham_core::fts::FtsTokenizer;
457 ///
458 /// let fts = FtsTokenizer::new();
459 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
460 /// // Built-in stopword กับ is excluded; content words are present
461 /// assert!(!lexemes.contains(&String::from("กับ")));
462 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
463 /// ```
464 pub fn new() -> Self {
465 FtsTokenizerBuilder::default().build()
466 }
467
468 /// Return a [`FtsTokenizerBuilder`] for custom configuration.
469 ///
470 /// # Example
471 ///
472 /// ```rust
473 /// use kham_core::fts::FtsTokenizer;
474 /// use kham_core::soundex::SoundexAlgorithm;
475 /// use kham_core::synonym::SynonymMap;
476 ///
477 /// let fts = FtsTokenizer::builder()
478 /// .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
479 /// .soundex(SoundexAlgorithm::Lk82)
480 /// .build();
481 /// assert!(!fts.segment_for_fts("รถ").is_empty());
482 /// ```
483 pub fn builder() -> FtsTokenizerBuilder {
484 FtsTokenizerBuilder::default()
485 }
486
487 /// Segment `text` and annotate each token for FTS indexing.
488 ///
489 /// Normalises the input text before segmentation so that สระลอย and stacked
490 /// tone marks are handled correctly. Whitespace tokens are excluded.
491 ///
492 /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
493 /// [`index_tokens`] instead when you only need the tokens to be indexed
494 /// (stopwords excluded).
495 ///
496 /// [`index_tokens`]: FtsTokenizer::index_tokens
497 ///
498 /// # Examples
499 ///
500 /// ```rust
501 /// use kham_core::fts::FtsTokenizer;
502 ///
503 /// let fts = FtsTokenizer::new();
504 /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
505 /// // Positions are 0-based and sequential across non-whitespace tokens
506 /// for (i, t) in tokens.iter().enumerate() {
507 /// assert_eq!(t.position, i);
508 /// }
509 /// // กับ is a common conjunction — marked as a stopword
510 /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
511 /// assert!(kap.is_stop);
512 /// ```
513 ///
514 /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
515 ///
516 /// ```rust
517 /// use kham_core::fts::FtsTokenizer;
518 /// use kham_core::TokenKind;
519 ///
520 /// let fts = FtsTokenizer::new();
521 /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
522 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
523 /// ```
524 ///
525 /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
526 ///
527 /// ```rust
528 /// use kham_core::fts::FtsTokenizer;
529 /// use kham_core::soundex::SoundexAlgorithm;
530 ///
531 /// let fts = FtsTokenizer::builder()
532 /// .soundex(SoundexAlgorithm::Lk82)
533 /// .build();
534 /// let tokens = fts.segment_for_fts("กิน");
535 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
536 /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
537 /// assert!(!t.synonyms.is_empty());
538 /// ```
539 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
540 let normalized = self.tokenizer.normalize(text);
541 // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
542 // dot-containing patterns are replaced as single units.
543 let expanded = match self.abbrev_map.as_ref() {
544 Some(am) => am.expand_text(&normalized),
545 None => normalized,
546 };
547 let raw_tokens = self
548 .ne_tagger
549 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
550
551 let mut result = Vec::with_capacity(raw_tokens.len());
552 let mut position = 0usize;
553
554 for token in &raw_tokens {
555 if token.kind == TokenKind::Whitespace {
556 continue;
557 }
558
559 let is_stop = self.stopwords.contains(token.text);
560 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
561 let mut synonyms = self
562 .synonyms
563 .expand(token.text)
564 .map(|s| s.to_vec())
565 .unwrap_or_default();
566 if is_thai_or_named {
567 if let Some(ref rom) = self.romanization {
568 if let Some(rtgs) = rom.romanize(token.text) {
569 synonyms.push(String::from(rtgs));
570 }
571 }
572 if let Some(algo) = self.soundex {
573 let code = soundex(token.text, algo);
574 if !code.chars().all(|c| c == '0') {
575 synonyms.push(code);
576 }
577 }
578 }
579 if self.number_normalize {
580 match token.kind {
581 // Number token with Thai digits → add ASCII form as synonym.
582 TokenKind::Number => {
583 let ascii = thai_digits_to_ascii(token.text);
584 if ascii != token.text {
585 synonyms.push(ascii);
586 }
587 }
588 // Thai token that is a recognised number word → add decimal string.
589 TokenKind::Thai => {
590 if let Some(decimal) = thai_word_to_decimal(token.text) {
591 synonyms.push(decimal);
592 }
593 }
594 _ => {}
595 }
596 }
597 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
598 char_ngrams(token.text, self.ngram_size)
599 .map(String::from)
600 .collect()
601 } else {
602 Vec::new()
603 };
604 let ne = if let TokenKind::Named(k) = token.kind {
605 Some(k)
606 } else {
607 None
608 };
609 let pos = if token.kind == TokenKind::Thai {
610 self.pos_tagger.tag(token.text)
611 } else {
612 None
613 };
614
615 result.push(FtsToken {
616 text: String::from(token.text),
617 position,
618 kind: token.kind,
619 is_stop,
620 synonyms,
621 trigrams,
622 pos,
623 ne,
624 });
625
626 position += 1;
627 }
628
629 result
630 }
631
632 /// Return only the tokens to be written into a search index.
633 ///
634 /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
635 /// its original `position` so phrase-distance scoring remains correct.
636 ///
637 /// # Example
638 ///
639 /// ```rust
640 /// use kham_core::fts::FtsTokenizer;
641 ///
642 /// let fts = FtsTokenizer::new();
643 /// let tokens = fts.index_tokens("กินข้าวกับปลา");
644 /// // No stopwords in the index
645 /// assert!(tokens.iter().all(|t| !t.is_stop));
646 /// // Positions are preserved from the full sequence for phrase scoring
647 /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
648 /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
649 /// ```
650 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
651 self.segment_for_fts(text)
652 .into_iter()
653 .filter(|t| !t.is_stop)
654 .collect()
655 }
656
657 /// Return a streaming iterator over the FTS tokens for `text`.
658 ///
659 /// Equivalent to [`segment_for_fts`] but wraps the result in an
660 /// [`FtsTokenStream`] so callers can consume tokens one at a time.
661 /// Use [`FtsTokenStream::next_index_token`] to skip stopwords automatically.
662 ///
663 /// The full token list is materialised internally because the NE tagger
664 /// requires multi-token context; this is a streaming *consumer*, not a
665 /// lazy producer.
666 ///
667 /// # Example
668 ///
669 /// ```rust
670 /// use kham_core::fts::FtsTokenizer;
671 ///
672 /// let fts = FtsTokenizer::new();
673 /// let mut stream = fts.segment_stream("กินข้าวกับปลา");
674 /// let mut index_texts: Vec<String> = Vec::new();
675 /// while let Some(tok) = stream.next_index_token() {
676 /// index_texts.push(tok.text);
677 /// }
678 /// // กับ is a stopword — it should not appear in index_texts
679 /// assert!(!index_texts.contains(&String::from("กับ")));
680 /// assert!(index_texts.iter().any(|t| t == "กิน" || t == "ปลา"));
681 /// ```
682 ///
683 /// [`segment_for_fts`]: FtsTokenizer::segment_for_fts
684 pub fn segment_stream(&self, text: &str) -> FtsTokenStream {
685 FtsTokenStream {
686 inner: self.segment_for_fts(text).into_iter(),
687 }
688 }
689
690 /// Collect all lexeme strings to be stored in a `tsvector`.
691 ///
692 /// Returns one string per non-stop token, plus synonym expansions and
693 /// trigrams for unknown tokens. Duplicates are not removed (the caller or
694 /// PostgreSQL handles deduplication).
695 ///
696 /// # Example
697 ///
698 /// ```rust
699 /// use kham_core::fts::FtsTokenizer;
700 ///
701 /// let fts = FtsTokenizer::new();
702 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
703 /// // Content words are present; stopword กับ is absent
704 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
705 /// assert!(!lexemes.contains(&String::from("กับ")));
706 /// ```
707 ///
708 /// With Thai digit normalization (enabled by default), both scripts match:
709 ///
710 /// ```rust
711 /// use kham_core::fts::FtsTokenizer;
712 ///
713 /// let fts = FtsTokenizer::new();
714 /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
715 /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
716 /// assert!(lexemes.contains(&String::from("100")));
717 /// ```
718 pub fn lexemes(&self, text: &str) -> Vec<String> {
719 let tokens = self.index_tokens(text);
720 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
721 for t in tokens {
722 out.push(t.text.clone());
723 out.extend(t.synonyms);
724 out.extend(t.trigrams);
725 }
726 out
727 }
728}
729
730impl Default for FtsTokenizer {
731 fn default() -> Self {
732 Self::new()
733 }
734}
735
736// ---------------------------------------------------------------------------
737// Tests
738// ---------------------------------------------------------------------------
739
740#[cfg(test)]
741mod tests {
742 use super::*;
743 use crate::stopwords::StopwordSet;
744 use crate::synonym::SynonymMap;
745
746 fn fts() -> FtsTokenizer {
747 FtsTokenizer::new()
748 }
749
750 // ── segment_for_fts ───────────────────────────────────────────────────────
751
752 #[test]
753 fn empty_input_returns_empty() {
754 assert!(fts().segment_for_fts("").is_empty());
755 }
756
757 #[test]
758 fn whitespace_tokens_excluded() {
759 let tokens = fts().segment_for_fts("กิน ข้าว");
760 assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
761 }
762
763 #[test]
764 fn positions_are_sequential() {
765 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
766 for (i, t) in tokens.iter().enumerate() {
767 assert_eq!(t.position, i, "position mismatch at index {i}");
768 }
769 }
770
771 #[test]
772 fn known_stopword_is_tagged() {
773 // "กับ" is a common conjunction and should be in the built-in stopword list
774 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
775 let kap = tokens.iter().find(|t| t.text == "กับ");
776 assert!(kap.is_some(), "expected 'กับ' token");
777 assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
778 }
779
780 #[test]
781 fn content_words_not_tagged_as_stop() {
782 let tokens = fts().segment_for_fts("โรงพยาบาล");
783 // May be OOV but should not be a stopword
784 for t in &tokens {
785 assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
786 }
787 }
788
789 #[test]
790 fn text_is_reconstructable() {
791 // All tokens joined == normalised input (whitespace dropped)
792 let fts = fts();
793 let text = "กินข้าวกับปลา";
794 let normalized = fts.tokenizer.normalize(text);
795 let tokens = fts.segment_for_fts(text);
796 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
797 assert_eq!(rebuilt, normalized);
798 }
799
800 // ── synonym expansion ─────────────────────────────────────────────────────
801
802 #[test]
803 fn synonym_expansion_attached() {
804 let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
805 let fts = FtsTokenizer::builder()
806 .synonyms(synonyms)
807 .stopwords(StopwordSet::from_text(""))
808 .build();
809 // Segment a text containing "คอม" — need it in dict or it lands as Unknown
810 // Use builder with custom word so the segmenter recognises it
811 let tokens = fts.segment_for_fts("คอม");
812 let t = tokens.iter().find(|t| t.text == "คอม");
813 if let Some(tok) = t {
814 assert!(
815 tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
816 "expected synonym expansion, got {:?}",
817 tok.synonyms
818 );
819 }
820 }
821
822 #[test]
823 fn no_synonyms_when_map_empty() {
824 let tokens = fts().segment_for_fts("กินข้าว");
825 for t in &tokens {
826 assert!(t.synonyms.is_empty());
827 }
828 }
829
830 // ── unknown token trigrams ────────────────────────────────────────────────
831
832 #[test]
833 fn unknown_token_gets_trigrams() {
834 // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
835 // With ngram_size=2 the token should yield one bigram ("กิ").
836 // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
837 // (like "กิ") are the shortest unit that can produce n-grams.
838 let fts = FtsTokenizer::builder()
839 .ngram_size(2)
840 .stopwords(StopwordSet::from_text(""))
841 .build();
842 let tokens = fts.segment_for_fts("กิ");
843 let unknown: Vec<_> = tokens
844 .iter()
845 .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
846 .collect();
847 assert!(
848 !unknown.is_empty(),
849 "expected at least one multi-char Unknown token for 'กิ'"
850 );
851 for u in &unknown {
852 assert!(
853 !u.trigrams.is_empty(),
854 "unknown token '{}' ({} chars) should have bigrams",
855 u.text,
856 u.text.chars().count()
857 );
858 }
859 }
860
861 #[test]
862 fn known_thai_token_has_no_trigrams() {
863 let tokens = fts().segment_for_fts("กิน");
864 for t in &tokens {
865 if t.kind == TokenKind::Thai {
866 assert!(
867 t.trigrams.is_empty(),
868 "known Thai token '{}' should not have trigrams",
869 t.text
870 );
871 }
872 }
873 }
874
875 #[test]
876 fn ngram_size_zero_disables_trigrams() {
877 let fts = FtsTokenizer::builder()
878 .ngram_size(0)
879 .stopwords(StopwordSet::from_text(""))
880 .build();
881 let tokens = fts.segment_for_fts("กขคง");
882 for t in &tokens {
883 assert!(t.trigrams.is_empty());
884 }
885 }
886
887 // ── index_tokens ──────────────────────────────────────────────────────────
888
889 #[test]
890 fn index_tokens_excludes_stopwords() {
891 let tokens = fts().index_tokens("กินข้าวกับปลา");
892 assert!(tokens.iter().all(|t| !t.is_stop));
893 }
894
895 #[test]
896 fn index_tokens_preserves_positions() {
897 // Positions in index_tokens must be a subset of segment_for_fts positions
898 let all = fts().segment_for_fts("กินข้าวกับปลา");
899 let indexed = fts().index_tokens("กินข้าวกับปลา");
900 for t in &indexed {
901 assert!(
902 all.iter().any(|a| a.position == t.position),
903 "indexed token at position {} not found in full token list",
904 t.position
905 );
906 }
907 }
908
909 // ── lexemes ───────────────────────────────────────────────────────────────
910
911 #[test]
912 fn lexemes_returns_non_stop_texts() {
913 let lexemes = fts().lexemes("กินข้าวกับปลา");
914 // "กับ" is a stopword — should not appear
915 assert!(!lexemes.contains(&String::from("กับ")));
916 // Content words should appear
917 assert!(
918 lexemes
919 .iter()
920 .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
921 "expected content words in lexemes: {lexemes:?}"
922 );
923 }
924
925 #[test]
926 fn lexemes_empty_input_is_empty() {
927 assert!(fts().lexemes("").is_empty());
928 }
929
930 // ── multi-token NE ────────────────────────────────────────────────────────
931
932 #[test]
933 fn multi_token_ne_merged_in_pipeline() {
934 // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
935 // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
936 let fts = FtsTokenizer::new();
937 let tokens = fts.segment_for_fts("ไปกรุงเทพ");
938 let named: Vec<_> = tokens
939 .iter()
940 .filter(|t| matches!(t.kind, TokenKind::Named(_)))
941 .collect();
942 assert!(
943 named.iter().any(|t| t.text == "กรุงเทพ"),
944 "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
945 tokens
946 .iter()
947 .map(|t| (&t.text, &t.kind))
948 .collect::<alloc::vec::Vec<_>>()
949 );
950 }
951
#[test]
fn multi_token_ne_reconstructable() {
    // Concatenating the text of every emitted token must give back the
    // normalized input, even after NE pieces have been merged.
    let input = "ไปกรุงเทพ";
    let fts = FtsTokenizer::new();
    let expected = fts.tokenizer.normalize(input);

    let mut rebuilt = String::new();
    for tok in &fts.segment_for_fts(input) {
        rebuilt.push_str(tok.text.as_str());
    }
    assert_eq!(rebuilt, expected);
}
962
963 // ── builder ───────────────────────────────────────────────────────────────
964
#[test]
fn builder_custom_stopwords() {
    // A custom stopword list containing only "กิน" must flag that word as a
    // stop token.
    let stops = StopwordSet::from_text("กิน\n");
    let fts = FtsTokenizer::builder().stopwords(stops).build();

    // Single-word input: the word is its own token, so this assertion always
    // runs. Without it the test could pass vacuously whenever the phrase
    // below is segmented differently.
    let single = fts.segment_for_fts("กิน");
    let gin = single
        .iter()
        .find(|t| t.text == "กิน")
        .expect("expected token 'กิน'");
    assert!(gin.is_stop, "'กิน' should be stop with custom list");

    // Inside a phrase the segmentation is dictionary-dependent, so only the
    // occurrences that actually come out as "กิน" are checked.
    let phrase = fts.segment_for_fts("กินข้าว");
    for t in phrase.iter().filter(|t| t.text == "กิน") {
        assert!(t.is_stop, "'กิน' should be stop with custom list");
    }
}
975
#[test]
fn builder_default_equals_new() {
    // A builder with no overrides must behave like FtsTokenizer::new() on a
    // simple input.
    let input = "กินข้าว";
    let from_new = FtsTokenizer::new().lexemes(input);
    let from_builder = FtsTokenizer::builder().build().lexemes(input);
    assert_eq!(from_new, from_builder);
}
983
984 // ── number normalization ──────────────────────────────────────────────────
985
#[test]
fn thai_digit_token_gets_ascii_synonym() {
    // Thai digits ๑๒๓ should be indexed with their ASCII form "123" attached
    // as a synonym.
    let tokens = FtsTokenizer::new().segment_for_fts("๑๒๓");
    let number = tokens
        .iter()
        .find(|t| t.kind == TokenKind::Number)
        .expect("expected a Number token");
    let has_ascii = number.synonyms.iter().any(|s| s == "123");
    assert!(
        has_ascii,
        "Thai digit token should have ASCII synonym, got {:?}",
        number.synonyms
    );
}
999
#[test]
fn ascii_digit_token_has_no_extra_synonym() {
    // The input is already ASCII digits, so there is nothing to convert and
    // the token must not list its own text as a synonym.
    let tokens = FtsTokenizer::new().segment_for_fts("123");
    let number = tokens
        .iter()
        .find(|t| t.kind == TokenKind::Number)
        .expect("expected a Number token");
    let echoes_itself = number.synonyms.iter().any(|s| s == "123");
    assert!(
        !echoes_itself,
        "ASCII digit token should not duplicate itself as a synonym"
    );
}
1012
#[test]
fn thai_number_word_gets_decimal_synonym() {
    // The dictionary decides whether หนึ่งร้อย stays whole or is split into
    // หนึ่ง + ร้อย, so the check is deliberately loose: any token carrying
    // the decimal synonym "100" satisfies it (ร้อย on its own is expected
    // to convert to 100).
    let tokens = FtsTokenizer::new().segment_for_fts("หนึ่งร้อย");
    let has_hundred = tokens
        .iter()
        .flat_map(|t| t.synonyms.iter())
        .any(|s| s == "100");
    assert!(
        has_hundred,
        "expected a token with decimal synonym '100', tokens: {:?}",
        tokens
            .iter()
            .map(|t| (&t.text, &t.synonyms))
            .collect::<alloc::vec::Vec<_>>()
    );
}
1032
#[test]
fn number_normalize_false_disables_conversion() {
    // With number normalization switched off, a Thai-digit token must not
    // gain the ASCII synonym "123".
    let no_stops = StopwordSet::from_text("");
    let fts = FtsTokenizer::builder()
        .number_normalize(false)
        .stopwords(no_stops)
        .build();
    let tokens = fts.segment_for_fts("๑๒๓");
    let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
    assert!(num.is_some());
    let converted = num.map_or(false, |t| t.synonyms.iter().any(|s| s == "123"));
    assert!(
        !converted,
        "number_normalize=false should suppress ASCII synonym"
    );
}
1047
#[test]
fn mixed_thai_digit_in_context() {
    // In "ธนาคาร๑๐๐แห่ง" the ๑๐๐ run should surface as a Number token that
    // carries the ASCII synonym "100".
    let tokens = FtsTokenizer::new().segment_for_fts("ธนาคาร๑๐๐แห่ง");
    let number = tokens
        .iter()
        .find(|t| t.kind == TokenKind::Number)
        .expect("expected Number token in mixed string");
    let has_ascii = number.synonyms.iter().any(|s| s == "100");
    assert!(has_ascii, "expected ASCII synonym '100' for ๑๐๐");
}
1060
1061 // ── abbreviation expansion ────────────────────────────────────────────────
1062
#[test]
fn abbrev_map_expands_before_segmentation() {
    use crate::abbrev::AbbrevMap;
    let fts = FtsTokenizer::builder()
        .abbrevs(AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
    // expansion further (กรกฎา + คม), so the checks run on the concatenated
    // token text rather than on individual tokens.
    let tokens = fts.segment_for_fts("ก.ค.");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let joined: String = texts.concat();
    // "กรกฎาคม" contains "กรกฎา", so the prefix check covers both the split
    // and the unsplit segmentation.
    assert!(
        joined.contains("กรกฎา"),
        "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
    );
    // No dot may survive anywhere: neither as a standalone "." token nor
    // glued inside a longer token.
    assert!(
        !joined.contains('.'),
        "dots should be consumed by abbrev expansion, got: {texts:?}"
    );
}
1085
1086 // ── segment_stream / FtsTokenStream ──────────────────────────────────────
1087
#[test]
fn segment_stream_yields_all_non_whitespace_tokens() {
    // Draining the stream must yield exactly the tokens of the Vec-returning API.
    let input = "กินข้าวกับปลา";
    let fts = fts();
    let expected = fts.segment_for_fts(input);
    let mut streamed: Vec<FtsToken> = Vec::new();
    streamed.extend(fts.segment_stream(input));
    assert_eq!(expected, streamed);
}
1095
#[test]
fn segment_stream_empty_input() {
    // A stream over empty input is exhausted on the very first pull.
    let first = fts().segment_stream("").next();
    assert!(first.is_none());
}
1101
#[test]
fn next_index_token_skips_stopwords() {
    // Pull tokens through next_index_token and keep only their text.
    let tokenizer = fts();
    let mut stream = tokenizer.segment_stream("กินข้าวกับปลา");
    let mut yielded: Vec<String> = Vec::new();
    while let Some(tok) = stream.next_index_token() {
        yielded.push(tok.text);
    }

    // The stopword กับ must never come out of next_index_token.
    let saw_stopword = yielded.iter().any(|t| t == "กับ");
    assert!(!saw_stopword, "stopword กับ must be skipped");

    // At least one of the content words กิน / ปลา must be present.
    let saw_content = yielded.iter().any(|t| ["กิน", "ปลา"].contains(&t.as_str()));
    assert!(saw_content, "content words must be yielded");
}
1119
#[test]
fn next_index_token_matches_index_tokens() {
    // Repeatedly calling next_index_token must produce the same token
    // sequence as the batch index_tokens API.
    let input = "กินข้าวกับปลา";
    let tokenizer = fts();
    let expected: Vec<_> = tokenizer.index_tokens(input);

    let mut stream = tokenizer.segment_stream(input);
    let mut streamed = Vec::new();
    while let Some(tok) = stream.next_index_token() {
        streamed.push(tok);
    }
    assert_eq!(expected, streamed);
}
1132
#[test]
fn stream_size_hint_is_correct() {
    // A fresh stream must report an exact size hint equal to the number of
    // tokens segment_for_fts returns for the same input.
    let input = "กินข้าวกับปลา";
    let tokenizer = fts();
    let expected = tokenizer.segment_for_fts(input).len();
    let hint = tokenizer.segment_stream(input).size_hint();
    assert_eq!(hint, (expected, Some(expected)));
}
1141
#[test]
fn abbrev_expansion_disabled_by_default() {
    // FtsTokenizer::new() is built without an abbreviation map, so ก.ค. is
    // not expanded and its dots must still show up as punctuation tokens.
    let tokens = FtsTokenizer::new().segment_for_fts("ก.ค.");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let has_dot_token = texts.iter().any(|&t| t == ".");
    assert!(
        has_dot_token,
        "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
    );
}
1154
1155 // ── soundex synonyms ──────────────────────────────────────────────────────
1156
#[test]
fn soundex_lk82_appended_to_thai_synonyms() {
    use crate::soundex::lk82;
    // With Lk82 enabled, the token กิน must carry the code that lk82()
    // computes for the same word.
    let no_stops = StopwordSet::from_text("");
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(no_stops)
        .build();
    let tokens = fts.segment_for_fts("กิน");
    let token = tokens
        .iter()
        .find(|t| t.text == "กิน")
        .expect("expected token 'กิน'");
    let expected_code = lk82("กิน");
    let has_code = token.synonyms.contains(&expected_code);
    assert!(
        has_code,
        "expected lk82 code '{expected_code}' in synonyms, got {:?}",
        token.synonyms
    );
}
1174
#[test]
fn soundex_not_emitted_by_default() {
    // The builder was not given .soundex(), so no synonym may look like a
    // soundex code. The heuristic used here: exactly 4 bytes, all ASCII
    // alphanumeric (the shape of lk82/udom83 codes).
    let tokens = FtsTokenizer::new().segment_for_fts("กินข้าว");
    let pairs = tokens
        .iter()
        .flat_map(|t| t.synonyms.iter().map(move |syn| (t, syn)));
    for (t, syn) in pairs {
        let looks_like_soundex = syn.len() == 4 && syn.bytes().all(|b| b.is_ascii_alphanumeric());
        assert!(
            !looks_like_soundex,
            "unexpected soundex-like synonym '{}' on token '{}'",
            syn, t.text
        );
    }
}
1193
#[test]
fn soundex_same_sounding_words_share_code_in_index() {
    use crate::soundex::lk82;
    // กาน, ขาน and คาน are all expected to carry the lk82 code computed for
    // กาน as a synonym on their first token.
    let no_stops = StopwordSet::from_text("");
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(no_stops)
        .build();
    let code = lk82("กาน");
    let homophones = ["กาน", "ขาน", "คาน"];
    for word in homophones.iter() {
        let tokens = fts.segment_for_fts(word);
        let head = tokens.first().expect("expected at least one token");
        let carries_code = head.synonyms.contains(&code);
        assert!(
            carries_code,
            "'{word}' should carry lk82 code '{code}', got {:?}",
            head.synonyms
        );
    }
}
1213
#[test]
fn soundex_not_emitted_for_non_thai_tokens() {
    // Even with Lk82 enabled, Latin and numeric tokens must not receive a
    // synonym shaped like a soundex code (4 bytes, all ASCII alphanumeric).
    let no_stops = StopwordSet::from_text("");
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(no_stops)
        .build();
    let tokens = fts.segment_for_fts("hello 123");
    let pairs = tokens
        .iter()
        .flat_map(|t| t.synonyms.iter().map(move |syn| (t, syn)));
    for (t, syn) in pairs {
        let looks_like_soundex = syn.len() == 4 && syn.bytes().all(|b| b.is_ascii_alphanumeric());
        assert!(
            !looks_like_soundex,
            "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
            t.text
        );
    }
}
1233
#[test]
fn soundex_udom83_appended() {
    use crate::soundex::udom83;
    // With Udom83 enabled, the token กิน must carry the code that udom83()
    // computes for the same word.
    let no_stops = StopwordSet::from_text("");
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Udom83)
        .stopwords(no_stops)
        .build();
    let tokens = fts.segment_for_fts("กิน");
    let token = tokens.iter().find(|t| t.text == "กิน").unwrap();
    let expected = udom83("กิน");
    let has_code = token.synonyms.contains(&expected);
    assert!(
        has_code,
        "expected udom83 code '{expected}' in synonyms, got {:?}",
        token.synonyms
    );
}
1250
#[test]
fn abbrev_expansion_date_sentence() {
    use crate::abbrev::AbbrevMap;
    let fts = FtsTokenizer::builder()
        .abbrevs(AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    // พ.ศ. → พุทธศักราช; the segmenter may split the expansion further, so
    // the checks run on the concatenated token text rather than on
    // individual tokens.
    let tokens = fts.segment_for_fts("พ.ศ.2567");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let joined: String = texts.concat();
    // "พุทธศักราช" contains "พุทธ", so the prefix check covers both the
    // split and the unsplit segmentation.
    assert!(
        joined.contains("พุทธ"),
        "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
    );
    // No dot may survive anywhere: neither as a standalone "." token nor
    // glued inside a longer token.
    assert!(
        !joined.contains('.'),
        "dots should be consumed by expansion, got: {texts:?}"
    );
}
1272}