kham_core/fts.rs
1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//! println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A streaming iterator over [`FtsToken`]s produced by the FTS pipeline.
43///
44/// Returned by [`FtsTokenizer::segment_stream`]. Internally holds the full
45/// `Vec<FtsToken>` as an [`alloc::vec::IntoIter`]; the streaming API is provided
46/// so callers can consume tokens one at a time without materialising a second
47/// collection.
48///
49/// # Example
50///
51/// ```rust
52/// use kham_core::fts::FtsTokenizer;
53///
54/// let fts = FtsTokenizer::new();
55/// let mut stream = fts.segment_stream("กินข้าวกับปลา");
56/// // next_index_token() skips stopwords — กับ is a stopword and is skipped.
57/// while let Some(tok) = stream.next_index_token() {
58/// println!("{} pos={}", tok.text, tok.position);
59/// }
60/// ```
61pub struct FtsTokenStream {
62 inner: alloc::vec::IntoIter<FtsToken>,
63}
64
65impl FtsTokenStream {
66 /// Advance to the next token that should be written into the search index,
67 /// skipping stopwords.
68 ///
69 /// Equivalent to calling [`Iterator::next`] in a loop until a token with
70 /// `is_stop == false` is found, or the stream is exhausted.
71 pub fn next_index_token(&mut self) -> Option<FtsToken> {
72 self.inner.by_ref().find(|t| !t.is_stop)
73 }
74}
75
76impl Iterator for FtsTokenStream {
77 type Item = FtsToken;
78
79 #[inline]
80 fn next(&mut self) -> Option<FtsToken> {
81 self.inner.next()
82 }
83
84 #[inline]
85 fn size_hint(&self) -> (usize, Option<usize>) {
86 self.inner.size_hint()
87 }
88}
89
90/// A token produced by the FTS pipeline, ready for lexeme indexing.
91#[derive(Debug, Clone, PartialEq)]
92pub struct FtsToken {
93 /// The token text (owned; may be normalised).
94 pub text: String,
95 /// Ordinal position in the token sequence (0-based, gaps for whitespace).
96 pub position: usize,
97 /// Script / category of the original token.
98 pub kind: TokenKind,
99 /// `true` if this token matches the stopword list.
100 pub is_stop: bool,
101 /// Synonym expansions (empty if none configured or no match).
102 pub synonyms: Vec<String>,
103 /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
104 pub trigrams: Vec<String>,
105 /// Primary part-of-speech tag from the lookup table, or `None` if the word
106 /// is not in the table (OOV) or is not a Thai token.
107 pub pos: Option<PosTag>,
108 /// Named entity category, or `None` if the token is not in the NE
109 /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
110 pub ne: Option<NamedEntityKind>,
111 /// Segmentation confidence in the range `[0.0, 1.0]`.
112 /// `0.0` = Unknown token (no dictionary evidence).
113 /// `1.0` = unambiguous high-frequency dictionary match.
114 pub confidence: f32,
115}
116
/// Builder for [`FtsTokenizer`].
///
/// Every field starts as `None` (via `Default`). [`FtsTokenizerBuilder::build`]
/// substitutes the built-in or default value for each option that was never
/// set.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    /// Custom stopword set; `None` means "use `StopwordSet::builtin()`".
    stopwords: Option<StopwordSet>,
    /// Custom synonym map; `None` means "use `SynonymMap::empty()`".
    synonyms: Option<SynonymMap>,
    /// N-gram size for unknown tokens; `None` means "use default (3)".
    ngram_size: Option<usize>,
    /// Custom POS tagger; `None` means "use `PosTagger::builtin()`".
    pos_tagger: Option<PosTagger>,
    /// Custom NE gazetteer; `None` means "use `NeTagger::builtin()`".
    ne_tagger: Option<NeTagger>,
    /// Romanization map; `None` leaves romanized synonyms disabled.
    romanization: Option<RomanizationMap>,
    /// Abbreviation map; `None` leaves abbreviation expansion disabled.
    abbrev_map: Option<AbbrevMap>,
    /// `None` means "use default (true)".
    number_normalize: Option<bool>,
    /// Soundex algorithm; `None` leaves phonetic synonyms disabled.
    soundex: Option<SoundexAlgorithm>,
    /// Extra words to overlay on top of the built-in dictionary (fast path).
    dict_merge: Option<String>,
}
133
134impl FtsTokenizerBuilder {
135 /// Use a custom stopword set instead of the built-in list.
136 ///
137 /// # Example
138 ///
139 /// ```rust
140 /// use kham_core::fts::FtsTokenizer;
141 /// use kham_core::stopwords::StopwordSet;
142 ///
143 /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
144 /// let fts = FtsTokenizer::builder().stopwords(stops).build();
145 /// let tokens = fts.segment_for_fts("กินข้าว");
146 /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
147 /// ```
148 pub fn stopwords(mut self, s: StopwordSet) -> Self {
149 self.stopwords = Some(s);
150 self
151 }
152
153 /// Attach a synonym map for expansion.
154 ///
155 /// # Example
156 ///
157 /// ```rust
158 /// use kham_core::fts::FtsTokenizer;
159 /// use kham_core::synonym::SynonymMap;
160 ///
161 /// // TSV: canonical TAB synonym1 TAB synonym2 …
162 /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
163 /// let fts = FtsTokenizer::builder().synonyms(syns).build();
164 /// let tokens = fts.segment_for_fts("รถ");
165 /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
166 /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
167 /// ```
168 pub fn synonyms(mut self, m: SynonymMap) -> Self {
169 self.synonyms = Some(m);
170 self
171 }
172
173 /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
174 ///
175 /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
176 ///
177 /// # Example
178 ///
179 /// ```rust
180 /// use kham_core::fts::FtsTokenizer;
181 /// use kham_core::stopwords::StopwordSet;
182 ///
183 /// // Disable n-grams entirely — useful when index size must be small
184 /// let fts = FtsTokenizer::builder()
185 /// .ngram_size(0)
186 /// .stopwords(StopwordSet::from_text(""))
187 /// .build();
188 /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
189 /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
190 /// ```
191 pub fn ngram_size(mut self, n: usize) -> Self {
192 self.ngram_size = Some(n);
193 self
194 }
195
196 /// Use a custom POS tagger instead of the built-in table.
197 ///
198 /// # Example
199 ///
200 /// ```rust
201 /// use kham_core::fts::FtsTokenizer;
202 /// use kham_core::pos::{PosTag, PosTagger};
203 ///
204 /// // Custom TSV: word TAB POS_TAG
205 /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
206 /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
207 /// // Segment กิน alone so it is not merged into a compound
208 /// let tokens = fts.segment_for_fts("กิน");
209 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
210 /// assert_eq!(t.pos, Some(PosTag::Verb));
211 /// ```
212 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
213 self.pos_tagger = Some(t);
214 self
215 }
216
217 /// Use a custom NE gazetteer instead of the built-in table.
218 ///
219 /// # Example
220 ///
221 /// ```rust
222 /// use kham_core::fts::FtsTokenizer;
223 /// use kham_core::ne::NeTagger;
224 /// use kham_core::TokenKind;
225 ///
226 /// // Domain-specific NE list: word TAB NE_TAG
227 /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
228 /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
229 /// let tokens = fts.segment_for_fts("เซเรน่า");
230 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
231 /// ```
232 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
233 self.ne_tagger = Some(t);
234 self
235 }
236
237 /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
238 ///
239 /// When set, each Thai and Named token whose text is found in the map gets its
240 /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
241 /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
242 ///
243 /// Disabled by default — call this method to opt in.
244 ///
245 /// # Example
246 ///
247 /// ```rust
248 /// use kham_core::fts::FtsTokenizer;
249 /// use kham_core::romanizer::RomanizationMap;
250 ///
251 /// // TSV: Thai word TAB RTGS romanization
252 /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
253 /// let fts = FtsTokenizer::builder().romanization(rom).build();
254 /// let tokens = fts.segment_for_fts("กิน");
255 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
256 /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
257 /// assert!(t.synonyms.contains(&String::from("kin")));
258 /// ```
259 pub fn romanization(mut self, m: RomanizationMap) -> Self {
260 self.romanization = Some(m);
261 self
262 }
263
264 /// Attach an abbreviation map for pre-tokenisation expansion.
265 ///
266 /// When set, [`FtsTokenizer::segment_for_fts`] calls
267 /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
268 /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
269 /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
270 ///
271 /// Disabled by default — call this method to opt in.
272 ///
273 /// # Example
274 ///
275 /// ```rust
276 /// use kham_core::fts::FtsTokenizer;
277 /// use kham_core::abbrev::AbbrevMap;
278 /// use kham_core::stopwords::StopwordSet;
279 ///
280 /// let fts = FtsTokenizer::builder()
281 /// .abbrevs(AbbrevMap::builtin())
282 /// .stopwords(StopwordSet::from_text(""))
283 /// .build();
284 /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
285 /// let tokens = fts.segment_for_fts("ก.ค.");
286 /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
287 /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
288 /// ```
289 pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
290 self.abbrev_map = Some(m);
291 self
292 }
293
294 /// Enable or disable number normalization (default: `true`).
295 ///
296 /// When enabled:
297 /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
298 /// ASCII digit string added to their [`FtsToken::synonyms`]
299 /// (e.g. `๑๒๓` → synonym `"123"`).
300 /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
301 /// words get their decimal value added to `synonyms`
302 /// (e.g. `หนึ่งร้อย` → synonym `"100"`).
303 ///
304 /// This lets queries using either script match documents written in the
305 /// other. Set to `false` to opt out.
306 ///
307 /// # Example
308 ///
309 /// ```rust
310 /// use kham_core::fts::FtsTokenizer;
311 /// use kham_core::TokenKind;
312 ///
313 /// // Default (true): ๑๒๓ gets ASCII synonym "123"
314 /// let fts = FtsTokenizer::new();
315 /// let tokens = fts.segment_for_fts("๑๒๓");
316 /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
317 /// assert!(num.synonyms.contains(&String::from("123")));
318 ///
319 /// // Opt out: no conversion performed
320 /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
321 /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
322 /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
323 /// assert!(!num_off.synonyms.contains(&String::from("123")));
324 /// ```
325 pub fn number_normalize(mut self, v: bool) -> Self {
326 self.number_normalize = Some(v);
327 self
328 }
329
330 /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
331 ///
332 /// When set, each Thai and Named token whose text contains Thai consonants gets its
333 /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
334 /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
335 ///
336 /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
337 /// 4-character codes and are the recommended choices for FTS indexing.
338 /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
339 /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
340 ///
341 /// Disabled by default — call this method to opt in.
342 ///
343 /// # Example
344 ///
345 /// ```rust
346 /// use kham_core::fts::FtsTokenizer;
347 /// use kham_core::soundex::{lk82, SoundexAlgorithm};
348 /// use kham_core::stopwords::StopwordSet;
349 ///
350 /// let fts = FtsTokenizer::builder()
351 /// .soundex(SoundexAlgorithm::Lk82)
352 /// .stopwords(StopwordSet::from_text(""))
353 /// .build();
354 /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
355 /// for word in &["กาน", "ขาน", "คาน"] {
356 /// let tokens = fts.segment_for_fts(word);
357 /// let t = tokens.first().unwrap();
358 /// assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
359 /// }
360 /// ```
361 pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
362 self.soundex = Some(algo);
363 self
364 }
365
366 /// Overlay extra words on the built-in dictionary without a full trie rebuild.
367 ///
368 /// Words are stored in a sorted list alongside the pre-compiled trie.
369 /// Prefer this over a full rebuild when adding a small domain-specific
370 /// vocabulary (e.g. product names, technical terms).
371 ///
372 /// Newline-separated; `#` lines are ignored.
373 ///
374 /// # Example
375 ///
376 /// ```rust
377 /// use kham_core::fts::FtsTokenizer;
378 /// use kham_core::TokenKind;
379 ///
380 /// let fts = FtsTokenizer::builder()
381 /// .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
382 /// .build();
383 /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
384 /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
385 /// ```
386 pub fn dict_merge(mut self, words: &str) -> Self {
387 self.dict_merge = Some(String::from(words));
388 self
389 }
390
391 /// Consume the builder and return a configured [`FtsTokenizer`].
392 ///
393 /// # Example
394 ///
395 /// ```rust
396 /// use kham_core::fts::FtsTokenizer;
397 /// use kham_core::soundex::SoundexAlgorithm;
398 /// use kham_core::stopwords::StopwordSet;
399 ///
400 /// let fts = FtsTokenizer::builder()
401 /// .soundex(SoundexAlgorithm::Lk82)
402 /// .stopwords(StopwordSet::from_text(""))
403 /// .build();
404 /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
405 /// ```
406 pub fn build(self) -> FtsTokenizer {
407 let tokenizer = if let Some(ref words) = self.dict_merge {
408 Tokenizer::builder().dict_merge(words).build()
409 } else {
410 Tokenizer::new()
411 };
412 FtsTokenizer {
413 tokenizer,
414 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
415 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
416 ngram_size: self.ngram_size.unwrap_or(3),
417 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
418 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
419 romanization: self.romanization,
420 abbrev_map: self.abbrev_map,
421 number_normalize: self.number_normalize.unwrap_or(true),
422 soundex: self.soundex,
423 }
424 }
425}
426
/// Full-text search tokenizer for Thai text.
///
/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
/// generation for out-of-vocabulary tokens.
///
/// Construct once and reuse:
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
///
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct FtsTokenizer {
    /// Underlying tokenizer used to normalise and segment the input.
    tokenizer: Tokenizer,
    /// Set consulted to flag each token's `is_stop`.
    stopwords: StopwordSet,
    /// Map consulted for each token's synonym expansions.
    synonyms: SynonymMap,
    /// N-gram size for `TokenKind::Unknown` tokens; 0 disables n-grams.
    ngram_size: usize,
    /// Tagger consulted for `TokenKind::Thai` tokens only.
    pos_tagger: PosTagger,
    /// Gazetteer applied to the segmented tokens before annotation.
    ne_tagger: NeTagger,
    /// Optional romanization map; when set, romanized forms join the synonyms.
    romanization: Option<RomanizationMap>,
    /// Optional abbreviation map applied to the normalised text before segmentation.
    abbrev_map: Option<AbbrevMap>,
    /// Whether number tokens and Thai number words get a numeric synonym.
    number_normalize: bool,
    /// Optional soundex algorithm; when set, soundex codes join the synonyms.
    soundex: Option<SoundexAlgorithm>,
}
453
454impl FtsTokenizer {
455 /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
456 ///
457 /// # Example
458 ///
459 /// ```rust
460 /// use kham_core::fts::FtsTokenizer;
461 ///
462 /// let fts = FtsTokenizer::new();
463 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
464 /// // Built-in stopword กับ is excluded; content words are present
465 /// assert!(!lexemes.contains(&String::from("กับ")));
466 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
467 /// ```
468 pub fn new() -> Self {
469 FtsTokenizerBuilder::default().build()
470 }
471
472 /// Return a [`FtsTokenizerBuilder`] for custom configuration.
473 ///
474 /// # Example
475 ///
476 /// ```rust
477 /// use kham_core::fts::FtsTokenizer;
478 /// use kham_core::soundex::SoundexAlgorithm;
479 /// use kham_core::synonym::SynonymMap;
480 ///
481 /// let fts = FtsTokenizer::builder()
482 /// .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
483 /// .soundex(SoundexAlgorithm::Lk82)
484 /// .build();
485 /// assert!(!fts.segment_for_fts("รถ").is_empty());
486 /// ```
487 pub fn builder() -> FtsTokenizerBuilder {
488 FtsTokenizerBuilder::default()
489 }
490
491 /// Segment `text` and annotate each token for FTS indexing.
492 ///
493 /// Normalises the input text before segmentation so that สระลอย and stacked
494 /// tone marks are handled correctly. Whitespace tokens are excluded.
495 ///
496 /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
497 /// [`index_tokens`] instead when you only need the tokens to be indexed
498 /// (stopwords excluded).
499 ///
500 /// [`index_tokens`]: FtsTokenizer::index_tokens
501 ///
502 /// # Examples
503 ///
504 /// ```rust
505 /// use kham_core::fts::FtsTokenizer;
506 ///
507 /// let fts = FtsTokenizer::new();
508 /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
509 /// // Positions are 0-based and sequential across non-whitespace tokens
510 /// for (i, t) in tokens.iter().enumerate() {
511 /// assert_eq!(t.position, i);
512 /// }
513 /// // กับ is a common conjunction — marked as a stopword
514 /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
515 /// assert!(kap.is_stop);
516 /// ```
517 ///
518 /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
519 ///
520 /// ```rust
521 /// use kham_core::fts::FtsTokenizer;
522 /// use kham_core::TokenKind;
523 ///
524 /// let fts = FtsTokenizer::new();
525 /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
526 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
527 /// ```
528 ///
529 /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
530 ///
531 /// ```rust
532 /// use kham_core::fts::FtsTokenizer;
533 /// use kham_core::soundex::SoundexAlgorithm;
534 ///
535 /// let fts = FtsTokenizer::builder()
536 /// .soundex(SoundexAlgorithm::Lk82)
537 /// .build();
538 /// let tokens = fts.segment_for_fts("กิน");
539 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
540 /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
541 /// assert!(!t.synonyms.is_empty());
542 /// ```
543 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
544 let normalized = self.tokenizer.normalize(text);
545 // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
546 // dot-containing patterns are replaced as single units.
547 let expanded = match self.abbrev_map.as_ref() {
548 Some(am) => am.expand_text(&normalized),
549 None => normalized,
550 };
551 let raw_tokens = self
552 .ne_tagger
553 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
554
555 let mut result = Vec::with_capacity(raw_tokens.len());
556 let mut position = 0usize;
557
558 for token in &raw_tokens {
559 if token.kind == TokenKind::Whitespace {
560 continue;
561 }
562
563 let is_stop = self.stopwords.contains(token.text);
564 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
565 let mut synonyms = self
566 .synonyms
567 .expand(token.text)
568 .map(|s| s.to_vec())
569 .unwrap_or_default();
570 if is_thai_or_named {
571 if let Some(ref rom) = self.romanization {
572 if let Some(rtgs) = rom.romanize(token.text) {
573 synonyms.push(String::from(rtgs));
574 }
575 }
576 if let Some(algo) = self.soundex {
577 let code = soundex(token.text, algo);
578 if !code.chars().all(|c| c == '0') {
579 synonyms.push(code);
580 }
581 }
582 }
583 if self.number_normalize {
584 match token.kind {
585 // Number token with Thai digits → add ASCII form as synonym.
586 TokenKind::Number => {
587 let ascii = thai_digits_to_ascii(token.text);
588 if ascii != token.text {
589 synonyms.push(ascii);
590 }
591 }
592 // Thai token that is a recognised number word → add decimal string.
593 TokenKind::Thai => {
594 if let Some(decimal) = thai_word_to_decimal(token.text) {
595 synonyms.push(decimal);
596 }
597 }
598 _ => {}
599 }
600 }
601 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
602 char_ngrams(token.text, self.ngram_size)
603 .map(String::from)
604 .collect()
605 } else {
606 Vec::new()
607 };
608 let ne = if let TokenKind::Named(k) = token.kind {
609 Some(k)
610 } else {
611 None
612 };
613 let pos = if token.kind == TokenKind::Thai {
614 self.pos_tagger.tag(token.text)
615 } else {
616 None
617 };
618
619 result.push(FtsToken {
620 text: String::from(token.text),
621 position,
622 kind: token.kind,
623 is_stop,
624 synonyms,
625 trigrams,
626 pos,
627 ne,
628 confidence: token.confidence,
629 });
630
631 position += 1;
632 }
633
634 result
635 }
636
637 /// Return only the tokens to be written into a search index.
638 ///
639 /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
640 /// its original `position` so phrase-distance scoring remains correct.
641 ///
642 /// # Example
643 ///
644 /// ```rust
645 /// use kham_core::fts::FtsTokenizer;
646 ///
647 /// let fts = FtsTokenizer::new();
648 /// let tokens = fts.index_tokens("กินข้าวกับปลา");
649 /// // No stopwords in the index
650 /// assert!(tokens.iter().all(|t| !t.is_stop));
651 /// // Positions are preserved from the full sequence for phrase scoring
652 /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
653 /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
654 /// ```
655 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
656 self.segment_for_fts(text)
657 .into_iter()
658 .filter(|t| !t.is_stop)
659 .collect()
660 }
661
662 /// Return a streaming iterator over the FTS tokens for `text`.
663 ///
664 /// Equivalent to [`segment_for_fts`] but wraps the result in an
665 /// [`FtsTokenStream`] so callers can consume tokens one at a time.
666 /// Use [`FtsTokenStream::next_index_token`] to skip stopwords automatically.
667 ///
668 /// The full token list is materialised internally because the NE tagger
669 /// requires multi-token context; this is a streaming *consumer*, not a
670 /// lazy producer.
671 ///
672 /// # Example
673 ///
674 /// ```rust
675 /// use kham_core::fts::FtsTokenizer;
676 ///
677 /// let fts = FtsTokenizer::new();
678 /// let mut stream = fts.segment_stream("กินข้าวกับปลา");
679 /// let mut index_texts: Vec<String> = Vec::new();
680 /// while let Some(tok) = stream.next_index_token() {
681 /// index_texts.push(tok.text);
682 /// }
683 /// // กับ is a stopword — it should not appear in index_texts
684 /// assert!(!index_texts.contains(&String::from("กับ")));
685 /// assert!(index_texts.iter().any(|t| t == "กิน" || t == "ปลา"));
686 /// ```
687 ///
688 /// [`segment_for_fts`]: FtsTokenizer::segment_for_fts
689 pub fn segment_stream(&self, text: &str) -> FtsTokenStream {
690 FtsTokenStream {
691 inner: self.segment_for_fts(text).into_iter(),
692 }
693 }
694
695 /// Collect all lexeme strings to be stored in a `tsvector`.
696 ///
697 /// Returns one string per non-stop token, plus synonym expansions and
698 /// trigrams for unknown tokens. Duplicates are not removed (the caller or
699 /// PostgreSQL handles deduplication).
700 ///
701 /// # Example
702 ///
703 /// ```rust
704 /// use kham_core::fts::FtsTokenizer;
705 ///
706 /// let fts = FtsTokenizer::new();
707 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
708 /// // Content words are present; stopword กับ is absent
709 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
710 /// assert!(!lexemes.contains(&String::from("กับ")));
711 /// ```
712 ///
713 /// With Thai digit normalization (enabled by default), both scripts match:
714 ///
715 /// ```rust
716 /// use kham_core::fts::FtsTokenizer;
717 ///
718 /// let fts = FtsTokenizer::new();
719 /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
720 /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
721 /// assert!(lexemes.contains(&String::from("100")));
722 /// ```
723 pub fn lexemes(&self, text: &str) -> Vec<String> {
724 let tokens = self.index_tokens(text);
725 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
726 for t in tokens {
727 out.push(t.text.clone());
728 out.extend(t.synonyms);
729 out.extend(t.trigrams);
730 }
731 out
732 }
733}
734
735impl Default for FtsTokenizer {
736 fn default() -> Self {
737 Self::new()
738 }
739}
740
741// ---------------------------------------------------------------------------
742// Tests
743// ---------------------------------------------------------------------------
744
745#[cfg(test)]
746mod tests {
747 use super::*;
748 use crate::stopwords::StopwordSet;
749 use crate::synonym::SynonymMap;
750
751 fn fts() -> FtsTokenizer {
752 FtsTokenizer::new()
753 }
754
755 // ── segment_for_fts ───────────────────────────────────────────────────────
756
757 #[test]
758 fn empty_input_returns_empty() {
759 assert!(fts().segment_for_fts("").is_empty());
760 }
761
762 #[test]
763 fn whitespace_tokens_excluded() {
764 let tokens = fts().segment_for_fts("กิน ข้าว");
765 assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
766 }
767
768 #[test]
769 fn positions_are_sequential() {
770 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
771 for (i, t) in tokens.iter().enumerate() {
772 assert_eq!(t.position, i, "position mismatch at index {i}");
773 }
774 }
775
776 #[test]
777 fn known_stopword_is_tagged() {
778 // "กับ" is a common conjunction and should be in the built-in stopword list
779 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
780 let kap = tokens.iter().find(|t| t.text == "กับ");
781 assert!(kap.is_some(), "expected 'กับ' token");
782 assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
783 }
784
785 #[test]
786 fn content_words_not_tagged_as_stop() {
787 let tokens = fts().segment_for_fts("โรงพยาบาล");
788 // May be OOV but should not be a stopword
789 for t in &tokens {
790 assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
791 }
792 }
793
794 #[test]
795 fn text_is_reconstructable() {
796 // All tokens joined == normalised input (whitespace dropped)
797 let fts = fts();
798 let text = "กินข้าวกับปลา";
799 let normalized = fts.tokenizer.normalize(text);
800 let tokens = fts.segment_for_fts(text);
801 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
802 assert_eq!(rebuilt, normalized);
803 }
804
805 // ── synonym expansion ─────────────────────────────────────────────────────
806
807 #[test]
808 fn synonym_expansion_attached() {
809 let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
810 let fts = FtsTokenizer::builder()
811 .synonyms(synonyms)
812 .stopwords(StopwordSet::from_text(""))
813 .build();
814 // Segment a text containing "คอม" — need it in dict or it lands as Unknown
815 // Use builder with custom word so the segmenter recognises it
816 let tokens = fts.segment_for_fts("คอม");
817 let t = tokens.iter().find(|t| t.text == "คอม");
818 if let Some(tok) = t {
819 assert!(
820 tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
821 "expected synonym expansion, got {:?}",
822 tok.synonyms
823 );
824 }
825 }
826
827 #[test]
828 fn no_synonyms_when_map_empty() {
829 let tokens = fts().segment_for_fts("กินข้าว");
830 for t in &tokens {
831 assert!(t.synonyms.is_empty());
832 }
833 }
834
835 // ── unknown token trigrams ────────────────────────────────────────────────
836
837 #[test]
838 fn unknown_token_gets_trigrams() {
839 // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
840 // With ngram_size=2 the token should yield one bigram ("กิ").
841 // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
842 // (like "กิ") are the shortest unit that can produce n-grams.
843 let fts = FtsTokenizer::builder()
844 .ngram_size(2)
845 .stopwords(StopwordSet::from_text(""))
846 .build();
847 let tokens = fts.segment_for_fts("กิ");
848 let unknown: Vec<_> = tokens
849 .iter()
850 .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
851 .collect();
852 assert!(
853 !unknown.is_empty(),
854 "expected at least one multi-char Unknown token for 'กิ'"
855 );
856 for u in &unknown {
857 assert!(
858 !u.trigrams.is_empty(),
859 "unknown token '{}' ({} chars) should have bigrams",
860 u.text,
861 u.text.chars().count()
862 );
863 }
864 }
865
866 #[test]
867 fn known_thai_token_has_no_trigrams() {
868 let tokens = fts().segment_for_fts("กิน");
869 for t in &tokens {
870 if t.kind == TokenKind::Thai {
871 assert!(
872 t.trigrams.is_empty(),
873 "known Thai token '{}' should not have trigrams",
874 t.text
875 );
876 }
877 }
878 }
879
880 #[test]
881 fn ngram_size_zero_disables_trigrams() {
882 let fts = FtsTokenizer::builder()
883 .ngram_size(0)
884 .stopwords(StopwordSet::from_text(""))
885 .build();
886 let tokens = fts.segment_for_fts("กขคง");
887 for t in &tokens {
888 assert!(t.trigrams.is_empty());
889 }
890 }
891
892 // ── index_tokens ──────────────────────────────────────────────────────────
893
894 #[test]
895 fn index_tokens_excludes_stopwords() {
896 let tokens = fts().index_tokens("กินข้าวกับปลา");
897 assert!(tokens.iter().all(|t| !t.is_stop));
898 }
899
900 #[test]
901 fn index_tokens_preserves_positions() {
902 // Positions in index_tokens must be a subset of segment_for_fts positions
903 let all = fts().segment_for_fts("กินข้าวกับปลา");
904 let indexed = fts().index_tokens("กินข้าวกับปลา");
905 for t in &indexed {
906 assert!(
907 all.iter().any(|a| a.position == t.position),
908 "indexed token at position {} not found in full token list",
909 t.position
910 );
911 }
912 }
913
914 // ── lexemes ───────────────────────────────────────────────────────────────
915
916 #[test]
917 fn lexemes_returns_non_stop_texts() {
918 let lexemes = fts().lexemes("กินข้าวกับปลา");
919 // "กับ" is a stopword — should not appear
920 assert!(!lexemes.contains(&String::from("กับ")));
921 // Content words should appear
922 assert!(
923 lexemes
924 .iter()
925 .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
926 "expected content words in lexemes: {lexemes:?}"
927 );
928 }
929
930 #[test]
931 fn lexemes_empty_input_is_empty() {
932 assert!(fts().lexemes("").is_empty());
933 }
934
935 // ── multi-token NE ────────────────────────────────────────────────────────
936
#[test]
fn multi_token_ne_merged_in_pipeline() {
    // The NE gazetteer lists กรุงเทพ as a PLACE, while the segmenter alone
    // yields กรุง + เทพ. The FTS pipeline is expected to join the pieces back
    // into a single token whose kind is `Named`.
    let tokens = FtsTokenizer::new().segment_for_fts("ไปกรุงเทพ");
    let merged_named = tokens
        .iter()
        .any(|t| matches!(t.kind, TokenKind::Named(_)) && t.text == "กรุงเทพ");
    assert!(
        merged_named,
        "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
        tokens
            .iter()
            .map(|t| (&t.text, &t.kind))
            .collect::<alloc::vec::Vec<_>>()
    );
}
956
#[test]
fn multi_token_ne_reconstructable() {
    // Concatenating the text of every emitted token must give back the
    // normalized form of the input, even after NE merging.
    let text = "ไปกรุงเทพ";
    let fts = FtsTokenizer::new();
    let normalized = fts.tokenizer.normalize(text);

    let mut rebuilt = String::new();
    for tok in fts.segment_for_fts(text).iter() {
        rebuilt.push_str(tok.text.as_str());
    }
    assert_eq!(rebuilt, normalized);
}
967
968 // ── builder ───────────────────────────────────────────────────────────────
969
#[test]
fn builder_custom_stopwords() {
    // A stopword list supplied through the builder must drive `is_stop`.
    let fts = FtsTokenizer::builder()
        .stopwords(StopwordSet::from_text("กิน\n"))
        .build();

    // Standalone word: this check is unconditional, so the test cannot pass
    // vacuously when the compound input below happens not to yield a
    // separate 'กิน' token.
    let solo = fts.segment_for_fts("กิน");
    let gin = solo
        .iter()
        .find(|t| t.text == "กิน")
        .expect("expected token 'กิน'");
    assert!(gin.is_stop, "'กิน' should be stop with custom list");

    // Compound input: whether 'กิน' is split off depends on the segmenter,
    // so the flag is verified on every 'กิน' token that is produced.
    let tokens = fts.segment_for_fts("กินข้าว");
    for t in tokens.iter().filter(|t| t.text == "กิน") {
        assert!(t.is_stop, "'กิน' should be stop with custom list");
    }
}
980
#[test]
fn builder_default_equals_new() {
    // `new()` and an untouched builder are expected to agree on a simple input.
    let input = "กินข้าว";
    let from_new = FtsTokenizer::new().lexemes(input);
    let from_builder = FtsTokenizer::builder().build().lexemes(input);
    assert_eq!(from_new, from_builder);
}
988
989 // ── number normalization ──────────────────────────────────────────────────
990
#[test]
fn thai_digit_token_gets_ascii_synonym() {
    // A Thai-digit Number token must carry its ASCII rendering as a synonym.
    let tokens = FtsTokenizer::new().segment_for_fts("๑๒๓");
    let number = tokens
        .iter()
        .find(|tok| tok.kind == TokenKind::Number)
        .expect("expected a Number token");
    let ascii = String::from("123");
    assert!(
        number.synonyms.contains(&ascii),
        "Thai digit token should have ASCII synonym, got {:?}",
        number.synonyms
    );
}
1004
#[test]
fn ascii_digit_token_has_no_extra_synonym() {
    // ASCII digits need no conversion, so the Number token must not list its
    // own text as a synonym.
    let tokens = FtsTokenizer::new().segment_for_fts("123");
    let number = tokens
        .iter()
        .find(|tok| tok.kind == TokenKind::Number)
        .expect("expected a Number token");
    let echoes_itself = number.synonyms.iter().any(|syn| syn == "123");
    assert!(
        !echoes_itself,
        "ASCII digit token should not duplicate itself as a synonym"
    );
}
1017
#[test]
fn thai_number_word_gets_decimal_synonym() {
    // Depending on the dictionary, หนึ่งร้อย may come out as one Thai token or
    // as several. The test therefore only requires that some token carries
    // the decimal synonym "100" (ร้อย on its own is expected to map to 100).
    let tokens = FtsTokenizer::new().segment_for_fts("หนึ่งร้อย");
    let has_hundred = tokens
        .iter()
        .any(|tok| tok.synonyms.iter().any(|syn| syn == "100"));
    assert!(
        has_hundred,
        "expected a token with decimal synonym '100', tokens: {:?}",
        tokens
            .iter()
            .map(|t| (&t.text, &t.synonyms))
            .collect::<alloc::vec::Vec<_>>()
    );
}
1037
#[test]
fn number_normalize_false_disables_conversion() {
    // With number normalization switched off, the Thai-digit Number token
    // must not receive the ASCII synonym.
    let fts = FtsTokenizer::builder()
        .number_normalize(false)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = fts.segment_for_fts("๑๒๓");
    let first_number = tokens.iter().find(|tok| tok.kind == TokenKind::Number);
    assert!(first_number.is_some());
    let has_ascii = first_number.map_or(false, |tok| tok.synonyms.iter().any(|syn| syn == "123"));
    assert!(
        !has_ascii,
        "number_normalize=false should suppress ASCII synonym"
    );
}
1052
#[test]
fn mixed_thai_digit_in_context() {
    // In "ธนาคาร๑๐๐แห่ง" the ๑๐๐ run should come out as a Number token that
    // carries the synonym "100".
    let tokens = FtsTokenizer::new().segment_for_fts("ธนาคาร๑๐๐แห่ง");
    let number = tokens
        .iter()
        .find(|tok| tok.kind == TokenKind::Number)
        .expect("expected Number token in mixed string");
    let has_ascii = number.synonyms.iter().any(|syn| syn == "100");
    assert!(has_ascii, "expected ASCII synonym '100' for ๑๐๐");
}
1065
1066 // ── abbreviation expansion ────────────────────────────────────────────────
1067
#[test]
fn abbrev_map_expands_before_segmentation() {
    use crate::abbrev::AbbrevMap;
    let fts = FtsTokenizer::builder()
        .abbrevs(AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    // ก.ค. is rewritten to กรกฎาคม ahead of segmentation. The segmenter is
    // free to cut the expansion again (กรกฎา + คม); the test only requires
    // that the Thai characters show up and that no "." token is left over.
    let tokens = fts.segment_for_fts("ก.ค.");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let joined: String = texts.iter().copied().collect();
    let expanded = joined.contains("กรกฎา") || joined.contains("กรกฎาคม");
    assert!(
        expanded,
        "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
    );
    let dot_free = texts.iter().all(|t| *t != ".");
    assert!(
        dot_free,
        "dots should be consumed by abbrev expansion, got: {texts:?}"
    );
}
1090
1091 // ── segment_stream / FtsTokenStream ──────────────────────────────────────
1092
#[test]
fn segment_stream_yields_all_non_whitespace_tokens() {
    // Draining the stream must give exactly what `segment_for_fts` returns.
    let pipeline = fts();
    let text = "กินข้าวกับปลา";
    let via_vec = pipeline.segment_for_fts(text);
    let mut via_stream: Vec<FtsToken> = Vec::new();
    via_stream.extend(pipeline.segment_stream(text));
    assert_eq!(via_vec, via_stream);
}
1100
#[test]
fn segment_stream_empty_input() {
    // A stream built from the empty string yields nothing on its first pull.
    let first = fts().segment_stream("").next();
    assert!(first.is_none());
}
1106
#[test]
fn next_index_token_skips_stopwords() {
    let pipeline = fts();
    let mut stream = pipeline.segment_stream("กินข้าวกับปลา");
    // Pull through `next_index_token` until it is exhausted, keeping only the text.
    let texts: Vec<_> = core::iter::from_fn(|| stream.next_index_token())
        .map(|tok| tok.text)
        .collect();

    let saw_stopword = texts.iter().any(|t| t == "กับ");
    assert!(!saw_stopword, "stopword กับ must be skipped");

    let saw_content = texts.iter().any(|t| t == "กิน" || t == "ปลา");
    assert!(saw_content, "content words must be yielded");
}
1124
#[test]
fn next_index_token_matches_index_tokens() {
    // Repeated `next_index_token` calls must reproduce `index_tokens` exactly.
    let pipeline = fts();
    let text = "กินข้าวกับปลา";
    let via_index: Vec<_> = pipeline.index_tokens(text);

    let mut stream = pipeline.segment_stream(text);
    let mut via_stream = Vec::new();
    loop {
        match stream.next_index_token() {
            Some(tok) => via_stream.push(tok),
            None => break,
        }
    }
    assert_eq!(via_index, via_stream);
}
1137
#[test]
fn stream_size_hint_is_correct() {
    // A fresh stream must report an exact size hint equal to the number of
    // tokens `segment_for_fts` returns for the same input.
    let pipeline = fts();
    let text = "กินข้าวกับปลา";
    let expected_len = pipeline.segment_for_fts(text).len();
    let hint = pipeline.segment_stream(text).size_hint();
    assert_eq!(hint, (expected_len, Some(expected_len)));
}
1146
#[test]
fn abbrev_expansion_disabled_by_default() {
    // `FtsTokenizer::new()` carries no abbreviation map, so ก.ค. is left
    // unexpanded and its dot(s) must still appear as "." tokens.
    let tokens = FtsTokenizer::new().segment_for_fts("ก.ค.");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let has_dot = texts.iter().any(|t| *t == ".");
    assert!(
        has_dot,
        "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
    );
}
1159
1160 // ── soundex synonyms ──────────────────────────────────────────────────────
1161
#[test]
fn soundex_lk82_appended_to_thai_synonyms() {
    use crate::soundex::lk82;
    // With the Lk82 algorithm enabled, the token for กิน must list the code
    // that `lk82` computes for the same word among its synonyms.
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = fts.segment_for_fts("กิน");
    let gin = tokens
        .iter()
        .find(|tok| tok.text == "กิน")
        .expect("expected token 'กิน'");
    let expected_code = lk82("กิน");
    assert!(
        gin.synonyms.contains(&expected_code),
        "expected lk82 code '{expected_code}' in synonyms, got {:?}",
        gin.synonyms
    );
}
1179
#[test]
fn soundex_not_emitted_by_default() {
    // When the builder's `.soundex()` is never called, no synonym may look
    // like a soundex code. Heuristic: a code is 4 ASCII alphanumeric
    // characters (lk82/udom83).
    let tokens = FtsTokenizer::new().segment_for_fts("กินข้าว");
    let pairs = tokens
        .iter()
        .flat_map(|tok| tok.synonyms.iter().map(move |syn| (tok, syn)));
    for (tok, syn) in pairs {
        let four_ascii_alnum = syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
        assert!(
            !four_ascii_alnum,
            "unexpected soundex-like synonym '{}' on token '{}'",
            syn, tok.text
        );
    }
}
1198
#[test]
fn soundex_same_sounding_words_share_code_in_index() {
    use crate::soundex::lk82;
    // กาน, ขาน and คาน are expected to share one lk82 code; the first token
    // of each word must carry the code computed for กาน as a synonym.
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let code = lk82("กาน");
    for word in ["กาน", "ขาน", "คาน"].iter().copied() {
        let tokens = fts.segment_for_fts(word);
        let synonyms = &tokens.first().expect("expected at least one token").synonyms;
        assert!(
            synonyms.contains(&code),
            "'{word}' should carry lk82 code '{code}', got {:?}",
            synonyms
        );
    }
}
1218
#[test]
fn soundex_not_emitted_for_non_thai_tokens() {
    // Even with Lk82 enabled, tokens from non-Thai input must not pick up a
    // synonym shaped like a soundex code (4 ASCII alphanumeric characters).
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let soundex_shaped = |s: &str| s.len() == 4 && s.chars().all(|c| c.is_ascii_alphanumeric());
    let tokens = fts.segment_for_fts("hello 123");
    for t in tokens.iter() {
        for syn in t.synonyms.iter() {
            assert!(
                !soundex_shaped(syn.as_str()),
                "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
                t.text
            );
        }
    }
}
1238
#[test]
fn soundex_udom83_appended() {
    use crate::soundex::udom83;
    // With the Udom83 algorithm enabled, the token for กิน must list the code
    // that `udom83` computes for the same word among its synonyms.
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Udom83)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = fts.segment_for_fts("กิน");
    let synonyms = &tokens.iter().find(|tok| tok.text == "กิน").unwrap().synonyms;
    let expected = udom83("กิน");
    assert!(
        synonyms.contains(&expected),
        "expected udom83 code '{expected}' in synonyms, got {:?}",
        synonyms
    );
}
1255
#[test]
fn abbrev_expansion_date_sentence() {
    use crate::abbrev::AbbrevMap;
    let fts = FtsTokenizer::builder()
        .abbrevs(AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    // พ.ศ. is rewritten to พุทธศักราช, which the segmenter is free to cut
    // again; the test only requires that the Thai characters show up and
    // that no "." token is left over.
    let tokens = fts.segment_for_fts("พ.ศ.2567");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    let joined: String = texts.iter().copied().collect();
    let expanded = joined.contains("พุทธ") || joined.contains("พุทธศักราช");
    assert!(
        expanded,
        "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
    );
    let dot_free = texts.iter().all(|t| *t != ".");
    assert!(
        dot_free,
        "dots should be consumed by expansion, got: {texts:?}"
    );
}
1277}