kham_core/fts.rs
1//! Full-text search pipeline for Thai text.
2//!
//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
//! normalise → expand abbreviations (opt-in) → segment → tag named entities →
//! tag stopwords → expand synonyms → attach n-grams, POS tag and position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
22//! for t in &tokens {
23//! println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
24//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
42/// A token produced by the FTS pipeline, ready for lexeme indexing.
43#[derive(Debug, Clone, PartialEq, Eq)]
44pub struct FtsToken {
45 /// The token text (owned; may be normalised).
46 pub text: String,
47 /// Ordinal position in the token sequence (0-based, gaps for whitespace).
48 pub position: usize,
49 /// Script / category of the original token.
50 pub kind: TokenKind,
51 /// `true` if this token matches the stopword list.
52 pub is_stop: bool,
53 /// Synonym expansions (empty if none configured or no match).
54 pub synonyms: Vec<String>,
55 /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
56 pub trigrams: Vec<String>,
57 /// Primary part-of-speech tag from the lookup table, or `None` if the word
58 /// is not in the table (OOV) or is not a Thai token.
59 pub pos: Option<PosTag>,
60 /// Named entity category, or `None` if the token is not in the NE
61 /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
62 pub ne: Option<NamedEntityKind>,
63}
64
65/// Builder for [`FtsTokenizer`].
66#[derive(Default)]
67pub struct FtsTokenizerBuilder {
68 stopwords: Option<StopwordSet>,
69 synonyms: Option<SynonymMap>,
70 ngram_size: Option<usize>,
71 pos_tagger: Option<PosTagger>,
72 ne_tagger: Option<NeTagger>,
73 romanization: Option<RomanizationMap>,
74 abbrev_map: Option<AbbrevMap>,
75 /// `None` means "use default (true)".
76 number_normalize: Option<bool>,
77 soundex: Option<SoundexAlgorithm>,
78 /// Extra words to overlay on top of the built-in dictionary (fast path).
79 dict_merge: Option<String>,
80}
81
82impl FtsTokenizerBuilder {
83 /// Use a custom stopword set instead of the built-in list.
84 ///
85 /// # Example
86 ///
87 /// ```rust
88 /// use kham_core::fts::FtsTokenizer;
89 /// use kham_core::stopwords::StopwordSet;
90 ///
91 /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
92 /// let fts = FtsTokenizer::builder().stopwords(stops).build();
93 /// let tokens = fts.segment_for_fts("กินข้าว");
94 /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
95 /// ```
96 pub fn stopwords(mut self, s: StopwordSet) -> Self {
97 self.stopwords = Some(s);
98 self
99 }
100
101 /// Attach a synonym map for expansion.
102 ///
103 /// # Example
104 ///
105 /// ```rust
106 /// use kham_core::fts::FtsTokenizer;
107 /// use kham_core::synonym::SynonymMap;
108 ///
109 /// // TSV: canonical TAB synonym1 TAB synonym2 …
110 /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
111 /// let fts = FtsTokenizer::builder().synonyms(syns).build();
112 /// let tokens = fts.segment_for_fts("รถ");
113 /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
114 /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
115 /// ```
116 pub fn synonyms(mut self, m: SynonymMap) -> Self {
117 self.synonyms = Some(m);
118 self
119 }
120
121 /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
122 ///
123 /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
124 ///
125 /// # Example
126 ///
127 /// ```rust
128 /// use kham_core::fts::FtsTokenizer;
129 /// use kham_core::stopwords::StopwordSet;
130 ///
131 /// // Disable n-grams entirely — useful when index size must be small
132 /// let fts = FtsTokenizer::builder()
133 /// .ngram_size(0)
134 /// .stopwords(StopwordSet::from_text(""))
135 /// .build();
136 /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
137 /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
138 /// ```
139 pub fn ngram_size(mut self, n: usize) -> Self {
140 self.ngram_size = Some(n);
141 self
142 }
143
144 /// Use a custom POS tagger instead of the built-in table.
145 ///
146 /// # Example
147 ///
148 /// ```rust
149 /// use kham_core::fts::FtsTokenizer;
150 /// use kham_core::pos::{PosTag, PosTagger};
151 ///
152 /// // Custom TSV: word TAB POS_TAG
153 /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
154 /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
155 /// // Segment กิน alone so it is not merged into a compound
156 /// let tokens = fts.segment_for_fts("กิน");
157 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
158 /// assert_eq!(t.pos, Some(PosTag::Verb));
159 /// ```
160 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
161 self.pos_tagger = Some(t);
162 self
163 }
164
165 /// Use a custom NE gazetteer instead of the built-in table.
166 ///
167 /// # Example
168 ///
169 /// ```rust
170 /// use kham_core::fts::FtsTokenizer;
171 /// use kham_core::ne::NeTagger;
172 /// use kham_core::TokenKind;
173 ///
174 /// // Domain-specific NE list: word TAB NE_TAG
175 /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
176 /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
177 /// let tokens = fts.segment_for_fts("เซเรน่า");
178 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
179 /// ```
180 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
181 self.ne_tagger = Some(t);
182 self
183 }
184
185 /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
186 ///
187 /// When set, each Thai and Named token whose text is found in the map gets its
188 /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
189 /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
190 ///
191 /// Disabled by default — call this method to opt in.
192 ///
193 /// # Example
194 ///
195 /// ```rust
196 /// use kham_core::fts::FtsTokenizer;
197 /// use kham_core::romanizer::RomanizationMap;
198 ///
199 /// // TSV: Thai word TAB RTGS romanization
200 /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
201 /// let fts = FtsTokenizer::builder().romanization(rom).build();
202 /// let tokens = fts.segment_for_fts("กิน");
203 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
204 /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
205 /// assert!(t.synonyms.contains(&String::from("kin")));
206 /// ```
207 pub fn romanization(mut self, m: RomanizationMap) -> Self {
208 self.romanization = Some(m);
209 self
210 }
211
212 /// Attach an abbreviation map for pre-tokenisation expansion.
213 ///
214 /// When set, [`FtsTokenizer::segment_for_fts`] calls
215 /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
216 /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
217 /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
218 ///
219 /// Disabled by default — call this method to opt in.
220 ///
221 /// # Example
222 ///
223 /// ```rust
224 /// use kham_core::fts::FtsTokenizer;
225 /// use kham_core::abbrev::AbbrevMap;
226 /// use kham_core::stopwords::StopwordSet;
227 ///
228 /// let fts = FtsTokenizer::builder()
229 /// .abbrevs(AbbrevMap::builtin())
230 /// .stopwords(StopwordSet::from_text(""))
231 /// .build();
232 /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
233 /// let tokens = fts.segment_for_fts("ก.ค.");
234 /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
235 /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
236 /// ```
237 pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
238 self.abbrev_map = Some(m);
239 self
240 }
241
242 /// Enable or disable number normalization (default: `true`).
243 ///
244 /// When enabled:
245 /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
246 /// ASCII digit string added to their [`FtsToken::synonyms`]
247 /// (e.g. `๑๒๓` → synonym `"123"`).
248 /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
249 /// words get their decimal value added to `synonyms`
250 /// (e.g. `หนึ่งร้อย` → synonym `"100"`).
251 ///
252 /// This lets queries using either script match documents written in the
253 /// other. Set to `false` to opt out.
254 ///
255 /// # Example
256 ///
257 /// ```rust
258 /// use kham_core::fts::FtsTokenizer;
259 /// use kham_core::TokenKind;
260 ///
261 /// // Default (true): ๑๒๓ gets ASCII synonym "123"
262 /// let fts = FtsTokenizer::new();
263 /// let tokens = fts.segment_for_fts("๑๒๓");
264 /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
265 /// assert!(num.synonyms.contains(&String::from("123")));
266 ///
267 /// // Opt out: no conversion performed
268 /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
269 /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
270 /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
271 /// assert!(!num_off.synonyms.contains(&String::from("123")));
272 /// ```
273 pub fn number_normalize(mut self, v: bool) -> Self {
274 self.number_normalize = Some(v);
275 self
276 }
277
278 /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
279 ///
280 /// When set, each Thai and Named token whose text contains Thai consonants gets its
281 /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
282 /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
283 ///
284 /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
285 /// 4-character codes and are the recommended choices for FTS indexing.
286 /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
287 /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
288 ///
289 /// Disabled by default — call this method to opt in.
290 ///
291 /// # Example
292 ///
293 /// ```rust
294 /// use kham_core::fts::FtsTokenizer;
295 /// use kham_core::soundex::{lk82, SoundexAlgorithm};
296 /// use kham_core::stopwords::StopwordSet;
297 ///
298 /// let fts = FtsTokenizer::builder()
299 /// .soundex(SoundexAlgorithm::Lk82)
300 /// .stopwords(StopwordSet::from_text(""))
301 /// .build();
302 /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
303 /// for word in &["กาน", "ขาน", "คาน"] {
304 /// let tokens = fts.segment_for_fts(word);
305 /// let t = tokens.first().unwrap();
306 /// assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
307 /// }
308 /// ```
309 pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
310 self.soundex = Some(algo);
311 self
312 }
313
314 /// Overlay extra words on the built-in dictionary without a full trie rebuild.
315 ///
316 /// Words are stored in a sorted list alongside the pre-compiled trie.
317 /// Prefer this over a full rebuild when adding a small domain-specific
318 /// vocabulary (e.g. product names, technical terms).
319 ///
320 /// Newline-separated; `#` lines are ignored.
321 ///
322 /// # Example
323 ///
324 /// ```rust
325 /// use kham_core::fts::FtsTokenizer;
326 /// use kham_core::TokenKind;
327 ///
328 /// let fts = FtsTokenizer::builder()
329 /// .dict_merge("โปรแกรมเมอร์\nปัญญาประดิษฐ์\n")
330 /// .build();
331 /// let tokens = fts.segment_for_fts("โปรแกรมเมอร์ไทย");
332 /// assert!(tokens.iter().any(|t| t.text == "โปรแกรมเมอร์" && t.kind == TokenKind::Thai));
333 /// ```
334 pub fn dict_merge(mut self, words: &str) -> Self {
335 self.dict_merge = Some(String::from(words));
336 self
337 }
338
339 /// Consume the builder and return a configured [`FtsTokenizer`].
340 ///
341 /// # Example
342 ///
343 /// ```rust
344 /// use kham_core::fts::FtsTokenizer;
345 /// use kham_core::soundex::SoundexAlgorithm;
346 /// use kham_core::stopwords::StopwordSet;
347 ///
348 /// let fts = FtsTokenizer::builder()
349 /// .soundex(SoundexAlgorithm::Lk82)
350 /// .stopwords(StopwordSet::from_text(""))
351 /// .build();
352 /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
353 /// ```
354 pub fn build(self) -> FtsTokenizer {
355 let tokenizer = if let Some(ref words) = self.dict_merge {
356 Tokenizer::builder().dict_merge(words).build()
357 } else {
358 Tokenizer::new()
359 };
360 FtsTokenizer {
361 tokenizer,
362 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
363 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
364 ngram_size: self.ngram_size.unwrap_or(3),
365 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
366 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
367 romanization: self.romanization,
368 abbrev_map: self.abbrev_map,
369 number_normalize: self.number_normalize.unwrap_or(true),
370 soundex: self.soundex,
371 }
372 }
373}
374
375/// Full-text search tokenizer for Thai text.
376///
377/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
378/// generation for out-of-vocabulary tokens.
379///
380/// Construct once and reuse:
381///
382/// ```rust
383/// use kham_core::fts::FtsTokenizer;
384///
385/// let fts = FtsTokenizer::new();
386/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
387/// assert!(!tokens.is_empty());
388/// ```
389pub struct FtsTokenizer {
390 tokenizer: Tokenizer,
391 stopwords: StopwordSet,
392 synonyms: SynonymMap,
393 ngram_size: usize,
394 pos_tagger: PosTagger,
395 ne_tagger: NeTagger,
396 romanization: Option<RomanizationMap>,
397 abbrev_map: Option<AbbrevMap>,
398 number_normalize: bool,
399 soundex: Option<SoundexAlgorithm>,
400}
401
402impl FtsTokenizer {
403 /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
404 ///
405 /// # Example
406 ///
407 /// ```rust
408 /// use kham_core::fts::FtsTokenizer;
409 ///
410 /// let fts = FtsTokenizer::new();
411 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
412 /// // Built-in stopword กับ is excluded; content words are present
413 /// assert!(!lexemes.contains(&String::from("กับ")));
414 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
415 /// ```
416 pub fn new() -> Self {
417 FtsTokenizerBuilder::default().build()
418 }
419
420 /// Return a [`FtsTokenizerBuilder`] for custom configuration.
421 ///
422 /// # Example
423 ///
424 /// ```rust
425 /// use kham_core::fts::FtsTokenizer;
426 /// use kham_core::soundex::SoundexAlgorithm;
427 /// use kham_core::synonym::SynonymMap;
428 ///
429 /// let fts = FtsTokenizer::builder()
430 /// .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
431 /// .soundex(SoundexAlgorithm::Lk82)
432 /// .build();
433 /// assert!(!fts.segment_for_fts("รถ").is_empty());
434 /// ```
435 pub fn builder() -> FtsTokenizerBuilder {
436 FtsTokenizerBuilder::default()
437 }
438
439 /// Segment `text` and annotate each token for FTS indexing.
440 ///
441 /// Normalises the input text before segmentation so that สระลอย and stacked
442 /// tone marks are handled correctly. Whitespace tokens are excluded.
443 ///
444 /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
445 /// [`index_tokens`] instead when you only need the tokens to be indexed
446 /// (stopwords excluded).
447 ///
448 /// [`index_tokens`]: FtsTokenizer::index_tokens
449 ///
450 /// # Examples
451 ///
452 /// ```rust
453 /// use kham_core::fts::FtsTokenizer;
454 ///
455 /// let fts = FtsTokenizer::new();
456 /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
457 /// // Positions are 0-based and sequential across non-whitespace tokens
458 /// for (i, t) in tokens.iter().enumerate() {
459 /// assert_eq!(t.position, i);
460 /// }
461 /// // กับ is a common conjunction — marked as a stopword
462 /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
463 /// assert!(kap.is_stop);
464 /// ```
465 ///
466 /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
467 ///
468 /// ```rust
469 /// use kham_core::fts::FtsTokenizer;
470 /// use kham_core::TokenKind;
471 ///
472 /// let fts = FtsTokenizer::new();
473 /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
474 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
475 /// ```
476 ///
477 /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
478 ///
479 /// ```rust
480 /// use kham_core::fts::FtsTokenizer;
481 /// use kham_core::soundex::SoundexAlgorithm;
482 ///
483 /// let fts = FtsTokenizer::builder()
484 /// .soundex(SoundexAlgorithm::Lk82)
485 /// .build();
486 /// let tokens = fts.segment_for_fts("กิน");
487 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
488 /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
489 /// assert!(!t.synonyms.is_empty());
490 /// ```
491 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
492 let normalized = self.tokenizer.normalize(text);
493 // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
494 // dot-containing patterns are replaced as single units.
495 let expanded = match self.abbrev_map.as_ref() {
496 Some(am) => am.expand_text(&normalized),
497 None => normalized,
498 };
499 let raw_tokens = self
500 .ne_tagger
501 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
502
503 let mut result = Vec::with_capacity(raw_tokens.len());
504 let mut position = 0usize;
505
506 for token in &raw_tokens {
507 if token.kind == TokenKind::Whitespace {
508 continue;
509 }
510
511 let is_stop = self.stopwords.contains(token.text);
512 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
513 let mut synonyms = self
514 .synonyms
515 .expand(token.text)
516 .map(|s| s.to_vec())
517 .unwrap_or_default();
518 if is_thai_or_named {
519 if let Some(ref rom) = self.romanization {
520 if let Some(rtgs) = rom.romanize(token.text) {
521 synonyms.push(String::from(rtgs));
522 }
523 }
524 if let Some(algo) = self.soundex {
525 let code = soundex(token.text, algo);
526 if !code.chars().all(|c| c == '0') {
527 synonyms.push(code);
528 }
529 }
530 }
531 if self.number_normalize {
532 match token.kind {
533 // Number token with Thai digits → add ASCII form as synonym.
534 TokenKind::Number => {
535 let ascii = thai_digits_to_ascii(token.text);
536 if ascii != token.text {
537 synonyms.push(ascii);
538 }
539 }
540 // Thai token that is a recognised number word → add decimal string.
541 TokenKind::Thai => {
542 if let Some(decimal) = thai_word_to_decimal(token.text) {
543 synonyms.push(decimal);
544 }
545 }
546 _ => {}
547 }
548 }
549 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
550 char_ngrams(token.text, self.ngram_size)
551 .map(String::from)
552 .collect()
553 } else {
554 Vec::new()
555 };
556 let ne = if let TokenKind::Named(k) = token.kind {
557 Some(k)
558 } else {
559 None
560 };
561 let pos = if token.kind == TokenKind::Thai {
562 self.pos_tagger.tag(token.text)
563 } else {
564 None
565 };
566
567 result.push(FtsToken {
568 text: String::from(token.text),
569 position,
570 kind: token.kind,
571 is_stop,
572 synonyms,
573 trigrams,
574 pos,
575 ne,
576 });
577
578 position += 1;
579 }
580
581 result
582 }
583
584 /// Return only the tokens to be written into a search index.
585 ///
586 /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
587 /// its original `position` so phrase-distance scoring remains correct.
588 ///
589 /// # Example
590 ///
591 /// ```rust
592 /// use kham_core::fts::FtsTokenizer;
593 ///
594 /// let fts = FtsTokenizer::new();
595 /// let tokens = fts.index_tokens("กินข้าวกับปลา");
596 /// // No stopwords in the index
597 /// assert!(tokens.iter().all(|t| !t.is_stop));
598 /// // Positions are preserved from the full sequence for phrase scoring
599 /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
600 /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
601 /// ```
602 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
603 self.segment_for_fts(text)
604 .into_iter()
605 .filter(|t| !t.is_stop)
606 .collect()
607 }
608
609 /// Collect all lexeme strings to be stored in a `tsvector`.
610 ///
611 /// Returns one string per non-stop token, plus synonym expansions and
612 /// trigrams for unknown tokens. Duplicates are not removed (the caller or
613 /// PostgreSQL handles deduplication).
614 ///
615 /// # Example
616 ///
617 /// ```rust
618 /// use kham_core::fts::FtsTokenizer;
619 ///
620 /// let fts = FtsTokenizer::new();
621 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
622 /// // Content words are present; stopword กับ is absent
623 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
624 /// assert!(!lexemes.contains(&String::from("กับ")));
625 /// ```
626 ///
627 /// With Thai digit normalization (enabled by default), both scripts match:
628 ///
629 /// ```rust
630 /// use kham_core::fts::FtsTokenizer;
631 ///
632 /// let fts = FtsTokenizer::new();
633 /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
634 /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
635 /// assert!(lexemes.contains(&String::from("100")));
636 /// ```
637 pub fn lexemes(&self, text: &str) -> Vec<String> {
638 let tokens = self.index_tokens(text);
639 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
640 for t in tokens {
641 out.push(t.text.clone());
642 out.extend(t.synonyms);
643 out.extend(t.trigrams);
644 }
645 out
646 }
647}
648
649impl Default for FtsTokenizer {
650 fn default() -> Self {
651 Self::new()
652 }
653}
654
655// ---------------------------------------------------------------------------
656// Tests
657// ---------------------------------------------------------------------------
658
659#[cfg(test)]
660mod tests {
661 use super::*;
662 use crate::stopwords::StopwordSet;
663 use crate::synonym::SynonymMap;
664
665 fn fts() -> FtsTokenizer {
666 FtsTokenizer::new()
667 }
668
669 // ── segment_for_fts ───────────────────────────────────────────────────────
670
671 #[test]
672 fn empty_input_returns_empty() {
673 assert!(fts().segment_for_fts("").is_empty());
674 }
675
676 #[test]
677 fn whitespace_tokens_excluded() {
678 let tokens = fts().segment_for_fts("กิน ข้าว");
679 assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
680 }
681
682 #[test]
683 fn positions_are_sequential() {
684 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
685 for (i, t) in tokens.iter().enumerate() {
686 assert_eq!(t.position, i, "position mismatch at index {i}");
687 }
688 }
689
690 #[test]
691 fn known_stopword_is_tagged() {
692 // "กับ" is a common conjunction and should be in the built-in stopword list
693 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
694 let kap = tokens.iter().find(|t| t.text == "กับ");
695 assert!(kap.is_some(), "expected 'กับ' token");
696 assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
697 }
698
699 #[test]
700 fn content_words_not_tagged_as_stop() {
701 let tokens = fts().segment_for_fts("โรงพยาบาล");
702 // May be OOV but should not be a stopword
703 for t in &tokens {
704 assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
705 }
706 }
707
708 #[test]
709 fn text_is_reconstructable() {
710 // All tokens joined == normalised input (whitespace dropped)
711 let fts = fts();
712 let text = "กินข้าวกับปลา";
713 let normalized = fts.tokenizer.normalize(text);
714 let tokens = fts.segment_for_fts(text);
715 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
716 assert_eq!(rebuilt, normalized);
717 }
718
719 // ── synonym expansion ─────────────────────────────────────────────────────
720
721 #[test]
722 fn synonym_expansion_attached() {
723 let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
724 let fts = FtsTokenizer::builder()
725 .synonyms(synonyms)
726 .stopwords(StopwordSet::from_text(""))
727 .build();
728 // Segment a text containing "คอม" — need it in dict or it lands as Unknown
729 // Use builder with custom word so the segmenter recognises it
730 let tokens = fts.segment_for_fts("คอม");
731 let t = tokens.iter().find(|t| t.text == "คอม");
732 if let Some(tok) = t {
733 assert!(
734 tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
735 "expected synonym expansion, got {:?}",
736 tok.synonyms
737 );
738 }
739 }
740
741 #[test]
742 fn no_synonyms_when_map_empty() {
743 let tokens = fts().segment_for_fts("กินข้าว");
744 for t in &tokens {
745 assert!(t.synonyms.is_empty());
746 }
747 }
748
749 // ── unknown token trigrams ────────────────────────────────────────────────
750
751 #[test]
752 fn unknown_token_gets_trigrams() {
753 // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
754 // With ngram_size=2 the token should yield one bigram ("กิ").
755 // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
756 // (like "กิ") are the shortest unit that can produce n-grams.
757 let fts = FtsTokenizer::builder()
758 .ngram_size(2)
759 .stopwords(StopwordSet::from_text(""))
760 .build();
761 let tokens = fts.segment_for_fts("กิ");
762 let unknown: Vec<_> = tokens
763 .iter()
764 .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
765 .collect();
766 assert!(
767 !unknown.is_empty(),
768 "expected at least one multi-char Unknown token for 'กิ'"
769 );
770 for u in &unknown {
771 assert!(
772 !u.trigrams.is_empty(),
773 "unknown token '{}' ({} chars) should have bigrams",
774 u.text,
775 u.text.chars().count()
776 );
777 }
778 }
779
780 #[test]
781 fn known_thai_token_has_no_trigrams() {
782 let tokens = fts().segment_for_fts("กิน");
783 for t in &tokens {
784 if t.kind == TokenKind::Thai {
785 assert!(
786 t.trigrams.is_empty(),
787 "known Thai token '{}' should not have trigrams",
788 t.text
789 );
790 }
791 }
792 }
793
794 #[test]
795 fn ngram_size_zero_disables_trigrams() {
796 let fts = FtsTokenizer::builder()
797 .ngram_size(0)
798 .stopwords(StopwordSet::from_text(""))
799 .build();
800 let tokens = fts.segment_for_fts("กขคง");
801 for t in &tokens {
802 assert!(t.trigrams.is_empty());
803 }
804 }
805
806 // ── index_tokens ──────────────────────────────────────────────────────────
807
808 #[test]
809 fn index_tokens_excludes_stopwords() {
810 let tokens = fts().index_tokens("กินข้าวกับปลา");
811 assert!(tokens.iter().all(|t| !t.is_stop));
812 }
813
814 #[test]
815 fn index_tokens_preserves_positions() {
816 // Positions in index_tokens must be a subset of segment_for_fts positions
817 let all = fts().segment_for_fts("กินข้าวกับปลา");
818 let indexed = fts().index_tokens("กินข้าวกับปลา");
819 for t in &indexed {
820 assert!(
821 all.iter().any(|a| a.position == t.position),
822 "indexed token at position {} not found in full token list",
823 t.position
824 );
825 }
826 }
827
828 // ── lexemes ───────────────────────────────────────────────────────────────
829
830 #[test]
831 fn lexemes_returns_non_stop_texts() {
832 let lexemes = fts().lexemes("กินข้าวกับปลา");
833 // "กับ" is a stopword — should not appear
834 assert!(!lexemes.contains(&String::from("กับ")));
835 // Content words should appear
836 assert!(
837 lexemes
838 .iter()
839 .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
840 "expected content words in lexemes: {lexemes:?}"
841 );
842 }
843
844 #[test]
845 fn lexemes_empty_input_is_empty() {
846 assert!(fts().lexemes("").is_empty());
847 }
848
849 // ── multi-token NE ────────────────────────────────────────────────────────
850
851 #[test]
852 fn multi_token_ne_merged_in_pipeline() {
853 // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
854 // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
855 let fts = FtsTokenizer::new();
856 let tokens = fts.segment_for_fts("ไปกรุงเทพ");
857 let named: Vec<_> = tokens
858 .iter()
859 .filter(|t| matches!(t.kind, TokenKind::Named(_)))
860 .collect();
861 assert!(
862 named.iter().any(|t| t.text == "กรุงเทพ"),
863 "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
864 tokens
865 .iter()
866 .map(|t| (&t.text, &t.kind))
867 .collect::<alloc::vec::Vec<_>>()
868 );
869 }
870
871 #[test]
872 fn multi_token_ne_reconstructable() {
873 // Texts of all non-whitespace tokens must still reconstruct the normalized input.
874 let fts = FtsTokenizer::new();
875 let text = "ไปกรุงเทพ";
876 let normalized = fts.tokenizer.normalize(text);
877 let tokens = fts.segment_for_fts(text);
878 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
879 assert_eq!(rebuilt, normalized);
880 }
881
882 // ── builder ───────────────────────────────────────────────────────────────
883
884 #[test]
885 fn builder_custom_stopwords() {
886 let stops = StopwordSet::from_text("กิน\n");
887 let fts = FtsTokenizer::builder().stopwords(stops).build();
888 let tokens = fts.segment_for_fts("กินข้าว");
889 let gin = tokens.iter().find(|t| t.text == "กิน");
890 if let Some(t) = gin {
891 assert!(t.is_stop, "'กิน' should be stop with custom list");
892 }
893 }
894
895 #[test]
896 fn builder_default_equals_new() {
897 // Both paths should produce the same result for a simple input
898 let a = FtsTokenizer::new().lexemes("กินข้าว");
899 let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
900 assert_eq!(a, b);
901 }
902
903 // ── number normalization ──────────────────────────────────────────────────
904
905 #[test]
906 fn thai_digit_token_gets_ascii_synonym() {
907 let fts = FtsTokenizer::new();
908 let tokens = fts.segment_for_fts("๑๒๓");
909 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
910 assert!(num.is_some(), "expected a Number token");
911 let t = num.unwrap();
912 assert!(
913 t.synonyms.contains(&String::from("123")),
914 "Thai digit token should have ASCII synonym, got {:?}",
915 t.synonyms
916 );
917 }
918
919 #[test]
920 fn ascii_digit_token_has_no_extra_synonym() {
921 // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
922 let fts = FtsTokenizer::new();
923 let tokens = fts.segment_for_fts("123");
924 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
925 assert!(num.is_some(), "expected a Number token");
926 assert!(
927 !num.unwrap().synonyms.contains(&String::from("123")),
928 "ASCII digit token should not duplicate itself as a synonym"
929 );
930 }
931
932 #[test]
933 fn thai_number_word_gets_decimal_synonym() {
934 // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
935 // on the dictionary. We check that at least one token carries "100" in synonyms.
936 let fts = FtsTokenizer::new();
937 let tokens = fts.segment_for_fts("หนึ่งร้อย");
938 let has_hundred = tokens
939 .iter()
940 .any(|t| t.synonyms.contains(&String::from("100")));
941 // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
942 assert!(
943 has_hundred,
944 "expected a token with decimal synonym '100', tokens: {:?}",
945 tokens
946 .iter()
947 .map(|t| (&t.text, &t.synonyms))
948 .collect::<alloc::vec::Vec<_>>()
949 );
950 }
951
/// With `number_normalize(false)` a Thai-digit numeral must not receive the
/// ASCII-digit synonym `"123"`.
#[test]
fn number_normalize_false_disables_conversion() {
    // Built with an empty stopword list and number normalisation switched off.
    let tokenizer = FtsTokenizer::builder()
        .number_normalize(false)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = tokenizer.segment_for_fts("๑๒๓");

    let number = tokens.iter().find(|tok| tok.kind == TokenKind::Number);
    assert!(number.is_some());

    let ascii_present = number
        .unwrap()
        .synonyms
        .iter()
        .any(|syn| syn.as_str() == "123");
    assert!(
        !ascii_present,
        "number_normalize=false should suppress ASCII synonym"
    );
}
966
/// Thai digits embedded between Thai words must still yield a number token
/// that carries the ASCII synonym `"100"`.
#[test]
fn mixed_thai_digit_in_context() {
    let tokenizer = FtsTokenizer::new();
    // Input: Thai word + ๑๐๐ + Thai word, with no separating whitespace.
    let tokens = tokenizer.segment_for_fts("ธนาคาร๑๐๐แห่ง");

    let number = tokens
        .iter()
        .find(|tok| tok.kind == TokenKind::Number)
        .expect("expected Number token in mixed string");

    assert!(
        number.synonyms.iter().any(|syn| syn.as_str() == "100"),
        "expected ASCII synonym '100' for ๑๐๐"
    );
}
979
980 // ── abbreviation expansion ────────────────────────────────────────────────
981
/// With the built-in abbreviation map configured, `ก.ค.` must be replaced by
/// its expansion: the expanded characters appear in the token texts and no
/// token is a bare dot.
#[test]
fn abbrev_map_expands_before_segmentation() {
    let tokenizer = FtsTokenizer::builder()
        .abbrevs(crate::abbrev::AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = tokenizer.segment_for_fts("ก.ค.");

    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|tok| tok.text.as_str()).collect();
    // The expansion may be split across several tokens, so the check runs on
    // the concatenation of all token texts instead of on a single token.
    let joined: String = texts.iter().copied().collect();

    let expansion_present = ["กรกฎา", "กรกฎาคม"]
        .iter()
        .any(|needle| joined.contains(*needle));
    assert!(
        expansion_present,
        "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
    );

    let no_dot_token = texts.iter().all(|&piece| piece != ".");
    assert!(
        no_dot_token,
        "dots should be consumed by abbrev expansion, got: {texts:?}"
    );
}
1004
/// The default tokenizer must leave `ก.ค.` unexpanded: at least one token
/// whose text is exactly `"."` has to remain in the output.
#[test]
fn abbrev_expansion_disabled_by_default() {
    let tokenizer = FtsTokenizer::new();
    let tokens = tokenizer.segment_for_fts("ก.ค.");
    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|tok| tok.text.as_str()).collect();

    // A surviving dot token shows that no abbreviation rewrite took place.
    let dot_survives = texts.iter().any(|&piece| piece == ".");
    assert!(
        dot_survives,
        "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
    );
}
1017
1018 // ── soundex synonyms ──────────────────────────────────────────────────────
1019
/// When the builder selects `SoundexAlgorithm::Lk82`, the token `กิน` must
/// carry the value returned by `lk82("กิน")` among its synonyms.
#[test]
fn soundex_lk82_appended_to_thai_synonyms() {
    let tokenizer = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();

    // Reference code computed directly with the standalone function.
    let expected_code = crate::soundex::lk82("กิน");

    let tokens = tokenizer.segment_for_fts("กิน");
    let token = tokens
        .iter()
        .find(|tok| tok.text == "กิน")
        .expect("expected token 'กิน'");

    assert!(
        token.synonyms.contains(&expected_code),
        "expected lk82 code '{expected_code}' in synonyms, got {:?}",
        token.synonyms
    );
}
1037
/// The default tokenizer (no `.soundex()` call) must not attach anything that
/// looks like a soundex code to any token.
#[test]
fn soundex_not_emitted_by_default() {
    let tokenizer = FtsTokenizer::new();
    let tokens = tokenizer.segment_for_fts("กินข้าว");

    // Walk every (token text, synonym) pair in token order.
    let pairs = tokens
        .iter()
        .flat_map(|tok| tok.synonyms.iter().map(move |syn| (&tok.text, syn)));

    for (token_text, syn) in pairs {
        // Heuristic used by this test: a synonym is "soundex-like" when it is
        // exactly four bytes long and entirely ASCII alphanumeric.
        let not_soundex_like =
            syn.len() != 4 || syn.chars().any(|c| !c.is_ascii_alphanumeric());
        assert!(
            not_soundex_like,
            "unexpected soundex-like synonym '{}' on token '{}'",
            syn, token_text
        );
    }
}
1056
/// With `SoundexAlgorithm::Lk82` enabled, the first token of each of กาน, ขาน
/// and คาน must carry the code that `lk82("กาน")` returns.
#[test]
fn soundex_same_sounding_words_share_code_in_index() {
    let fts = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();

    // One reference code, taken from the first word, is checked against all three.
    let code = crate::soundex::lk82("กาน");

    ["กาน", "ขาน", "คาน"].iter().for_each(|word| {
        let tokens = fts.segment_for_fts(word);
        let head = tokens.first().expect("expected at least one token");
        assert!(
            head.synonyms.contains(&code),
            "'{word}' should carry lk82 code '{code}', got {:?}",
            head.synonyms
        );
    });
}
1076
/// Even with `SoundexAlgorithm::Lk82` enabled, tokens from the Latin/number
/// input `"hello 123"` must not receive a soundex-looking synonym.
#[test]
fn soundex_not_emitted_for_non_thai_tokens() {
    let tokenizer = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Lk82)
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = tokenizer.segment_for_fts("hello 123");

    for t in tokens.iter() {
        for syn in t.synonyms.iter() {
            // Same heuristic as the other soundex tests: four bytes, all
            // ASCII alphanumeric, counts as "soundex-like".
            let is_plain = syn.len() != 4 || syn.chars().any(|c| !c.is_ascii_alphanumeric());
            assert!(
                is_plain,
                "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
                t.text
            );
        }
    }
}
1096
/// When the builder selects `SoundexAlgorithm::Udom83`, the token `กิน` must
/// carry the value returned by `udom83("กิน")` among its synonyms.
#[test]
fn soundex_udom83_appended() {
    let tokenizer = FtsTokenizer::builder()
        .soundex(SoundexAlgorithm::Udom83)
        .stopwords(StopwordSet::from_text(""))
        .build();

    // Reference code computed directly with the standalone function.
    let expected = crate::soundex::udom83("กิน");

    let tokens = tokenizer.segment_for_fts("กิน");
    let token = tokens.iter().find(|tok| tok.text == "กิน").unwrap();

    assert!(
        token.synonyms.contains(&expected),
        "expected udom83 code '{expected}' in synonyms, got {:?}",
        token.synonyms
    );
}
1113
/// With the built-in abbreviation map configured, the era abbreviation in
/// `พ.ศ.2567` must be expanded: the expanded characters appear in the token
/// texts and no token is a bare dot.
#[test]
fn abbrev_expansion_date_sentence() {
    let tokenizer = FtsTokenizer::builder()
        .abbrevs(crate::abbrev::AbbrevMap::builtin())
        .stopwords(StopwordSet::from_text(""))
        .build();
    let tokens = tokenizer.segment_for_fts("พ.ศ.2567");

    let texts: alloc::vec::Vec<&str> = tokens.iter().map(|tok| tok.text.as_str()).collect();
    // The expansion may be split across several tokens, so the check runs on
    // the concatenation of all token texts instead of on a single token.
    let joined: String = texts.iter().copied().collect();

    let expansion_present = ["พุทธ", "พุทธศักราช"]
        .iter()
        .any(|needle| joined.contains(*needle));
    assert!(
        expansion_present,
        "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
    );

    let no_dot_token = texts.iter().all(|&piece| piece != ".");
    assert!(
        no_dot_token,
        "dots should be consumed by expansion, got: {texts:?}"
    );
}
1135}