// kham_core/fts.rs
1//! Full-text search pipeline for Thai text.
2//!
3//! [`FtsTokenizer`] orchestrates the complete FTS indexing pipeline:
4//! normalise → segment → tag stopwords → expand synonyms → attach position.
5//!
6//! The output [`FtsToken`] slice is consumed by the PostgreSQL `kham-pg`
7//! extension and by any other caller that needs FTS-ready lexemes.
8//!
9//! # Positions
10//!
11//! `position` is the ordinal index of the token in the non-whitespace token
12//! sequence (0-based). Stopwords retain their position so that phrase-distance
13//! scoring remains correct when stopwords are later omitted from the index.
14//!
15//! # Example
16//!
17//! ```rust
18//! use kham_core::fts::{FtsTokenizer, FtsToken};
19//!
20//! let fts = FtsTokenizer::new();
21//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
//! for t in &tokens {
//!     println!("{} pos={} stop={}", t.text, t.position, t.is_stop);
//! }
25//! ```
26
27use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
/// A token produced by the FTS pipeline, ready for lexeme indexing.
///
/// One instance is emitted by [`FtsTokenizer::segment_for_fts`] per
/// non-whitespace token of the normalised input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// The token text (owned; may be normalised).
    pub text: String,
    /// Ordinal position in the non-whitespace token sequence (0-based).
    ///
    /// Whitespace tokens are dropped before positions are assigned, so
    /// surviving tokens carry consecutive positions with no gaps. Stopwords
    /// keep their position so phrase-distance scoring stays correct when
    /// they are later omitted from the index.
    pub position: usize,
    /// Script / category of the original token.
    pub kind: TokenKind,
    /// `true` if this token matches the stopword list.
    pub is_stop: bool,
    /// Synonym expansions (empty if none configured or no match).
    pub synonyms: Vec<String>,
    /// Character trigrams — populated only for [`TokenKind::Unknown`] tokens.
    pub trigrams: Vec<String>,
    /// Primary part-of-speech tag from the lookup table, or `None` if the word
    /// is not in the table (OOV) or is not a Thai token.
    pub pos: Option<PosTag>,
    /// Named entity category, or `None` if the token is not in the NE
    /// gazetteer. When set, `kind` is [`TokenKind::Named`]`(ne)`.
    pub ne: Option<NamedEntityKind>,
}
64
/// Builder for [`FtsTokenizer`].
///
/// Every `None` field falls back to the default chosen in
/// [`FtsTokenizerBuilder::build`]: built-in stopword/POS/NE tables, empty
/// synonyms, trigram size 3, number normalization on, and
/// romanization / abbreviation expansion / soundex disabled.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    /// Custom stopword set; `None` → [`StopwordSet::builtin`].
    stopwords: Option<StopwordSet>,
    /// Synonym expansions; `None` → [`SynonymMap::empty`].
    synonyms: Option<SynonymMap>,
    /// N-gram size for Unknown tokens; `None` → 3 (trigrams).
    ngram_size: Option<usize>,
    /// POS lookup table; `None` → [`PosTagger::builtin`].
    pos_tagger: Option<PosTagger>,
    /// NE gazetteer; `None` → [`NeTagger::builtin`].
    ne_tagger: Option<NeTagger>,
    /// RTGS romanization map; `None` → feature disabled.
    romanization: Option<RomanizationMap>,
    /// Abbreviation expansion map; `None` → feature disabled.
    abbrev_map: Option<AbbrevMap>,
    /// `None` means "use default (true)".
    number_normalize: Option<bool>,
    /// Phonetic soundex algorithm; `None` → feature disabled.
    soundex: Option<SoundexAlgorithm>,
}
79
impl FtsTokenizerBuilder {
    /// Use a custom stopword set instead of the built-in list.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let stops = StopwordSet::from_text("กิน\nข้าว\n");
    /// let fts = FtsTokenizer::builder().stopwords(stops).build();
    /// let tokens = fts.segment_for_fts("กินข้าว");
    /// assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
    /// ```
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

    /// Attach a synonym map for expansion.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::synonym::SynonymMap;
    ///
    /// // TSV: canonical TAB synonym1 TAB synonym2 …
    /// let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
    /// let fts = FtsTokenizer::builder().synonyms(syns).build();
    /// let tokens = fts.segment_for_fts("รถ");
    /// let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
    /// assert!(t.synonyms.contains(&String::from("รถยนต์")));
    /// ```
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

    /// Override the n-gram size used for [`TokenKind::Unknown`] tokens.
    ///
    /// Default: 3 (trigrams). Set to 0 to disable n-gram generation.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// // Disable n-grams entirely — useful when index size must be small
    /// let fts = FtsTokenizer::builder()
    ///     .ngram_size(0)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
    /// assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
    /// ```
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

    /// Use a custom POS tagger instead of the built-in table.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::pos::{PosTag, PosTagger};
    ///
    /// // Custom TSV: word TAB POS_TAG
    /// let tagger = PosTagger::from_tsv("กิน\tVERB\n");
    /// let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
    /// // Segment กิน alone so it is not merged into a compound
    /// let tokens = fts.segment_for_fts("กิน");
    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
    /// assert_eq!(t.pos, Some(PosTag::Verb));
    /// ```
    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
        self.pos_tagger = Some(t);
        self
    }

    /// Use a custom NE gazetteer instead of the built-in table.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::ne::NeTagger;
    /// use kham_core::TokenKind;
    ///
    /// // Domain-specific NE list: word TAB NE_TAG
    /// let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
    /// let fts = FtsTokenizer::builder().ne_tagger(ne).build();
    /// let tokens = fts.segment_for_fts("เซเรน่า");
    /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
    /// ```
    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
        self.ne_tagger = Some(t);
        self
    }

    /// Attach a romanization map so RTGS forms are added to [`FtsToken::synonyms`].
    ///
    /// When set, each Thai and Named token whose text is found in the map gets its
    /// RTGS romanization appended to `synonyms`, enabling Latin-script queries
    /// (e.g. `kin`) to match Thai-script documents (e.g. `กิน`) in PostgreSQL FTS.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::romanizer::RomanizationMap;
    ///
    /// // TSV: Thai word TAB RTGS romanization
    /// let rom = RomanizationMap::from_tsv("กิน\tkin\n");
    /// let fts = FtsTokenizer::builder().romanization(rom).build();
    /// let tokens = fts.segment_for_fts("กิน");
    /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
    /// // Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
    /// assert!(t.synonyms.contains(&String::from("kin")));
    /// ```
    pub fn romanization(mut self, m: RomanizationMap) -> Self {
        self.romanization = Some(m);
        self
    }

    /// Attach an abbreviation map for pre-tokenisation expansion.
    ///
    /// When set, [`FtsTokenizer::segment_for_fts`] calls
    /// [`AbbrevMap::expand_text`] on the normalised input before segmentation.
    /// This replaces abbreviated forms (e.g. `ก.ค.`) with their canonical
    /// expansions (`กรกฎาคม`) so they are indexed and searchable by full form.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::abbrev::AbbrevMap;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .abbrevs(AbbrevMap::builtin())
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// // ก.ค. expands to กรกฎาคม before segmentation — dots disappear
    /// let tokens = fts.segment_for_fts("ก.ค.");
    /// let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    /// assert!(!texts.contains(&"."), "dots should be consumed by expansion");
    /// ```
    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
        self.abbrev_map = Some(m);
        self
    }

    /// Enable or disable number normalization (default: `true`).
    ///
    /// When enabled:
    /// - [`TokenKind::Number`] tokens that contain Thai digits (๐–๙) get the
    ///   ASCII digit string added to their [`FtsToken::synonyms`]
    ///   (e.g. `๑๒๓` → synonym `"123"`).
    /// - [`TokenKind::Thai`] tokens that are recognised Thai cardinal number
    ///   words get their decimal value added to `synonyms`
    ///   (e.g. `หนึ่งร้อย` → synonym `"100"`).
    ///
    /// This lets queries using either script match documents written in the
    /// other. Set to `false` to opt out.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::TokenKind;
    ///
    /// // Default (true): ๑๒๓ gets ASCII synonym "123"
    /// let fts = FtsTokenizer::new();
    /// let tokens = fts.segment_for_fts("๑๒๓");
    /// let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
    /// assert!(num.synonyms.contains(&String::from("123")));
    ///
    /// // Opt out: no conversion performed
    /// let fts_off = FtsTokenizer::builder().number_normalize(false).build();
    /// let tokens_off = fts_off.segment_for_fts("๑๒๓");
    /// let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
    /// assert!(!num_off.synonyms.contains(&String::from("123")));
    /// ```
    pub fn number_normalize(mut self, v: bool) -> Self {
        self.number_normalize = Some(v);
        self
    }

    /// Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
    ///
    /// When set, each Thai and Named token whose text contains Thai consonants gets its
    /// soundex code appended to [`FtsToken::synonyms`], enabling phonetic fuzzy matching
    /// in full-text search (e.g. querying `"1600"` matches กาน, ขาน, and คาน with lk82).
    ///
    /// [`SoundexAlgorithm::Lk82`] and [`SoundexAlgorithm::Udom83`] produce fixed
    /// 4-character codes and are the recommended choices for FTS indexing.
    /// [`SoundexAlgorithm::MetaSound`] produces variable-length codes and is more
    /// collision-prone at word level — prefer lk82 or udom83 for general FTS use.
    ///
    /// Disabled by default — call this method to opt in.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::soundex::{lk82, SoundexAlgorithm};
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .soundex(SoundexAlgorithm::Lk82)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// // กาน / ขาน / คาน all map to the same lk82 code — stored once per token
    /// for word in &["กาน", "ขาน", "คาน"] {
    ///     let tokens = fts.segment_for_fts(word);
    ///     let t = tokens.first().unwrap();
    ///     assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
    /// }
    /// ```
    pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
        self.soundex = Some(algo);
        self
    }

    /// Consume the builder and return a configured [`FtsTokenizer`].
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::fts::FtsTokenizer;
    /// use kham_core::soundex::SoundexAlgorithm;
    /// use kham_core::stopwords::StopwordSet;
    ///
    /// let fts = FtsTokenizer::builder()
    ///     .soundex(SoundexAlgorithm::Lk82)
    ///     .stopwords(StopwordSet::from_text(""))
    ///     .build();
    /// assert!(!fts.segment_for_fts("กินข้าว").is_empty());
    /// ```
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            // Built-in linguistic tables unless explicitly overridden.
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            // Trigrams by default; 0 disables n-gram generation.
            ngram_size: self.ngram_size.unwrap_or(3),
            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
            // Opt-in features stay `None`/off when not configured.
            romanization: self.romanization,
            abbrev_map: self.abbrev_map,
            number_normalize: self.number_normalize.unwrap_or(true),
            soundex: self.soundex,
        }
    }
}
342
/// Full-text search tokenizer for Thai text.
///
/// Wraps [`Tokenizer`] with stopword filtering, synonym expansion, and n-gram
/// generation for out-of-vocabulary tokens.
///
/// Construct once and reuse:
///
/// ```rust
/// use kham_core::fts::FtsTokenizer;
///
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct FtsTokenizer {
    // Underlying normaliser + dictionary segmenter.
    tokenizer: Tokenizer,
    // Words tagged `is_stop` and excluded by `index_tokens`/`lexemes`.
    stopwords: StopwordSet,
    // Synonym expansions attached to each matching token.
    synonyms: SynonymMap,
    // Character n-gram size for Unknown tokens; 0 disables n-grams.
    ngram_size: usize,
    // Lookup table backing `FtsToken::pos` for Thai tokens.
    pos_tagger: PosTagger,
    // Gazetteer used to tag named entities after segmentation.
    ne_tagger: NeTagger,
    // Optional RTGS romanization → extra Latin-script synonyms.
    romanization: Option<RomanizationMap>,
    // Optional abbreviation expansion applied before segmentation.
    abbrev_map: Option<AbbrevMap>,
    // Add ASCII/decimal synonyms for Thai digits and number words.
    number_normalize: bool,
    // Optional phonetic algorithm → soundex-code synonyms.
    soundex: Option<SoundexAlgorithm>,
}
369
370impl FtsTokenizer {
371 /// Create an [`FtsTokenizer`] with built-in stopwords and no synonyms.
372 ///
373 /// # Example
374 ///
375 /// ```rust
376 /// use kham_core::fts::FtsTokenizer;
377 ///
378 /// let fts = FtsTokenizer::new();
379 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
380 /// // Built-in stopword กับ is excluded; content words are present
381 /// assert!(!lexemes.contains(&String::from("กับ")));
382 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
383 /// ```
384 pub fn new() -> Self {
385 FtsTokenizerBuilder::default().build()
386 }
387
388 /// Return a [`FtsTokenizerBuilder`] for custom configuration.
389 ///
390 /// # Example
391 ///
392 /// ```rust
393 /// use kham_core::fts::FtsTokenizer;
394 /// use kham_core::soundex::SoundexAlgorithm;
395 /// use kham_core::synonym::SynonymMap;
396 ///
397 /// let fts = FtsTokenizer::builder()
398 /// .synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
399 /// .soundex(SoundexAlgorithm::Lk82)
400 /// .build();
401 /// assert!(!fts.segment_for_fts("รถ").is_empty());
402 /// ```
403 pub fn builder() -> FtsTokenizerBuilder {
404 FtsTokenizerBuilder::default()
405 }
406
407 /// Segment `text` and annotate each token for FTS indexing.
408 ///
409 /// Normalises the input text before segmentation so that สระลอย and stacked
410 /// tone marks are handled correctly. Whitespace tokens are excluded.
411 ///
412 /// The returned `Vec<FtsToken>` covers all non-whitespace tokens. Call
413 /// [`index_tokens`] instead when you only need the tokens to be indexed
414 /// (stopwords excluded).
415 ///
416 /// [`index_tokens`]: FtsTokenizer::index_tokens
417 ///
418 /// # Examples
419 ///
420 /// ```rust
421 /// use kham_core::fts::FtsTokenizer;
422 ///
423 /// let fts = FtsTokenizer::new();
424 /// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
425 /// // Positions are 0-based and sequential across non-whitespace tokens
426 /// for (i, t) in tokens.iter().enumerate() {
427 /// assert_eq!(t.position, i);
428 /// }
429 /// // กับ is a common conjunction — marked as a stopword
430 /// let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
431 /// assert!(kap.is_stop);
432 /// ```
433 ///
434 /// Named entities are tagged automatically — `kind` becomes `TokenKind::Named`:
435 ///
436 /// ```rust
437 /// use kham_core::fts::FtsTokenizer;
438 /// use kham_core::TokenKind;
439 ///
440 /// let fts = FtsTokenizer::new();
441 /// let tokens = fts.segment_for_fts("ไปกรุงเทพ");
442 /// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
443 /// ```
444 ///
445 /// Enable phonetic synonyms with [`FtsTokenizerBuilder::soundex`]:
446 ///
447 /// ```rust
448 /// use kham_core::fts::FtsTokenizer;
449 /// use kham_core::soundex::SoundexAlgorithm;
450 ///
451 /// let fts = FtsTokenizer::builder()
452 /// .soundex(SoundexAlgorithm::Lk82)
453 /// .build();
454 /// let tokens = fts.segment_for_fts("กิน");
455 /// let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
456 /// // synonyms now contains the lk82 code, enabling fuzzy phonetic matching
457 /// assert!(!t.synonyms.is_empty());
458 /// ```
459 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
460 let normalized = self.tokenizer.normalize(text);
461 // Expand abbreviations (e.g. ก.ค. → กรกฎาคม) before segmentation so
462 // dot-containing patterns are replaced as single units.
463 let expanded = match self.abbrev_map.as_ref() {
464 Some(am) => am.expand_text(&normalized),
465 None => normalized,
466 };
467 let raw_tokens = self
468 .ne_tagger
469 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
470
471 let mut result = Vec::with_capacity(raw_tokens.len());
472 let mut position = 0usize;
473
474 for token in &raw_tokens {
475 if token.kind == TokenKind::Whitespace {
476 continue;
477 }
478
479 let is_stop = self.stopwords.contains(token.text);
480 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
481 let mut synonyms = self
482 .synonyms
483 .expand(token.text)
484 .map(|s| s.to_vec())
485 .unwrap_or_default();
486 if is_thai_or_named {
487 if let Some(ref rom) = self.romanization {
488 if let Some(rtgs) = rom.romanize(token.text) {
489 synonyms.push(String::from(rtgs));
490 }
491 }
492 if let Some(algo) = self.soundex {
493 let code = soundex(token.text, algo);
494 if !code.chars().all(|c| c == '0') {
495 synonyms.push(code);
496 }
497 }
498 }
499 if self.number_normalize {
500 match token.kind {
501 // Number token with Thai digits → add ASCII form as synonym.
502 TokenKind::Number => {
503 let ascii = thai_digits_to_ascii(token.text);
504 if ascii != token.text {
505 synonyms.push(ascii);
506 }
507 }
508 // Thai token that is a recognised number word → add decimal string.
509 TokenKind::Thai => {
510 if let Some(decimal) = thai_word_to_decimal(token.text) {
511 synonyms.push(decimal);
512 }
513 }
514 _ => {}
515 }
516 }
517 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
518 char_ngrams(token.text, self.ngram_size)
519 .map(String::from)
520 .collect()
521 } else {
522 Vec::new()
523 };
524 let ne = if let TokenKind::Named(k) = token.kind {
525 Some(k)
526 } else {
527 None
528 };
529 let pos = if token.kind == TokenKind::Thai {
530 self.pos_tagger.tag(token.text)
531 } else {
532 None
533 };
534
535 result.push(FtsToken {
536 text: String::from(token.text),
537 position,
538 kind: token.kind,
539 is_stop,
540 synonyms,
541 trigrams,
542 pos,
543 ne,
544 });
545
546 position += 1;
547 }
548
549 result
550 }
551
552 /// Return only the tokens to be written into a search index.
553 ///
554 /// Filters out stopwords and whitespace. Each [`FtsToken`] still carries
555 /// its original `position` so phrase-distance scoring remains correct.
556 ///
557 /// # Example
558 ///
559 /// ```rust
560 /// use kham_core::fts::FtsTokenizer;
561 ///
562 /// let fts = FtsTokenizer::new();
563 /// let tokens = fts.index_tokens("กินข้าวกับปลา");
564 /// // No stopwords in the index
565 /// assert!(tokens.iter().all(|t| !t.is_stop));
566 /// // Positions are preserved from the full sequence for phrase scoring
567 /// let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
568 /// assert!(positions.windows(2).all(|w| w[0] < w[1]));
569 /// ```
570 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
571 self.segment_for_fts(text)
572 .into_iter()
573 .filter(|t| !t.is_stop)
574 .collect()
575 }
576
577 /// Collect all lexeme strings to be stored in a `tsvector`.
578 ///
579 /// Returns one string per non-stop token, plus synonym expansions and
580 /// trigrams for unknown tokens. Duplicates are not removed (the caller or
581 /// PostgreSQL handles deduplication).
582 ///
583 /// # Example
584 ///
585 /// ```rust
586 /// use kham_core::fts::FtsTokenizer;
587 ///
588 /// let fts = FtsTokenizer::new();
589 /// let lexemes = fts.lexemes("กินข้าวกับปลา");
590 /// // Content words are present; stopword กับ is absent
591 /// assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
592 /// assert!(!lexemes.contains(&String::from("กับ")));
593 /// ```
594 ///
595 /// With Thai digit normalization (enabled by default), both scripts match:
596 ///
597 /// ```rust
598 /// use kham_core::fts::FtsTokenizer;
599 ///
600 /// let fts = FtsTokenizer::new();
601 /// let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
602 /// // ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
603 /// assert!(lexemes.contains(&String::from("100")));
604 /// ```
605 pub fn lexemes(&self, text: &str) -> Vec<String> {
606 let tokens = self.index_tokens(text);
607 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
608 for t in tokens {
609 out.push(t.text.clone());
610 out.extend(t.synonyms);
611 out.extend(t.trigrams);
612 }
613 out
614 }
615}
616
617impl Default for FtsTokenizer {
618 fn default() -> Self {
619 Self::new()
620 }
621}
622
623// ---------------------------------------------------------------------------
624// Tests
625// ---------------------------------------------------------------------------
626
627#[cfg(test)]
628mod tests {
629 use super::*;
630 use crate::stopwords::StopwordSet;
631 use crate::synonym::SynonymMap;
632
633 fn fts() -> FtsTokenizer {
634 FtsTokenizer::new()
635 }
636
637 // ── segment_for_fts ───────────────────────────────────────────────────────
638
639 #[test]
640 fn empty_input_returns_empty() {
641 assert!(fts().segment_for_fts("").is_empty());
642 }
643
644 #[test]
645 fn whitespace_tokens_excluded() {
646 let tokens = fts().segment_for_fts("กิน ข้าว");
647 assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
648 }
649
650 #[test]
651 fn positions_are_sequential() {
652 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
653 for (i, t) in tokens.iter().enumerate() {
654 assert_eq!(t.position, i, "position mismatch at index {i}");
655 }
656 }
657
658 #[test]
659 fn known_stopword_is_tagged() {
660 // "กับ" is a common conjunction and should be in the built-in stopword list
661 let tokens = fts().segment_for_fts("กินข้าวกับปลา");
662 let kap = tokens.iter().find(|t| t.text == "กับ");
663 assert!(kap.is_some(), "expected 'กับ' token");
664 assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
665 }
666
667 #[test]
668 fn content_words_not_tagged_as_stop() {
669 let tokens = fts().segment_for_fts("โรงพยาบาล");
670 // May be OOV but should not be a stopword
671 for t in &tokens {
672 assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
673 }
674 }
675
676 #[test]
677 fn text_is_reconstructable() {
678 // All tokens joined == normalised input (whitespace dropped)
679 let fts = fts();
680 let text = "กินข้าวกับปลา";
681 let normalized = fts.tokenizer.normalize(text);
682 let tokens = fts.segment_for_fts(text);
683 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
684 assert_eq!(rebuilt, normalized);
685 }
686
687 // ── synonym expansion ─────────────────────────────────────────────────────
688
689 #[test]
690 fn synonym_expansion_attached() {
691 let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
692 let fts = FtsTokenizer::builder()
693 .synonyms(synonyms)
694 .stopwords(StopwordSet::from_text(""))
695 .build();
696 // Segment a text containing "คอม" — need it in dict or it lands as Unknown
697 // Use builder with custom word so the segmenter recognises it
698 let tokens = fts.segment_for_fts("คอม");
699 let t = tokens.iter().find(|t| t.text == "คอม");
700 if let Some(tok) = t {
701 assert!(
702 tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
703 "expected synonym expansion, got {:?}",
704 tok.synonyms
705 );
706 }
707 }
708
709 #[test]
710 fn no_synonyms_when_map_empty() {
711 let tokens = fts().segment_for_fts("กินข้าว");
712 for t in &tokens {
713 assert!(t.synonyms.is_empty());
714 }
715 }
716
717 // ── unknown token trigrams ────────────────────────────────────────────────
718
719 #[test]
720 fn unknown_token_gets_trigrams() {
721 // "กิ" = consonant + sara-i, a single 2-char TCC that is not a word.
722 // With ngram_size=2 the token should yield one bigram ("กิ").
723 // The newmm DP emits Unknown tokens one TCC at a time, so multi-char TCCs
724 // (like "กิ") are the shortest unit that can produce n-grams.
725 let fts = FtsTokenizer::builder()
726 .ngram_size(2)
727 .stopwords(StopwordSet::from_text(""))
728 .build();
729 let tokens = fts.segment_for_fts("กิ");
730 let unknown: Vec<_> = tokens
731 .iter()
732 .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
733 .collect();
734 assert!(
735 !unknown.is_empty(),
736 "expected at least one multi-char Unknown token for 'กิ'"
737 );
738 for u in &unknown {
739 assert!(
740 !u.trigrams.is_empty(),
741 "unknown token '{}' ({} chars) should have bigrams",
742 u.text,
743 u.text.chars().count()
744 );
745 }
746 }
747
748 #[test]
749 fn known_thai_token_has_no_trigrams() {
750 let tokens = fts().segment_for_fts("กิน");
751 for t in &tokens {
752 if t.kind == TokenKind::Thai {
753 assert!(
754 t.trigrams.is_empty(),
755 "known Thai token '{}' should not have trigrams",
756 t.text
757 );
758 }
759 }
760 }
761
762 #[test]
763 fn ngram_size_zero_disables_trigrams() {
764 let fts = FtsTokenizer::builder()
765 .ngram_size(0)
766 .stopwords(StopwordSet::from_text(""))
767 .build();
768 let tokens = fts.segment_for_fts("กขคง");
769 for t in &tokens {
770 assert!(t.trigrams.is_empty());
771 }
772 }
773
774 // ── index_tokens ──────────────────────────────────────────────────────────
775
776 #[test]
777 fn index_tokens_excludes_stopwords() {
778 let tokens = fts().index_tokens("กินข้าวกับปลา");
779 assert!(tokens.iter().all(|t| !t.is_stop));
780 }
781
782 #[test]
783 fn index_tokens_preserves_positions() {
784 // Positions in index_tokens must be a subset of segment_for_fts positions
785 let all = fts().segment_for_fts("กินข้าวกับปลา");
786 let indexed = fts().index_tokens("กินข้าวกับปลา");
787 for t in &indexed {
788 assert!(
789 all.iter().any(|a| a.position == t.position),
790 "indexed token at position {} not found in full token list",
791 t.position
792 );
793 }
794 }
795
796 // ── lexemes ───────────────────────────────────────────────────────────────
797
798 #[test]
799 fn lexemes_returns_non_stop_texts() {
800 let lexemes = fts().lexemes("กินข้าวกับปลา");
801 // "กับ" is a stopword — should not appear
802 assert!(!lexemes.contains(&String::from("กับ")));
803 // Content words should appear
804 assert!(
805 lexemes
806 .iter()
807 .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
808 "expected content words in lexemes: {lexemes:?}"
809 );
810 }
811
812 #[test]
813 fn lexemes_empty_input_is_empty() {
814 assert!(fts().lexemes("").is_empty());
815 }
816
817 // ── multi-token NE ────────────────────────────────────────────────────────
818
819 #[test]
820 fn multi_token_ne_merged_in_pipeline() {
821 // กรุงเทพ is in the NE gazetteer as PLACE; the segmenter splits it
822 // into กรุง+เทพ. The FTS pipeline must merge them into one Named token.
823 let fts = FtsTokenizer::new();
824 let tokens = fts.segment_for_fts("ไปกรุงเทพ");
825 let named: Vec<_> = tokens
826 .iter()
827 .filter(|t| matches!(t.kind, TokenKind::Named(_)))
828 .collect();
829 assert!(
830 named.iter().any(|t| t.text == "กรุงเทพ"),
831 "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
832 tokens
833 .iter()
834 .map(|t| (&t.text, &t.kind))
835 .collect::<alloc::vec::Vec<_>>()
836 );
837 }
838
839 #[test]
840 fn multi_token_ne_reconstructable() {
841 // Texts of all non-whitespace tokens must still reconstruct the normalized input.
842 let fts = FtsTokenizer::new();
843 let text = "ไปกรุงเทพ";
844 let normalized = fts.tokenizer.normalize(text);
845 let tokens = fts.segment_for_fts(text);
846 let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
847 assert_eq!(rebuilt, normalized);
848 }
849
850 // ── builder ───────────────────────────────────────────────────────────────
851
852 #[test]
853 fn builder_custom_stopwords() {
854 let stops = StopwordSet::from_text("กิน\n");
855 let fts = FtsTokenizer::builder().stopwords(stops).build();
856 let tokens = fts.segment_for_fts("กินข้าว");
857 let gin = tokens.iter().find(|t| t.text == "กิน");
858 if let Some(t) = gin {
859 assert!(t.is_stop, "'กิน' should be stop with custom list");
860 }
861 }
862
863 #[test]
864 fn builder_default_equals_new() {
865 // Both paths should produce the same result for a simple input
866 let a = FtsTokenizer::new().lexemes("กินข้าว");
867 let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
868 assert_eq!(a, b);
869 }
870
871 // ── number normalization ──────────────────────────────────────────────────
872
873 #[test]
874 fn thai_digit_token_gets_ascii_synonym() {
875 let fts = FtsTokenizer::new();
876 let tokens = fts.segment_for_fts("๑๒๓");
877 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
878 assert!(num.is_some(), "expected a Number token");
879 let t = num.unwrap();
880 assert!(
881 t.synonyms.contains(&String::from("123")),
882 "Thai digit token should have ASCII synonym, got {:?}",
883 t.synonyms
884 );
885 }
886
887 #[test]
888 fn ascii_digit_token_has_no_extra_synonym() {
889 // ASCII digits need no conversion — synonyms should be empty (no map, no rom).
890 let fts = FtsTokenizer::new();
891 let tokens = fts.segment_for_fts("123");
892 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
893 assert!(num.is_some(), "expected a Number token");
894 assert!(
895 !num.unwrap().synonyms.contains(&String::from("123")),
896 "ASCII digit token should not duplicate itself as a synonym"
897 );
898 }
899
900 #[test]
901 fn thai_number_word_gets_decimal_synonym() {
902 // หนึ่งร้อย may segment as a single Thai token or multiple tokens depending
903 // on the dictionary. We check that at least one token carries "100" in synonyms.
904 let fts = FtsTokenizer::new();
905 let tokens = fts.segment_for_fts("หนึ่งร้อย");
906 let has_hundred = tokens
907 .iter()
908 .any(|t| t.synonyms.contains(&String::from("100")));
909 // หนึ่ง alone = Some(1), ร้อย alone = Some(100) — at least ร้อย should match.
910 assert!(
911 has_hundred,
912 "expected a token with decimal synonym '100', tokens: {:?}",
913 tokens
914 .iter()
915 .map(|t| (&t.text, &t.synonyms))
916 .collect::<alloc::vec::Vec<_>>()
917 );
918 }
919
920 #[test]
921 fn number_normalize_false_disables_conversion() {
922 let fts = FtsTokenizer::builder()
923 .number_normalize(false)
924 .stopwords(StopwordSet::from_text(""))
925 .build();
926 let tokens = fts.segment_for_fts("๑๒๓");
927 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
928 assert!(num.is_some());
929 assert!(
930 !num.unwrap().synonyms.contains(&String::from("123")),
931 "number_normalize=false should suppress ASCII synonym"
932 );
933 }
934
935 #[test]
936 fn mixed_thai_digit_in_context() {
937 // "ธนาคาร๑๐๐แห่ง" — the ๑๐๐ part should be a Number token with synonym "100"
938 let fts = FtsTokenizer::new();
939 let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
940 let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
941 assert!(num.is_some(), "expected Number token in mixed string");
942 assert!(
943 num.unwrap().synonyms.contains(&String::from("100")),
944 "expected ASCII synonym '100' for ๑๐๐"
945 );
946 }
947
948 // ── abbreviation expansion ────────────────────────────────────────────────
949
950 #[test]
951 fn abbrev_map_expands_before_segmentation() {
952 use crate::abbrev::AbbrevMap;
953 let fts = FtsTokenizer::builder()
954 .abbrevs(AbbrevMap::builtin())
955 .stopwords(StopwordSet::from_text(""))
956 .build();
957 // ก.ค. → กรกฎาคม before segmentation. The segmenter may split the
958 // expansion further (กรกฎา + คม) — what matters is that dots are gone
959 // and the Thai characters of กรกฎาคม are present.
960 let tokens = fts.segment_for_fts("ก.ค.");
961 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
962 let joined: String = texts.concat();
963 assert!(
964 joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
965 "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
966 );
967 assert!(
968 !texts.contains(&"."),
969 "dots should be consumed by abbrev expansion, got: {texts:?}"
970 );
971 }
972
973 #[test]
974 fn abbrev_expansion_disabled_by_default() {
975 // FtsTokenizer::new() has no abbrev_map — ก.ค. stays as individual tokens.
976 let fts = FtsTokenizer::new();
977 let tokens = fts.segment_for_fts("ก.ค.");
978 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
979 // Without expansion the dot(s) must still be present as punctuation tokens.
980 assert!(
981 texts.contains(&"."),
982 "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
983 );
984 }
985
986 // ── soundex synonyms ──────────────────────────────────────────────────────
987
988 #[test]
989 fn soundex_lk82_appended_to_thai_synonyms() {
990 use crate::soundex::lk82;
991 let fts = FtsTokenizer::builder()
992 .soundex(SoundexAlgorithm::Lk82)
993 .stopwords(StopwordSet::from_text(""))
994 .build();
995 let tokens = fts.segment_for_fts("กิน");
996 let t = tokens.iter().find(|t| t.text == "กิน");
997 assert!(t.is_some(), "expected token 'กิน'");
998 let expected_code = lk82("กิน");
999 assert!(
1000 t.unwrap().synonyms.contains(&expected_code),
1001 "expected lk82 code '{expected_code}' in synonyms, got {:?}",
1002 t.unwrap().synonyms
1003 );
1004 }
1005
1006 #[test]
1007 fn soundex_not_emitted_by_default() {
1008 // Without .soundex() in the builder, no soundex codes should appear.
1009 let fts = FtsTokenizer::new();
1010 let tokens = fts.segment_for_fts("กินข้าว");
1011 for t in &tokens {
1012 // A soundex code is 4 ASCII chars (lk82/udom83); no synonym should look like one.
1013 for syn in &t.synonyms {
1014 let looks_like_soundex =
1015 syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1016 assert!(
1017 !looks_like_soundex,
1018 "unexpected soundex-like synonym '{}' on token '{}'",
1019 syn, t.text
1020 );
1021 }
1022 }
1023 }
1024
1025 #[test]
1026 fn soundex_same_sounding_words_share_code_in_index() {
1027 // กาน and ขาน share lk82 code "1600"; both should carry it as a synonym.
1028 use crate::soundex::lk82;
1029 let fts = FtsTokenizer::builder()
1030 .soundex(SoundexAlgorithm::Lk82)
1031 .stopwords(StopwordSet::from_text(""))
1032 .build();
1033 let code = lk82("กาน");
1034 for word in &["กาน", "ขาน", "คาน"] {
1035 let tokens = fts.segment_for_fts(word);
1036 let t = tokens.first().expect("expected at least one token");
1037 assert!(
1038 t.synonyms.contains(&code),
1039 "'{word}' should carry lk82 code '{code}', got {:?}",
1040 t.synonyms
1041 );
1042 }
1043 }
1044
1045 #[test]
1046 fn soundex_not_emitted_for_non_thai_tokens() {
1047 let fts = FtsTokenizer::builder()
1048 .soundex(SoundexAlgorithm::Lk82)
1049 .stopwords(StopwordSet::from_text(""))
1050 .build();
1051 let tokens = fts.segment_for_fts("hello 123");
1052 for t in &tokens {
1053 for syn in &t.synonyms {
1054 let looks_like_soundex =
1055 syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
1056 assert!(
1057 !looks_like_soundex,
1058 "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
1059 t.text
1060 );
1061 }
1062 }
1063 }
1064
1065 #[test]
1066 fn soundex_udom83_appended() {
1067 use crate::soundex::udom83;
1068 let fts = FtsTokenizer::builder()
1069 .soundex(SoundexAlgorithm::Udom83)
1070 .stopwords(StopwordSet::from_text(""))
1071 .build();
1072 let tokens = fts.segment_for_fts("กิน");
1073 let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
1074 let expected = udom83("กิน");
1075 assert!(
1076 t.synonyms.contains(&expected),
1077 "expected udom83 code '{expected}' in synonyms, got {:?}",
1078 t.synonyms
1079 );
1080 }
1081
1082 #[test]
1083 fn abbrev_expansion_date_sentence() {
1084 use crate::abbrev::AbbrevMap;
1085 let fts = FtsTokenizer::builder()
1086 .abbrevs(AbbrevMap::builtin())
1087 .stopwords(StopwordSet::from_text(""))
1088 .build();
1089 // พ.ศ. → พุทธศักราช; the segmenter may split it further — verify the
1090 // chars are present and dots are gone.
1091 let tokens = fts.segment_for_fts("พ.ศ.2567");
1092 let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
1093 let joined: String = texts.concat();
1094 assert!(
1095 joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
1096 "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
1097 );
1098 assert!(
1099 !texts.contains(&"."),
1100 "dots should be consumed by expansion, got: {texts:?}"
1101 );
1102 }
1103}