1use alloc::string::String;
28use alloc::vec::Vec;
29
30use crate::abbrev::AbbrevMap;
31use crate::ne::NeTagger;
32use crate::ngram::char_ngrams;
33use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
34use crate::pos::{PosTag, PosTagger};
35use crate::romanizer::RomanizationMap;
36use crate::soundex::{soundex, SoundexAlgorithm};
37use crate::stopwords::StopwordSet;
38use crate::synonym::SynonymMap;
39use crate::token::{NamedEntityKind, TokenKind};
40use crate::Tokenizer;
41
/// A single token produced by [`FtsTokenizer::segment_for_fts`], enriched
/// with the metadata needed for full-text-search indexing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// Surface text of the token (taken from the normalized input).
    pub text: String,
    /// Zero-based position counted over non-whitespace tokens only.
    pub position: usize,
    /// Token classification from segmentation / named-entity tagging.
    pub kind: TokenKind,
    /// True when the token text is present in the configured stopword set.
    pub is_stop: bool,
    /// Extra index terms attached to this token: dictionary synonym
    /// expansions, plus (when enabled) romanization, soundex codes and
    /// normalized number forms.
    pub synonyms: Vec<String>,
    /// Character n-grams, emitted only for `TokenKind::Unknown` tokens
    /// (used for fuzzy matching of out-of-dictionary words).
    pub trigrams: Vec<String>,
    /// Part-of-speech tag; assigned only to `TokenKind::Thai` tokens.
    pub pos: Option<PosTag>,
    /// Named-entity kind when `kind` is `TokenKind::Named(_)`.
    pub ne: Option<NamedEntityKind>,
}
64
/// Builder for [`FtsTokenizer`]. Any field left as `None` falls back to a
/// default when [`FtsTokenizerBuilder::build`] runs.
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,       // default: StopwordSet::builtin()
    synonyms: Option<SynonymMap>,         // default: SynonymMap::empty()
    ngram_size: Option<usize>,            // default: 3
    pos_tagger: Option<PosTagger>,        // default: PosTagger::builtin()
    ne_tagger: Option<NeTagger>,          // default: NeTagger::builtin()
    romanization: Option<RomanizationMap>, // default: disabled
    abbrev_map: Option<AbbrevMap>,        // default: disabled
    number_normalize: Option<bool>,       // default: true
    soundex: Option<SoundexAlgorithm>,    // default: disabled
}
79
80impl FtsTokenizerBuilder {
81 pub fn stopwords(mut self, s: StopwordSet) -> Self {
83 self.stopwords = Some(s);
84 self
85 }
86
87 pub fn synonyms(mut self, m: SynonymMap) -> Self {
89 self.synonyms = Some(m);
90 self
91 }
92
93 pub fn ngram_size(mut self, n: usize) -> Self {
97 self.ngram_size = Some(n);
98 self
99 }
100
101 pub fn pos_tagger(mut self, t: PosTagger) -> Self {
103 self.pos_tagger = Some(t);
104 self
105 }
106
107 pub fn ne_tagger(mut self, t: NeTagger) -> Self {
109 self.ne_tagger = Some(t);
110 self
111 }
112
113 pub fn romanization(mut self, m: RomanizationMap) -> Self {
121 self.romanization = Some(m);
122 self
123 }
124
125 pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
134 self.abbrev_map = Some(m);
135 self
136 }
137
138 pub fn number_normalize(mut self, v: bool) -> Self {
151 self.number_normalize = Some(v);
152 self
153 }
154
155 pub fn soundex(mut self, algo: SoundexAlgorithm) -> Self {
168 self.soundex = Some(algo);
169 self
170 }
171
172 pub fn build(self) -> FtsTokenizer {
174 FtsTokenizer {
175 tokenizer: Tokenizer::new(),
176 stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
177 synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
178 ngram_size: self.ngram_size.unwrap_or(3),
179 pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
180 ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
181 romanization: self.romanization,
182 abbrev_map: self.abbrev_map,
183 number_normalize: self.number_normalize.unwrap_or(true),
184 soundex: self.soundex,
185 }
186 }
187}
188
/// Full-text-search tokenizer pipeline for Thai text: normalization,
/// optional abbreviation expansion, segmentation, named-entity tagging,
/// and per-token enrichment (stopwords, synonyms, romanization, soundex,
/// number normalization, n-grams, POS tags).
pub struct FtsTokenizer {
    tokenizer: Tokenizer,                  // normalization + segmentation
    stopwords: StopwordSet,                // flags `is_stop`
    synonyms: SynonymMap,                  // dictionary synonym expansion
    ngram_size: usize,                     // n-gram size for Unknown tokens; 0 disables
    pos_tagger: PosTagger,                 // POS tags for Thai tokens
    ne_tagger: NeTagger,                   // named-entity merge/tagging
    romanization: Option<RomanizationMap>, // None = no romanized synonyms
    abbrev_map: Option<AbbrevMap>,         // None = no abbreviation expansion
    number_normalize: bool,                // Thai digit/word -> ASCII synonyms
    soundex: Option<SoundexAlgorithm>,     // None = no soundex synonyms
}
215
216impl FtsTokenizer {
217 pub fn new() -> Self {
219 FtsTokenizerBuilder::default().build()
220 }
221
222 pub fn builder() -> FtsTokenizerBuilder {
224 FtsTokenizerBuilder::default()
225 }
226
227 pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
238 let normalized = self.tokenizer.normalize(text);
239 let expanded = match self.abbrev_map.as_ref() {
242 Some(am) => am.expand_text(&normalized),
243 None => normalized,
244 };
245 let raw_tokens = self
246 .ne_tagger
247 .tag_tokens(self.tokenizer.segment(&expanded), &expanded);
248
249 let mut result = Vec::with_capacity(raw_tokens.len());
250 let mut position = 0usize;
251
252 for token in &raw_tokens {
253 if token.kind == TokenKind::Whitespace {
254 continue;
255 }
256
257 let is_stop = self.stopwords.contains(token.text);
258 let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
259 let mut synonyms = self
260 .synonyms
261 .expand(token.text)
262 .map(|s| s.to_vec())
263 .unwrap_or_default();
264 if is_thai_or_named {
265 if let Some(ref rom) = self.romanization {
266 if let Some(rtgs) = rom.romanize(token.text) {
267 synonyms.push(String::from(rtgs));
268 }
269 }
270 if let Some(algo) = self.soundex {
271 let code = soundex(token.text, algo);
272 if !code.chars().all(|c| c == '0') {
273 synonyms.push(code);
274 }
275 }
276 }
277 if self.number_normalize {
278 match token.kind {
279 TokenKind::Number => {
281 let ascii = thai_digits_to_ascii(token.text);
282 if ascii != token.text {
283 synonyms.push(ascii);
284 }
285 }
286 TokenKind::Thai => {
288 if let Some(decimal) = thai_word_to_decimal(token.text) {
289 synonyms.push(decimal);
290 }
291 }
292 _ => {}
293 }
294 }
295 let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
296 char_ngrams(token.text, self.ngram_size)
297 .map(String::from)
298 .collect()
299 } else {
300 Vec::new()
301 };
302 let ne = if let TokenKind::Named(k) = token.kind {
303 Some(k)
304 } else {
305 None
306 };
307 let pos = if token.kind == TokenKind::Thai {
308 self.pos_tagger.tag(token.text)
309 } else {
310 None
311 };
312
313 result.push(FtsToken {
314 text: String::from(token.text),
315 position,
316 kind: token.kind,
317 is_stop,
318 synonyms,
319 trigrams,
320 pos,
321 ne,
322 });
323
324 position += 1;
325 }
326
327 result
328 }
329
330 pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
335 self.segment_for_fts(text)
336 .into_iter()
337 .filter(|t| !t.is_stop)
338 .collect()
339 }
340
341 pub fn lexemes(&self, text: &str) -> Vec<String> {
347 let tokens = self.index_tokens(text);
348 let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
349 for t in tokens {
350 out.push(t.text.clone());
351 out.extend(t.synonyms);
352 out.extend(t.trigrams);
353 }
354 out
355 }
356}
357
358impl Default for FtsTokenizer {
359 fn default() -> Self {
360 Self::new()
361 }
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    // Convenience: tokenizer with all default components.
    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    // Positions must stay dense (0, 1, 2, ...) even though whitespace
    // tokens are skipped during enumeration.
    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    // "กับ" ("with") is expected in the builtin stopword list.
    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    // Concatenating token texts must reproduce the normalized input:
    // segmentation is lossless (whitespace aside, input has none here).
    #[test]
    fn text_is_reconstructable() {
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    // Synonyms from a custom TSV map should be attached to the matching
    // token. The assertion is conditional on the token existing because
    // segmentation of "คอม" depends on the dictionary.
    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

    // "กิ" alone should segment as Unknown; with ngram_size=2 every
    // multi-char Unknown token must carry bigrams.
    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    // Filtering must not renumber: every indexed token keeps the position
    // it had in the full segmentation.
    #[test]
    fn index_tokens_preserves_positions() {
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.contains(&String::from("กับ")));
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

    // "กรุงเทพ" (Bangkok) may be segmented as multiple pieces; the NE
    // tagger is expected to merge them into one Named token.
    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    // NE merging must not lose characters: reconstruction still holds.
    #[test]
    fn multi_token_ne_reconstructable() {
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    // An unconfigured builder must behave exactly like `new()`.
    #[test]
    fn builder_default_equals_new() {
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }

    // Thai digits ๑๒๓ should get the ASCII form "123" as a synonym.
    #[test]
    fn thai_digit_token_gets_ascii_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        let t = num.unwrap();
        assert!(
            t.synonyms.contains(&String::from("123")),
            "Thai digit token should have ASCII synonym, got {:?}",
            t.synonyms
        );
    }

    // Conversion must be skipped when it would produce the token itself.
    #[test]
    fn ascii_digit_token_has_no_extra_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("123");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "ASCII digit token should not duplicate itself as a synonym"
        );
    }

    // "หนึ่งร้อย" ("one hundred") spelled out should yield "100" somewhere
    // in the synonyms (exact token split depends on the dictionary).
    #[test]
    fn thai_number_word_gets_decimal_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("หนึ่งร้อย");
        let has_hundred = tokens
            .iter()
            .any(|t| t.synonyms.contains(&String::from("100")));
        assert!(
            has_hundred,
            "expected a token with decimal synonym '100', tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.synonyms))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn number_normalize_false_disables_conversion() {
        let fts = FtsTokenizer::builder()
            .number_normalize(false)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "number_normalize=false should suppress ASCII synonym"
        );
    }

    #[test]
    fn mixed_thai_digit_in_context() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected Number token in mixed string");
        assert!(
            num.unwrap().synonyms.contains(&String::from("100")),
            "expected ASCII synonym '100' for ๑๐๐"
        );
    }

    // "ก.ค." (July) should be expanded before segmentation, so the dots
    // never surface as tokens.
    #[test]
    fn abbrev_map_expands_before_segmentation() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by abbrev expansion, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_disabled_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(
            texts.contains(&"."),
            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
        );
    }

    // The soundex synonym must match the raw lk82() output for the token.
    #[test]
    fn soundex_lk82_appended_to_thai_synonyms() {
        use crate::soundex::lk82;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิน");
        let t = tokens.iter().find(|t| t.text == "กิน");
        assert!(t.is_some(), "expected token 'กิน'");
        let expected_code = lk82("กิน");
        assert!(
            t.unwrap().synonyms.contains(&expected_code),
            "expected lk82 code '{expected_code}' in synonyms, got {:?}",
            t.unwrap().synonyms
        );
    }

    #[test]
    fn soundex_not_emitted_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("กินข้าว");
        for t in &tokens {
            for syn in &t.synonyms {
                // Heuristic: treat a 4-byte ASCII-alphanumeric string as a
                // soundex-like code. NOTE(review): assumes codes are always
                // 4 ASCII chars — confirm against the soundex module.
                let looks_like_soundex =
                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
                assert!(
                    !looks_like_soundex,
                    "unexpected soundex-like synonym '{}' on token '{}'",
                    syn, t.text
                );
            }
        }
    }

    // Homophone-ish words should collapse to the same lk82 code, enabling
    // phonetic matching through the index.
    #[test]
    fn soundex_same_sounding_words_share_code_in_index() {
        use crate::soundex::lk82;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let code = lk82("กาน");
        for word in &["กาน", "ขาน", "คาน"] {
            let tokens = fts.segment_for_fts(word);
            let t = tokens.first().expect("expected at least one token");
            assert!(
                t.synonyms.contains(&code),
                "'{word}' should carry lk82 code '{code}', got {:?}",
                t.synonyms
            );
        }
    }

    #[test]
    fn soundex_not_emitted_for_non_thai_tokens() {
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Lk82)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("hello 123");
        for t in &tokens {
            for syn in &t.synonyms {
                // Same 4-ASCII-alphanumeric heuristic as above.
                let looks_like_soundex =
                    syn.len() == 4 && syn.chars().all(|c| c.is_ascii_alphanumeric());
                assert!(
                    !looks_like_soundex,
                    "non-Thai token '{}' should not get a soundex synonym, got '{syn}'",
                    t.text
                );
            }
        }
    }

    #[test]
    fn soundex_udom83_appended() {
        use crate::soundex::udom83;
        let fts = FtsTokenizer::builder()
            .soundex(SoundexAlgorithm::Udom83)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิน");
        let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
        let expected = udom83("กิน");
        assert!(
            t.synonyms.contains(&expected),
            "expected udom83 code '{expected}' in synonyms, got {:?}",
            t.synonyms
        );
    }

    // "พ.ศ." (Buddhist Era) followed by a year: expansion consumes the
    // dots and injects the full word before segmentation.
    #[test]
    fn abbrev_expansion_date_sentence() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("พ.ศ.2567");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by expansion, got: {texts:?}"
        );
    }
}