use alloc::string::String;
use alloc::vec::Vec;

use crate::abbrev::AbbrevMap;
use crate::ne::NeTagger;
use crate::ngram::char_ngrams;
use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
use crate::pos::{PosTag, PosTagger};
use crate::romanizer::RomanizationMap;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::{NamedEntityKind, TokenKind};
use crate::Tokenizer;

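/// A single token produced by the FTS pipeline, carrying the metadata an
/// indexer needs: position, token kind, stopword flag, and expansions.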
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
    /// Token text after normalization (and abbreviation expansion, if enabled).
    pub text: String,
    /// Zero-based position among the emitted (non-whitespace) tokens.
    pub position: usize,
    /// Token class assigned during segmentation and NE tagging.
    pub kind: TokenKind,
    /// Whether the text is in the configured stopword set.
    pub is_stop: bool,
    /// Synonyms, plus romanized and number-normalized variants when enabled.
    pub synonyms: Vec<String>,
    /// Character n-grams emitted for `Unknown` tokens; empty otherwise.
    pub trigrams: Vec<String>,
    /// Part-of-speech tag, assigned only to Thai tokens.
    pub pos: Option<PosTag>,
    /// Named-entity kind, if the token was tagged as one.
    pub ne: Option<NamedEntityKind>,
}

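/// Builder for [`FtsTokenizer`]. Every unset component falls back to a
/// default in [`FtsTokenizerBuilder::build`].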
#[derive(Default)]
pub struct FtsTokenizerBuilder {
    stopwords: Option<StopwordSet>,
    synonyms: Option<SynonymMap>,
    ngram_size: Option<usize>,
    pos_tagger: Option<PosTagger>,
    ne_tagger: Option<NeTagger>,
    romanization: Option<RomanizationMap>,
    abbrev_map: Option<AbbrevMap>,
    number_normalize: Option<bool>,
}

impl FtsTokenizerBuilder {
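    /// Replaces the builtin stopword set.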
    pub fn stopwords(mut self, s: StopwordSet) -> Self {
        self.stopwords = Some(s);
        self
    }

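    /// Sets the synonym map used for expansion; the default map is empty.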
    pub fn synonyms(mut self, m: SynonymMap) -> Self {
        self.synonyms = Some(m);
        self
    }

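    /// Sets the n-gram size applied to `Unknown` tokens. `0` disables
    /// n-gram emission; the default is 3 (trigrams).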
    pub fn ngram_size(mut self, n: usize) -> Self {
        self.ngram_size = Some(n);
        self
    }

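    /// Replaces the builtin part-of-speech tagger.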
    pub fn pos_tagger(mut self, t: PosTagger) -> Self {
        self.pos_tagger = Some(t);
        self
    }

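    /// Replaces the builtin named-entity tagger.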
    pub fn ne_tagger(mut self, t: NeTagger) -> Self {
        self.ne_tagger = Some(t);
        self
    }

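    /// Enables romanization: the romanized form of each Thai or named
    /// token is added to its synonyms. Disabled by default.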
    pub fn romanization(mut self, m: RomanizationMap) -> Self {
        self.romanization = Some(m);
        self
    }

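    /// Enables abbreviation expansion on the normalized text before
    /// segmentation. Disabled by default.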
    pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
        self.abbrev_map = Some(m);
        self
    }

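    /// Toggles number normalization (Thai digits to ASCII, Thai number
    /// words to decimal strings, both emitted as synonyms). Enabled by
    /// default.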
    pub fn number_normalize(mut self, v: bool) -> Self {
        self.number_normalize = Some(v);
        self
    }

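    /// Finalizes the builder, substituting defaults for unset components.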
    pub fn build(self) -> FtsTokenizer {
        FtsTokenizer {
            tokenizer: Tokenizer::new(),
            stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
            synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
            ngram_size: self.ngram_size.unwrap_or(3),
            pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
            ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
            romanization: self.romanization,
            abbrev_map: self.abbrev_map,
            number_normalize: self.number_normalize.unwrap_or(true),
        }
    }
}

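/// A tokenizer tailored for full-text-search indexing. It normalizes and
/// segments text, flags stopwords, attaches synonym, romanization, and
/// number expansions, emits n-grams for unknown spans, and tags part of
/// speech and named entities.
///
/// A minimal sketch of typical use (crate path elided):
///
/// ```ignore
/// let fts = FtsTokenizer::new();
/// // Index only content-bearing tokens; stopwords are filtered out.
/// for token in fts.index_tokens("กินข้าวกับปลา") {
///     let _ = (&token.text, &token.synonyms, &token.trigrams);
/// }
/// ```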
pub struct FtsTokenizer {
    tokenizer: Tokenizer,
    stopwords: StopwordSet,
    synonyms: SynonymMap,
    ngram_size: usize,
    pos_tagger: PosTagger,
    ne_tagger: NeTagger,
    romanization: Option<RomanizationMap>,
    abbrev_map: Option<AbbrevMap>,
    number_normalize: bool,
}

impl FtsTokenizer {
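    /// Creates a tokenizer with all default components; equivalent to
    /// `FtsTokenizerBuilder::default().build()`.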
    pub fn new() -> Self {
        FtsTokenizerBuilder::default().build()
    }

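    /// Returns a builder for customizing the pipeline components.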
    pub fn builder() -> FtsTokenizerBuilder {
        FtsTokenizerBuilder::default()
    }

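    /// Runs the full pipeline: normalization, optional abbreviation
    /// expansion, segmentation, named-entity tagging, and per-token
    /// enrichment (stopword flag, synonyms, n-grams, POS, NE). Whitespace
    /// tokens are dropped; `position` is sequential over the tokens that
    /// remain.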
    pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
        // Normalize first so abbreviation expansion and segmentation see
        // canonical text.
        let normalized = self.tokenizer.normalize(text);
        let expanded = match self.abbrev_map.as_ref() {
            Some(am) => am.expand_text(&normalized),
            None => normalized,
        };
        // Segment, then let the NE tagger merge multi-token named entities.
        let raw_tokens = self
            .ne_tagger
            .tag_tokens(self.tokenizer.segment(&expanded), &expanded);

        let mut result = Vec::with_capacity(raw_tokens.len());
        let mut position = 0usize;

        for token in &raw_tokens {
            if token.kind == TokenKind::Whitespace {
                continue;
            }

            let is_stop = self.stopwords.contains(token.text);
            let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
            // Synonyms from the map, plus romanized and numeric variants.
            let mut synonyms = self
                .synonyms
                .expand(token.text)
                .map(|s| s.to_vec())
                .unwrap_or_default();
            if is_thai_or_named {
                if let Some(ref rom) = self.romanization {
                    if let Some(rtgs) = rom.romanize(token.text) {
                        synonyms.push(String::from(rtgs));
                    }
                }
            }
            if self.number_normalize {
                match token.kind {
                    TokenKind::Number => {
                        // Thai digits gain an ASCII-digit synonym.
                        let ascii = thai_digits_to_ascii(token.text);
                        if ascii != token.text {
                            synonyms.push(ascii);
                        }
                    }
                    TokenKind::Thai => {
                        // Thai number words gain a decimal synonym.
                        if let Some(decimal) = thai_word_to_decimal(token.text) {
                            synonyms.push(decimal);
                        }
                    }
                    _ => {}
                }
            }
            // N-grams are a recall fallback for spans the dictionary missed.
            let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
                char_ngrams(token.text, self.ngram_size)
                    .map(String::from)
                    .collect()
            } else {
                Vec::new()
            };
            let ne = if let TokenKind::Named(k) = token.kind {
                Some(k)
            } else {
                None
            };
            let pos = if token.kind == TokenKind::Thai {
                self.pos_tagger.tag(token.text)
            } else {
                None
            };

            result.push(FtsToken {
                text: String::from(token.text),
                position,
                kind: token.kind,
                is_stop,
                synonyms,
                trigrams,
                pos,
                ne,
            });

            position += 1;
        }

        result
    }

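    /// Like [`Self::segment_for_fts`], but with stopword tokens removed.
    /// Positions from the full segmentation are preserved.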
    pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
        self.segment_for_fts(text)
            .into_iter()
            .filter(|t| !t.is_stop)
            .collect()
    }

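    /// Flattens [`Self::index_tokens`] into plain strings: each token's
    /// text followed by its synonyms and n-grams, stopwords excluded.
    ///
    /// ```ignore
    /// let lexemes = FtsTokenizer::new().lexemes("กินข้าวกับปลา");
    /// assert!(!lexemes.contains(&String::from("กับ"))); // stopword dropped
    /// ```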
    pub fn lexemes(&self, text: &str) -> Vec<String> {
        let tokens = self.index_tokens(text);
        let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
        for t in tokens {
            out.push(t.text);
            out.extend(t.synonyms);
            out.extend(t.trigrams);
        }
        out
    }
}

impl Default for FtsTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::stopwords::StopwordSet;
    use crate::synonym::SynonymMap;

    fn fts() -> FtsTokenizer {
        FtsTokenizer::new()
    }

    #[test]
    fn empty_input_returns_empty() {
        assert!(fts().segment_for_fts("").is_empty());
    }

    #[test]
    fn whitespace_tokens_excluded() {
        let tokens = fts().segment_for_fts("กิน ข้าว");
        assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
    }

    #[test]
    fn positions_are_sequential() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        for (i, t) in tokens.iter().enumerate() {
            assert_eq!(t.position, i, "position mismatch at index {i}");
        }
    }

    #[test]
    fn known_stopword_is_tagged() {
        let tokens = fts().segment_for_fts("กินข้าวกับปลา");
        let kap = tokens.iter().find(|t| t.text == "กับ");
        assert!(kap.is_some(), "expected 'กับ' token");
        assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
    }

    #[test]
    fn content_words_not_tagged_as_stop() {
        let tokens = fts().segment_for_fts("โรงพยาบาล");
        for t in &tokens {
            assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
        }
    }

    #[test]
    fn text_is_reconstructable() {
        let fts = fts();
        let text = "กินข้าวกับปลา";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn synonym_expansion_attached() {
        let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
        let fts = FtsTokenizer::builder()
            .synonyms(synonyms)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("คอม");
        let t = tokens.iter().find(|t| t.text == "คอม");
        if let Some(tok) = t {
            assert!(
                tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
                "expected synonym expansion, got {:?}",
                tok.synonyms
            );
        }
    }

    #[test]
    fn no_synonyms_when_map_empty() {
        let tokens = fts().segment_for_fts("กินข้าว");
        for t in &tokens {
            assert!(t.synonyms.is_empty());
        }
    }

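    // N-gram fallback for spans the segmenter could not identify.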
    #[test]
    fn unknown_token_gets_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(2)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กิ");
        let unknown: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
            .collect();
        assert!(
            !unknown.is_empty(),
            "expected at least one multi-char Unknown token for 'กิ'"
        );
        for u in &unknown {
            assert!(
                !u.trigrams.is_empty(),
                "unknown token '{}' ({} chars) should have bigrams",
                u.text,
                u.text.chars().count()
            );
        }
    }

    #[test]
    fn known_thai_token_has_no_trigrams() {
        let tokens = fts().segment_for_fts("กิน");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.trigrams.is_empty(),
                    "known Thai token '{}' should not have trigrams",
                    t.text
                );
            }
        }
    }

    #[test]
    fn ngram_size_zero_disables_trigrams() {
        let fts = FtsTokenizer::builder()
            .ngram_size(0)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("กขคง");
        for t in &tokens {
            assert!(t.trigrams.is_empty());
        }
    }

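    // Index-level helpers: stopword filtering and lexeme flattening.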
    #[test]
    fn index_tokens_excludes_stopwords() {
        let tokens = fts().index_tokens("กินข้าวกับปลา");
        assert!(tokens.iter().all(|t| !t.is_stop));
    }

    #[test]
    fn index_tokens_preserves_positions() {
        let all = fts().segment_for_fts("กินข้าวกับปลา");
        let indexed = fts().index_tokens("กินข้าวกับปลา");
        for t in &indexed {
            assert!(
                all.iter().any(|a| a.position == t.position),
                "indexed token at position {} not found in full token list",
                t.position
            );
        }
    }

    #[test]
    fn lexemes_returns_non_stop_texts() {
        let lexemes = fts().lexemes("กินข้าวกับปลา");
        assert!(!lexemes.contains(&String::from("กับ")));
        assert!(
            lexemes
                .iter()
                .any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
            "expected content words in lexemes: {lexemes:?}"
        );
    }

    #[test]
    fn lexemes_empty_input_is_empty() {
        assert!(fts().lexemes("").is_empty());
    }

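    // Multi-token named entities through the full pipeline.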
    #[test]
    fn multi_token_ne_merged_in_pipeline() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ไปกรุงเทพ");
        let named: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Named(_)))
            .collect();
        assert!(
            named.iter().any(|t| t.text == "กรุงเทพ"),
            "กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.kind))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn multi_token_ne_reconstructable() {
        let fts = FtsTokenizer::new();
        let text = "ไปกรุงเทพ";
        let normalized = fts.tokenizer.normalize(text);
        let tokens = fts.segment_for_fts(text);
        let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(rebuilt, normalized);
    }

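    // Builder configuration.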
    #[test]
    fn builder_custom_stopwords() {
        let stops = StopwordSet::from_text("กิน\n");
        let fts = FtsTokenizer::builder().stopwords(stops).build();
        let tokens = fts.segment_for_fts("กินข้าว");
        let gin = tokens.iter().find(|t| t.text == "กิน");
        if let Some(t) = gin {
            assert!(t.is_stop, "'กิน' should be stop with custom list");
        }
    }

    #[test]
    fn builder_default_equals_new() {
        let a = FtsTokenizer::new().lexemes("กินข้าว");
        let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
        assert_eq!(a, b);
    }

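    // Number normalization: Thai digits and Thai number words.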
    #[test]
    fn thai_digit_token_gets_ascii_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        let t = num.unwrap();
        assert!(
            t.synonyms.contains(&String::from("123")),
            "Thai digit token should have ASCII synonym, got {:?}",
            t.synonyms
        );
    }

    #[test]
    fn ascii_digit_token_has_no_extra_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("123");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected a Number token");
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "ASCII digit token should not duplicate itself as a synonym"
        );
    }

    #[test]
    fn thai_number_word_gets_decimal_synonym() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("หนึ่งร้อย");
        let has_hundred = tokens
            .iter()
            .any(|t| t.synonyms.contains(&String::from("100")));
        assert!(
            has_hundred,
            "expected a token with decimal synonym '100', tokens: {:?}",
            tokens
                .iter()
                .map(|t| (&t.text, &t.synonyms))
                .collect::<alloc::vec::Vec<_>>()
        );
    }

    #[test]
    fn number_normalize_false_disables_conversion() {
        let fts = FtsTokenizer::builder()
            .number_normalize(false)
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("๑๒๓");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert!(
            !num.unwrap().synonyms.contains(&String::from("123")),
            "number_normalize=false should suppress ASCII synonym"
        );
    }

    #[test]
    fn mixed_thai_digit_in_context() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some(), "expected Number token in mixed string");
        assert!(
            num.unwrap().synonyms.contains(&String::from("100")),
            "expected ASCII synonym '100' for ๑๐๐"
        );
    }

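    // Abbreviation expansion ahead of segmentation.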
    #[test]
    fn abbrev_map_expands_before_segmentation() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
            "expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by abbrev expansion, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_disabled_by_default() {
        let fts = FtsTokenizer::new();
        let tokens = fts.segment_for_fts("ก.ค.");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(
            texts.contains(&"."),
            "without abbrev expansion, dots should remain as tokens, got: {texts:?}"
        );
    }

    #[test]
    fn abbrev_expansion_date_sentence() {
        use crate::abbrev::AbbrevMap;
        let fts = FtsTokenizer::builder()
            .abbrevs(AbbrevMap::builtin())
            .stopwords(StopwordSet::from_text(""))
            .build();
        let tokens = fts.segment_for_fts("พ.ศ.2567");
        let texts: alloc::vec::Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        let joined: String = texts.concat();
        assert!(
            joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
            "expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
        );
        assert!(
            !texts.contains(&"."),
            "dots should be consumed by expansion, got: {texts:?}"
        );
    }
}