use alloc::string::String;
use alloc::vec::Vec;
use crate::abbrev::AbbrevMap;
use crate::ne::NeTagger;
use crate::ngram::char_ngrams;
use crate::number::{thai_digits_to_ascii, thai_word_to_decimal};
use crate::pos::{PosTag, PosTagger};
use crate::romanizer::RomanizationMap;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
use crate::token::{NamedEntityKind, TokenKind};
use crate::Tokenizer;
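/// A single token produced by [`FtsTokenizer::segment_for_fts`].
///
/// `position` counts only emitted (non-whitespace) tokens. `synonyms`
/// collects dictionary expansions, romanizations, and normalized number
/// forms, while `trigrams` holds fallback character n-grams that are
/// generated only for [`TokenKind::Unknown`] tokens.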
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsToken {
pub text: String,
pub position: usize,
pub kind: TokenKind,
pub is_stop: bool,
pub synonyms: Vec<String>,
pub trigrams: Vec<String>,
pub pos: Option<PosTag>,
pub ne: Option<NamedEntityKind>,
}
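/// Builder for [`FtsTokenizer`]. Every option is optional; unset fields
/// fall back to the same defaults as [`FtsTokenizer::new`]: builtin
/// stopwords, an empty synonym map, n-gram size 3, builtin POS and NE
/// taggers, number normalization enabled, and no romanization or
/// abbreviation expansion.
///
/// Illustrative sketch (doctest ignored; exact output depends on the
/// builtin dictionaries):
///
/// ```ignore
/// let fts = FtsTokenizer::builder()
///     .stopwords(StopwordSet::from_text("กับ\n"))
///     .ngram_size(2)
///     .number_normalize(true)
///     .build();
/// let lexemes = fts.lexemes("กินข้าวกับปลา");
/// ```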
#[derive(Default)]
pub struct FtsTokenizerBuilder {
stopwords: Option<StopwordSet>,
synonyms: Option<SynonymMap>,
ngram_size: Option<usize>,
pos_tagger: Option<PosTagger>,
ne_tagger: Option<NeTagger>,
romanization: Option<RomanizationMap>,
abbrev_map: Option<AbbrevMap>,
number_normalize: Option<bool>,
}
impl FtsTokenizerBuilder {
pub fn stopwords(mut self, s: StopwordSet) -> Self {
self.stopwords = Some(s);
self
}
pub fn synonyms(mut self, m: SynonymMap) -> Self {
self.synonyms = Some(m);
self
}
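/// Character n-gram size used for [`TokenKind::Unknown`] fallback
/// tokens; `0` disables n-gram generation entirely. Defaults to 3.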
pub fn ngram_size(mut self, n: usize) -> Self {
self.ngram_size = Some(n);
self
}
pub fn pos_tagger(mut self, t: PosTagger) -> Self {
self.pos_tagger = Some(t);
self
}
pub fn ne_tagger(mut self, t: NeTagger) -> Self {
self.ne_tagger = Some(t);
self
}
pub fn romanization(mut self, m: RomanizationMap) -> Self {
self.romanization = Some(m);
self
}
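/// Expand abbreviations (e.g. "ก.ค." to "กรกฎาคม") on the normalized
/// text before segmentation; disabled unless set.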
pub fn abbrevs(mut self, m: AbbrevMap) -> Self {
self.abbrev_map = Some(m);
self
}
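/// Attach ASCII synonyms for Thai digits and decimal synonyms for Thai
/// number words. Defaults to `true`.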
pub fn number_normalize(mut self, v: bool) -> Self {
self.number_normalize = Some(v);
self
}
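/// Consume the builder, filling every unset option with its default.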
pub fn build(self) -> FtsTokenizer {
FtsTokenizer {
tokenizer: Tokenizer::new(),
stopwords: self.stopwords.unwrap_or_else(StopwordSet::builtin),
synonyms: self.synonyms.unwrap_or_else(SynonymMap::empty),
ngram_size: self.ngram_size.unwrap_or(3),
pos_tagger: self.pos_tagger.unwrap_or_else(PosTagger::builtin),
ne_tagger: self.ne_tagger.unwrap_or_else(NeTagger::builtin),
romanization: self.romanization,
abbrev_map: self.abbrev_map,
number_normalize: self.number_normalize.unwrap_or(true),
}
}
}
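/// Thai full-text-search tokenizer: a [`Tokenizer`] wrapped with
/// stopword marking, synonym expansion, fallback n-grams, POS and
/// named-entity tagging, and optional romanization, abbreviation
/// expansion, and number normalization.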
pub struct FtsTokenizer {
tokenizer: Tokenizer,
stopwords: StopwordSet,
synonyms: SynonymMap,
ngram_size: usize,
pos_tagger: PosTagger,
ne_tagger: NeTagger,
romanization: Option<RomanizationMap>,
abbrev_map: Option<AbbrevMap>,
number_normalize: bool,
}
impl FtsTokenizer {
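/// Create a tokenizer with all components at their defaults.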
pub fn new() -> Self {
FtsTokenizerBuilder::default().build()
}
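/// Start building a customized tokenizer; see [`FtsTokenizerBuilder`].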
pub fn builder() -> FtsTokenizerBuilder {
FtsTokenizerBuilder::default()
}
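/// Run the full pipeline over `text` and return every non-whitespace
/// token with its enrichment attached. Processing order: normalize,
/// expand abbreviations (if configured), segment, merge and tag named
/// entities, then annotate each token.
///
/// Illustrative sketch (doctest ignored; segmentation depends on the
/// builtin dictionaries):
///
/// ```ignore
/// let fts = FtsTokenizer::new();
/// let tokens = fts.segment_for_fts("กินข้าวกับปลา");
/// // Whitespace never appears in the output and positions are sequential.
/// assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
/// ```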
pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken> {
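// Normalize first so abbreviation expansion and segmentation operate
// on canonical text.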
let normalized = self.tokenizer.normalize(text);
let expanded = match self.abbrev_map.as_ref() {
Some(am) => am.expand_text(&normalized),
None => normalized,
};
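// Segment, then let the NE tagger merge adjacent tokens into named
// entities and tag them.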
let raw_tokens = self
.ne_tagger
.tag_tokens(self.tokenizer.segment(&expanded), &expanded);
let mut result = Vec::with_capacity(raw_tokens.len());
let mut position = 0usize;
for token in &raw_tokens {
if token.kind == TokenKind::Whitespace {
continue;
}
let is_stop = self.stopwords.contains(token.text);
let is_thai_or_named = matches!(token.kind, TokenKind::Thai | TokenKind::Named(_));
let mut synonyms = self
.synonyms
.expand(token.text)
.map(|s| s.to_vec())
.unwrap_or_default();
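// Add the romanized (RTGS) form as an extra searchable synonym.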
if is_thai_or_named {
if let Some(ref rom) = self.romanization {
if let Some(rtgs) = rom.romanize(token.text) {
synonyms.push(String::from(rtgs));
}
}
}
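// Number normalization: Thai digits gain an ASCII synonym, Thai
// number words gain their decimal value.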
if self.number_normalize {
match token.kind {
TokenKind::Number => {
let ascii = thai_digits_to_ascii(token.text);
if ascii != token.text {
synonyms.push(ascii);
}
}
TokenKind::Thai => {
if let Some(decimal) = thai_word_to_decimal(token.text) {
synonyms.push(decimal);
}
}
_ => {}
}
}
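// Tokens the dictionary does not recognize fall back to character
// n-grams so they remain searchable via substring-style matching.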
let trigrams = if token.kind == TokenKind::Unknown && self.ngram_size > 0 {
char_ngrams(token.text, self.ngram_size)
.map(String::from)
.collect()
} else {
Vec::new()
};
let ne = if let TokenKind::Named(k) = token.kind {
Some(k)
} else {
None
};
let pos = if token.kind == TokenKind::Thai {
self.pos_tagger.tag(token.text)
} else {
None
};
result.push(FtsToken {
text: String::from(token.text),
position,
kind: token.kind,
is_stop,
synonyms,
trigrams,
pos,
ne,
});
position += 1;
}
result
}
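/// [`segment_for_fts`](Self::segment_for_fts) with stopword tokens
/// filtered out; surviving tokens keep their original positions.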
pub fn index_tokens(&self, text: &str) -> Vec<FtsToken> {
self.segment_for_fts(text)
.into_iter()
.filter(|t| !t.is_stop)
.collect()
}
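/// Flatten the indexable tokens into plain strings: each token's text
/// followed by its synonyms and fallback n-grams, ready to feed an
/// inverted index.
///
/// Illustrative sketch (doctest ignored):
///
/// ```ignore
/// let fts = FtsTokenizer::new();
/// // Stopwords such as "กับ" are filtered before flattening.
/// let lexemes = fts.lexemes("กินข้าวกับปลา");
/// assert!(!lexemes.contains(&String::from("กับ")));
/// ```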
pub fn lexemes(&self, text: &str) -> Vec<String> {
let tokens = self.index_tokens(text);
let mut out: Vec<String> = Vec::with_capacity(tokens.len() * 2);
for t in tokens {
out.push(t.text);
out.extend(t.synonyms);
out.extend(t.trigrams);
}
out
}
}
impl Default for FtsTokenizer {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::stopwords::StopwordSet;
use crate::synonym::SynonymMap;
fn fts() -> FtsTokenizer {
FtsTokenizer::new()
}
#[test]
fn empty_input_returns_empty() {
assert!(fts().segment_for_fts("").is_empty());
}
#[test]
fn whitespace_tokens_excluded() {
let tokens = fts().segment_for_fts("กิน ข้าว");
assert!(tokens.iter().all(|t| t.kind != TokenKind::Whitespace));
}
#[test]
fn positions_are_sequential() {
let tokens = fts().segment_for_fts("กินข้าวกับปลา");
for (i, t) in tokens.iter().enumerate() {
assert_eq!(t.position, i, "position mismatch at index {i}");
}
}
#[test]
fn known_stopword_is_tagged() {
let tokens = fts().segment_for_fts("กินข้าวกับปลา");
let kap = tokens.iter().find(|t| t.text == "กับ");
assert!(kap.is_some(), "expected 'กับ' token");
assert!(kap.unwrap().is_stop, "'กับ' should be tagged as stopword");
}
#[test]
fn content_words_not_tagged_as_stop() {
let tokens = fts().segment_for_fts("โรงพยาบาล");
for t in &tokens {
assert!(!t.is_stop, "'{}' should not be a stopword", t.text);
}
}
#[test]
fn text_is_reconstructable() {
let fts = fts();
let text = "กินข้าวกับปลา";
let normalized = fts.tokenizer.normalize(text);
let tokens = fts.segment_for_fts(text);
let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(rebuilt, normalized);
}
#[test]
fn synonym_expansion_attached() {
let synonyms = SynonymMap::from_tsv("คอม\tคอมพิวเตอร์\tcomputer\n");
let fts = FtsTokenizer::builder()
.synonyms(synonyms)
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("คอม");
let t = tokens.iter().find(|t| t.text == "คอม");
if let Some(tok) = t {
assert!(
tok.synonyms.contains(&String::from("คอมพิวเตอร์")),
"expected synonym expansion, got {:?}",
tok.synonyms
);
}
}
#[test]
fn no_synonyms_when_map_empty() {
let tokens = fts().segment_for_fts("กินข้าว");
for t in &tokens {
assert!(t.synonyms.is_empty());
}
}
#[test]
fn unknown_token_gets_ngrams() {
let fts = FtsTokenizer::builder()
.ngram_size(2)
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("กิ");
let unknown: Vec<_> = tokens
.iter()
.filter(|t| t.kind == TokenKind::Unknown && t.text.chars().count() >= 2)
.collect();
assert!(
!unknown.is_empty(),
"expected at least one multi-char Unknown token for 'กิ'"
);
for u in &unknown {
assert!(
!u.trigrams.is_empty(),
"unknown token '{}' ({} chars) should have bigrams",
u.text,
u.text.chars().count()
);
}
}
#[test]
fn known_thai_token_has_no_trigrams() {
let tokens = fts().segment_for_fts("กิน");
for t in &tokens {
if t.kind == TokenKind::Thai {
assert!(
t.trigrams.is_empty(),
"known Thai token '{}' should not have trigrams",
t.text
);
}
}
}
#[test]
fn ngram_size_zero_disables_trigrams() {
let fts = FtsTokenizer::builder()
.ngram_size(0)
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("กขคง");
for t in &tokens {
assert!(t.trigrams.is_empty());
}
}
#[test]
fn index_tokens_excludes_stopwords() {
let tokens = fts().index_tokens("กินข้าวกับปลา");
assert!(tokens.iter().all(|t| !t.is_stop));
}
#[test]
fn index_tokens_preserves_positions() {
let all = fts().segment_for_fts("กินข้าวกับปลา");
let indexed = fts().index_tokens("กินข้าวกับปลา");
for t in &indexed {
assert!(
all.iter().any(|a| a.position == t.position),
"indexed token at position {} not found in full token list",
t.position
);
}
}
#[test]
fn lexemes_returns_non_stop_texts() {
let lexemes = fts().lexemes("กินข้าวกับปลา");
assert!(!lexemes.contains(&String::from("กับ")));
assert!(
lexemes
.iter()
.any(|l| l == "กิน" || l == "ข้าว" || l == "ปลา"),
"expected content words in lexemes: {lexemes:?}"
);
}
#[test]
fn lexemes_empty_input_is_empty() {
assert!(fts().lexemes("").is_empty());
}
#[test]
fn multi_token_ne_merged_in_pipeline() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("ไปกรุงเทพ");
let named: Vec<_> = tokens
.iter()
.filter(|t| matches!(t.kind, TokenKind::Named(_)))
.collect();
assert!(
named.iter().any(|t| t.text == "กรุงเทพ"),
"กรุงเทพ should be tagged Named after multi-token merge, tokens: {:?}",
tokens
.iter()
.map(|t| (&t.text, &t.kind))
.collect::<Vec<_>>()
);
}
#[test]
fn multi_token_ne_reconstructable() {
let fts = FtsTokenizer::new();
let text = "ไปกรุงเทพ";
let normalized = fts.tokenizer.normalize(text);
let tokens = fts.segment_for_fts(text);
let rebuilt: String = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(rebuilt, normalized);
}
#[test]
fn builder_custom_stopwords() {
let stops = StopwordSet::from_text("กิน\n");
let fts = FtsTokenizer::builder().stopwords(stops).build();
let tokens = fts.segment_for_fts("กินข้าว");
let gin = tokens.iter().find(|t| t.text == "กิน");
if let Some(t) = gin {
assert!(t.is_stop, "'กิน' should be stop with custom list");
}
}
#[test]
fn builder_default_equals_new() {
let a = FtsTokenizer::new().lexemes("กินข้าว");
let b = FtsTokenizer::builder().build().lexemes("กินข้าว");
assert_eq!(a, b);
}
#[test]
fn thai_digit_token_gets_ascii_synonym() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("๑๒๓");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
assert!(num.is_some(), "expected a Number token");
let t = num.unwrap();
assert!(
t.synonyms.contains(&String::from("123")),
"Thai digit token should have ASCII synonym, got {:?}",
t.synonyms
);
}
#[test]
fn ascii_digit_token_has_no_extra_synonym() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("123");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
assert!(num.is_some(), "expected a Number token");
assert!(
!num.unwrap().synonyms.contains(&String::from("123")),
"ASCII digit token should not duplicate itself as a synonym"
);
}
#[test]
fn thai_number_word_gets_decimal_synonym() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("หนึ่งร้อย");
let has_hundred = tokens
.iter()
.any(|t| t.synonyms.contains(&String::from("100")));
assert!(
has_hundred,
"expected a token with decimal synonym '100', tokens: {:?}",
tokens
.iter()
.map(|t| (&t.text, &t.synonyms))
.collect::<Vec<_>>()
);
}
#[test]
fn number_normalize_false_disables_conversion() {
let fts = FtsTokenizer::builder()
.number_normalize(false)
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("๑๒๓");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
assert!(num.is_some());
assert!(
!num.unwrap().synonyms.contains(&String::from("123")),
"number_normalize=false should suppress ASCII synonym"
);
}
#[test]
fn mixed_thai_digit_in_context() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("ธนาคาร๑๐๐แห่ง");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
assert!(num.is_some(), "expected Number token in mixed string");
assert!(
num.unwrap().synonyms.contains(&String::from("100")),
"expected ASCII synonym '100' for ๑๐๐"
);
}
#[test]
fn abbrev_map_expands_before_segmentation() {
let fts = FtsTokenizer::builder()
.abbrevs(AbbrevMap::builtin())
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("ก.ค.");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
let joined: String = texts.concat();
assert!(
joined.contains("กรกฎา") || joined.contains("กรกฎาคม"),
"expected กรกฎา(คม) characters after abbrev expansion, got: {texts:?}"
);
assert!(
!texts.contains(&"."),
"dots should be consumed by abbrev expansion, got: {texts:?}"
);
}
#[test]
fn abbrev_expansion_disabled_by_default() {
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("ก.ค.");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(
texts.contains(&"."),
"without abbrev expansion, dots should remain as tokens, got: {texts:?}"
);
}
#[test]
fn abbrev_expansion_date_sentence() {
let fts = FtsTokenizer::builder()
.abbrevs(AbbrevMap::builtin())
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("พ.ศ.2567");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
let joined: String = texts.concat();
assert!(
joined.contains("พุทธ") || joined.contains("พุทธศักราช"),
"expected พุทธ(ศักราช) chars after expanding พ.ศ., got: {texts:?}"
);
assert!(
!texts.contains(&"."),
"dots should be consumed by expansion, got: {texts:?}"
);
}
}