pub struct FtsTokenizer { /* private fields */ }
Full-text search tokenizer for Thai text.
Wraps Tokenizer with stopword filtering, synonym expansion, and n-gram
generation for out-of-vocabulary tokens.
Construct once and reuse:
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("กินข้าวกับปลา");
assert!(!tokens.is_empty());

Implementations

impl FtsTokenizer

pub fn new() -> Self
Create an FtsTokenizer with built-in stopwords and no synonyms.
# Example
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let lexemes = fts.lexemes("กินข้าวกับปลา");
// Built-in stopword กับ is excluded; content words are present
assert!(!lexemes.contains(&String::from("กับ")));
assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));

pub fn builder() -> FtsTokenizerBuilder
Return a FtsTokenizerBuilder for custom configuration.
# Example
use kham_core::fts::FtsTokenizer;
use kham_core::soundex::SoundexAlgorithm;
use kham_core::synonym::SynonymMap;
let fts = FtsTokenizer::builder()
.synonyms(SynonymMap::from_tsv("รถ\tรถยนต์\n"))
.soundex(SoundexAlgorithm::Lk82)
.build();
assert!(!fts.segment_for_fts("รถ").is_empty());

pub fn segment_for_fts(&self, text: &str) -> Vec<FtsToken>
Segment text and annotate each token for FTS indexing.
Normalises the input text before segmentation so that floating vowels (สระลอย) and stacked tone marks are handled correctly. Whitespace tokens are excluded.
The returned Vec<FtsToken> covers all non-whitespace tokens. Call
index_tokens instead when you only need the tokens to be indexed
(stopwords excluded).
# Examples
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("กินข้าวกับปลา");
// Positions are 0-based and sequential across non-whitespace tokens
for (i, t) in tokens.iter().enumerate() {
assert_eq!(t.position, i);
}
// กับ is a common conjunction — marked as a stopword
let kap = tokens.iter().find(|t| t.text == "กับ").unwrap();
assert!(kap.is_stop);

Named entities are tagged automatically — kind becomes TokenKind::Named:
use kham_core::fts::FtsTokenizer;
use kham_core::TokenKind;
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("ไปกรุงเทพ");
assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));

Enable phonetic synonyms with FtsTokenizerBuilder::soundex:
use kham_core::fts::FtsTokenizer;
use kham_core::soundex::SoundexAlgorithm;
let fts = FtsTokenizer::builder()
.soundex(SoundexAlgorithm::Lk82)
.build();
let tokens = fts.segment_for_fts("กิน");
let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
// synonyms now contains the lk82 code, enabling fuzzy phonetic matching
assert!(!t.synonyms.is_empty());

pub fn index_tokens(&self, text: &str) -> Vec<FtsToken>
Return only the tokens to be written into a search index.
Filters out stopwords and whitespace. Each FtsToken still carries
its original position so phrase-distance scoring remains correct.
# Example
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let tokens = fts.index_tokens("กินข้าวกับปลา");
// No stopwords in the index
assert!(tokens.iter().all(|t| !t.is_stop));
// Positions are preserved from the full sequence for phrase scoring
let positions: Vec<usize> = tokens.iter().map(|t| t.position).collect();
assert!(positions.windows(2).all(|w| w[0] < w[1]));

pub fn lexemes(&self, text: &str) -> Vec<String>
Collect all lexeme strings to be stored in a tsvector.
Returns one string per non-stop token, plus synonym expansions and trigrams for unknown tokens. Duplicates are not removed (the caller or PostgreSQL handles deduplication).
# Example
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let lexemes = fts.lexemes("กินข้าวกับปลา");
// Content words are present; stopword กับ is absent
assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
assert!(!lexemes.contains(&String::from("กับ")));

With Thai digit normalization (enabled by default), both scripts match:
use kham_core::fts::FtsTokenizer;
let fts = FtsTokenizer::new();
let lexemes = fts.lexemes("ธนาคาร๑๐๐แห่ง");
// ๑๐๐ (Thai digits) → synonym "100" (ASCII) — both appear in lexemes
assert!(lexemes.contains(&String::from("100")));