pub struct FtsTokenizerBuilder { /* private fields */ }
Expand description
Builder for FtsTokenizer.
Implementations§
Source § impl FtsTokenizerBuilder
impl FtsTokenizerBuilder
Source pub fn stopwords(self, s: StopwordSet) -> Self
pub fn stopwords(self, s: StopwordSet) -> Self
Use a custom stopword set instead of the built-in list.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::stopwords::StopwordSet;
let stops = StopwordSet::from_text("กิน\nข้าว\n");
let fts = FtsTokenizer::builder().stopwords(stops).build();
let tokens = fts.segment_for_fts("กินข้าว");
assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));
Source pub fn synonyms(self, m: SynonymMap) -> Self
pub fn synonyms(self, m: SynonymMap) -> Self
Attach a synonym map for expansion.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::synonym::SynonymMap;
// TSV: canonical TAB synonym1 TAB synonym2 …
let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
let fts = FtsTokenizer::builder().synonyms(syns).build();
let tokens = fts.segment_for_fts("รถ");
let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
assert!(t.synonyms.contains(&String::from("รถยนต์")));
Source pub fn ngram_size(self, n: usize) -> Self
pub fn ngram_size(self, n: usize) -> Self
Override the n-gram size used for TokenKind::Unknown tokens.
Default: 3 (trigrams). Set to 0 to disable n-gram generation.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::stopwords::StopwordSet;
// Disable n-grams entirely — useful when index size must be small
let fts = FtsTokenizer::builder()
.ngram_size(0)
.stopwords(StopwordSet::from_text(""))
.build();
let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
assert!(tokens.iter().all(|t| t.trigrams.is_empty()));
Source pub fn pos_tagger(self, t: PosTagger) -> Self
pub fn pos_tagger(self, t: PosTagger) -> Self
Use a custom POS tagger instead of the built-in table.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::pos::{PosTag, PosTagger};
// Custom TSV: word TAB POS_TAG
let tagger = PosTagger::from_tsv("กิน\tVERB\n");
let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
// Segment กิน alone so it is not merged into a compound
let tokens = fts.segment_for_fts("กิน");
let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
assert_eq!(t.pos, Some(PosTag::Verb));
Source pub fn ne_tagger(self, t: NeTagger) -> Self
pub fn ne_tagger(self, t: NeTagger) -> Self
Use a custom NE gazetteer instead of the built-in table.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::ne::NeTagger;
use kham_core::TokenKind;
// Domain-specific NE list: word TAB NE_TAG
let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
let fts = FtsTokenizer::builder().ne_tagger(ne).build();
let tokens = fts.segment_for_fts("เซเรน่า");
assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));
Source pub fn romanization(self, m: RomanizationMap) -> Self
pub fn romanization(self, m: RomanizationMap) -> Self
Attach a romanization map so RTGS forms are added to FtsToken::synonyms.
When set, each Thai and Named token whose text is found in the map gets its
RTGS romanization appended to synonyms, enabling Latin-script queries
(e.g. kin) to match Thai-script documents (e.g. กิน) in PostgreSQL FTS.
Disabled by default — call this method to opt in.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::romanizer::RomanizationMap;
// TSV: Thai word TAB RTGS romanization
let rom = RomanizationMap::from_tsv("กิน\tkin\n");
let fts = FtsTokenizer::builder().romanization(rom).build();
let tokens = fts.segment_for_fts("กิน");
let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
// Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
assert!(t.synonyms.contains(&String::from("kin")));
Source pub fn abbrevs(self, m: AbbrevMap) -> Self
pub fn abbrevs(self, m: AbbrevMap) -> Self
Attach an abbreviation map for pre-tokenisation expansion.
When set, FtsTokenizer::segment_for_fts calls
AbbrevMap::expand_text on the normalised input before segmentation.
This replaces abbreviated forms (e.g. ก.ค.) with their canonical
expansions (กรกฎาคม) so they are indexed and searchable by full form.
Disabled by default — call this method to opt in.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::abbrev::AbbrevMap;
use kham_core::stopwords::StopwordSet;
let fts = FtsTokenizer::builder()
.abbrevs(AbbrevMap::builtin())
.stopwords(StopwordSet::from_text(""))
.build();
// ก.ค. expands to กรกฎาคม before segmentation — dots disappear
let tokens = fts.segment_for_fts("ก.ค.");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"."), "dots should be consumed by expansion");
Source pub fn number_normalize(self, v: bool) -> Self
pub fn number_normalize(self, v: bool) -> Self
Enable or disable number normalization (default: true).
When enabled:
- TokenKind::Number tokens that contain Thai digits (๐–๙) get the ASCII digit string added to their FtsToken::synonyms (e.g. ๑๒๓ → synonym "123").
- TokenKind::Thai tokens that are recognised Thai cardinal number words get their decimal value added to synonyms (e.g. หนึ่งร้อย → synonym "100").
This lets queries using either script match documents written in the
other. Set to false to opt out.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::TokenKind;
// Default (true): ๑๒๓ gets ASCII synonym "123"
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("๑๒๓");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert!(num.synonyms.contains(&String::from("123")));
// Opt out: no conversion performed
let fts_off = FtsTokenizer::builder().number_normalize(false).build();
let tokens_off = fts_off.segment_for_fts("๑๒๓");
let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert!(!num_off.synonyms.contains(&String::from("123")));
Source pub fn soundex(self, algo: SoundexAlgorithm) -> Self
pub fn soundex(self, algo: SoundexAlgorithm) -> Self
Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.
When set, each Thai and Named token whose text contains Thai consonants gets its
soundex code appended to FtsToken::synonyms, enabling phonetic fuzzy matching
in full-text search (e.g. querying "1600" matches กาน, ขาน, and คาน with lk82).
SoundexAlgorithm::Lk82 and SoundexAlgorithm::Udom83 produce fixed
4-character codes and are the recommended choices for FTS indexing.
SoundexAlgorithm::MetaSound produces variable-length codes and is more
collision-prone at word level — prefer lk82 or udom83 for general FTS use.
Disabled by default — call this method to opt in.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::soundex::{lk82, SoundexAlgorithm};
use kham_core::stopwords::StopwordSet;
let fts = FtsTokenizer::builder()
.soundex(SoundexAlgorithm::Lk82)
.stopwords(StopwordSet::from_text(""))
.build();
// กาน / ขาน / คาน all map to the same lk82 code — stored once per token
for word in &["กาน", "ขาน", "คาน"] {
let tokens = fts.segment_for_fts(word);
let t = tokens.first().unwrap();
assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
}
Source pub fn build(self) -> FtsTokenizer
pub fn build(self) -> FtsTokenizer
Consume the builder and return a configured FtsTokenizer.
§Example
use kham_core::fts::FtsTokenizer;
use kham_core::soundex::SoundexAlgorithm;
use kham_core::stopwords::StopwordSet;
let fts = FtsTokenizer::builder()
.soundex(SoundexAlgorithm::Lk82)
.stopwords(StopwordSet::from_text(""))
.build();
assert!(!fts.segment_for_fts("กินข้าว").is_empty());