Struct FtsTokenizerBuilder

Source

pub struct FtsTokenizerBuilder { /* private fields */ }

Expand description

Builder for FtsTokenizer.

Implementations§

Source §

impl FtsTokenizerBuilder

Source

pub fn stopwords(self, s: StopwordSet) -> Self

Use a custom stopword set instead of the built-in list.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::stopwords::StopwordSet;

let stops = StopwordSet::from_text("กิน\nข้าว\n");
let fts = FtsTokenizer::builder().stopwords(stops).build();
let tokens = fts.segment_for_fts("กินข้าว");
assert!(tokens.iter().all(|t| t.is_stop || t.text != "กิน"));

Source

pub fn synonyms(self, m: SynonymMap) -> Self

Attach a synonym map for expansion.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::synonym::SynonymMap;

// TSV: canonical TAB synonym1 TAB synonym2 …
let syns = SynonymMap::from_tsv("รถ\tรถยนต์\tยานพาหนะ\n");
let fts = FtsTokenizer::builder().synonyms(syns).build();
let tokens = fts.segment_for_fts("รถ");
let t = tokens.iter().find(|t| t.text == "รถ").unwrap();
assert!(t.synonyms.contains(&String::from("รถยนต์")));

Source

pub fn ngram_size(self, n: usize) -> Self

Override the n-gram size used for TokenKind::Unknown tokens.

Default: 3 (trigrams). Set to 0 to disable n-gram generation.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::stopwords::StopwordSet;

// Disable n-grams entirely — useful when index size must be small
let fts = FtsTokenizer::builder()
    .ngram_size(0)
    .stopwords(StopwordSet::from_text(""))
    .build();
let tokens = fts.segment_for_fts("กขคง"); // unknown word → no trigrams
assert!(tokens.iter().all(|t| t.trigrams.is_empty()));

Source

pub fn pos_tagger(self, t: PosTagger) -> Self

Use a custom part-of-speech (POS) tagger instead of the built-in table.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::pos::{PosTag, PosTagger};

// Custom TSV: word TAB POS_TAG
let tagger = PosTagger::from_tsv("กิน\tVERB\n");
let fts = FtsTokenizer::builder().pos_tagger(tagger).build();
// Segment กิน alone so it is not merged into a compound
let tokens = fts.segment_for_fts("กิน");
let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
assert_eq!(t.pos, Some(PosTag::Verb));

Source

pub fn ne_tagger(self, t: NeTagger) -> Self

Use a custom named-entity (NE) gazetteer instead of the built-in table.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::ne::NeTagger;
use kham_core::TokenKind;

// Domain-specific NE list: word TAB NE_TAG
let ne = NeTagger::from_tsv("เซเรน่า\tPERSON\n");
let fts = FtsTokenizer::builder().ne_tagger(ne).build();
let tokens = fts.segment_for_fts("เซเรน่า");
assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Named(_))));

Source

pub fn romanization(self, m: RomanizationMap) -> Self

Attach a romanization map so RTGS (Royal Thai General System of Transcription) forms are added to FtsToken::synonyms.

When set, each Thai and Named token whose text is found in the map gets its RTGS romanization appended to synonyms, enabling Latin-script queries (e.g. kin) to match Thai-script documents (e.g. กิน) in PostgreSQL FTS.

Disabled by default — call this method to opt in.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::romanizer::RomanizationMap;

// TSV: Thai word TAB RTGS romanization
let rom = RomanizationMap::from_tsv("กิน\tkin\n");
let fts = FtsTokenizer::builder().romanization(rom).build();
let tokens = fts.segment_for_fts("กิน");
let t = tokens.iter().find(|t| t.text == "กิน").unwrap();
// Latin synonym "kin" enables queries like `WHERE doc @@ 'kin'`
assert!(t.synonyms.contains(&String::from("kin")));

Source

pub fn abbrevs(self, m: AbbrevMap) -> Self

Attach an abbreviation map for pre-tokenisation expansion.

When set, FtsTokenizer::segment_for_fts calls AbbrevMap::expand_text on the normalised input before segmentation. This replaces abbreviated forms (e.g. ก.ค.) with their canonical expansions (กรกฎาคม) so they are indexed and searchable by full form.

Disabled by default — call this method to opt in.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::abbrev::AbbrevMap;
use kham_core::stopwords::StopwordSet;

let fts = FtsTokenizer::builder()
    .abbrevs(AbbrevMap::builtin())
    .stopwords(StopwordSet::from_text(""))
    .build();
// ก.ค. expands to กรกฎาคม before segmentation — dots disappear
let tokens = fts.segment_for_fts("ก.ค.");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"."), "dots should be consumed by expansion");

Source

pub fn number_normalize(self, v: bool) -> Self

Enable or disable number normalization (default: true).

When enabled:

- TokenKind::Number tokens that contain Thai digits (๐–๙) get the ASCII digit string added to their FtsToken::synonyms (e.g. ๑๒๓ → synonym "123").
- TokenKind::Thai tokens that are recognised Thai cardinal number words get their decimal value added to synonyms (e.g. หนึ่งร้อย → synonym "100").

This lets a query written in one form (Thai or ASCII digits, number words or decimal values) match documents written in the other. Set to false to opt out.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::TokenKind;

// Default (true): ๑๒๓ gets ASCII synonym "123"
let fts = FtsTokenizer::new();
let tokens = fts.segment_for_fts("๑๒๓");
let num = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert!(num.synonyms.contains(&String::from("123")));

// Opt out: no conversion performed
let fts_off = FtsTokenizer::builder().number_normalize(false).build();
let tokens_off = fts_off.segment_for_fts("๑๒๓");
let num_off = tokens_off.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert!(!num_off.synonyms.contains(&String::from("123")));

Source

pub fn soundex(self, algo: SoundexAlgorithm) -> Self

Emit a Thai phonetic soundex code as an additional synonym for Thai and Named tokens.

When set, each Thai and Named token whose text contains Thai consonants gets its soundex code appended to FtsToken::synonyms, enabling phonetic fuzzy matching in full-text search (e.g. with lk82, กาน, ขาน, and คาน share the code "1600", so a query for "1600" matches all three).

SoundexAlgorithm::Lk82 and SoundexAlgorithm::Udom83 produce fixed 4-character codes and are the recommended choices for FTS indexing. SoundexAlgorithm::MetaSound produces variable-length codes and is more collision-prone at word level — prefer lk82 or udom83 for general FTS use.

Disabled by default — call this method to opt in.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::soundex::{lk82, SoundexAlgorithm};
use kham_core::stopwords::StopwordSet;

let fts = FtsTokenizer::builder()
    .soundex(SoundexAlgorithm::Lk82)
    .stopwords(StopwordSet::from_text(""))
    .build();
// กาน / ขาน / คาน all map to the same lk82 code — stored once per token
for word in &["กาน", "ขาน", "คาน"] {
    let tokens = fts.segment_for_fts(word);
    let t = tokens.first().unwrap();
    assert!(t.synonyms.contains(&lk82(word)), "{word} missing lk82 synonym");
}

Source

pub fn build(self) -> FtsTokenizer

Consume the builder and return a configured FtsTokenizer.

§Example

use kham_core::fts::FtsTokenizer;
use kham_core::soundex::SoundexAlgorithm;
use kham_core::stopwords::StopwordSet;

let fts = FtsTokenizer::builder()
    .soundex(SoundexAlgorithm::Lk82)
    .stopwords(StopwordSet::from_text(""))
    .build();
assert!(!fts.segment_for_fts("กินข้าว").is_empty());