pub struct TokenizerBuilder { /* private fields */ }
Implementations§
impl TokenizerBuilder
pub fn dict_words(self, words: &str) -> Self
Load an additional word list from a string (newline-separated words).
Words are merged with the built-in dictionary.
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::builder()
.dict_words("ปัญญาประดิษฐ์\n")
.build();
let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
pub fn dict_merge(self, words: &str) -> Self
Add extra words via a lightweight overlay — no trie rebuild.
Words are stored in a sorted list alongside the pre-compiled trie.
This is O(k log k) in the number of custom words and avoids the O(N)
full trie rebuild that dict_words performs.
Prefer dict_merge over dict_words when adding a small custom
vocabulary (e.g. domain-specific terms, product names).
If both dict_merge and dict_words are called, dict_words takes
precedence (it performs a full rebuild that subsumes any overlay).
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::builder()
.dict_merge("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n")
.build();
let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
pub fn keep_whitespace(self, keep: bool) -> Self
Configure whether whitespace tokens are included in the output.
Default: false (whitespace is discarded).
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::builder().keep_whitespace(true).build();
let tokens = tok.segment("กิน ข้าว");
assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
// Byte spans are contiguous when whitespace is kept
for w in tokens.windows(2) {
assert_eq!(w[0].span.end, w[1].span.start);
}
pub fn dict_file(self, path: &str) -> Result<Self, KhamError>
Try to load a custom word list from a file path.
Only available when the std feature is enabled.
§Errors
Returns KhamError::DictLoadError if the file cannot be read.
§Example
use kham_core::Tokenizer;
let tok = Tokenizer::builder()
.dict_file("my_words.txt")
.expect("failed to load dict")
.build();