pub struct TokenizerBuilder { /* private fields */ }
Implementations§
Source§impl TokenizerBuilder
impl TokenizerBuilder
Source pub fn dict_words(self, words: &str) -> Self
pub fn dict_words(self, words: &str) -> Self
Load an additional word list from a string (newline-separated words).
Words are merged with the built-in dictionary.
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::builder()
    .dict_words("ปัญญาประดิษฐ์\n")
    .build();
let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
Source pub fn keep_whitespace(self, keep: bool) -> Self
pub fn keep_whitespace(self, keep: bool) -> Self
Configure whether whitespace tokens are included in the output.
Default: false (whitespace is discarded).
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::builder().keep_whitespace(true).build();
let tokens = tok.segment("กิน ข้าว");
assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
// Byte spans are contiguous when whitespace is kept
for w in tokens.windows(2) {
    assert_eq!(w[0].span.end, w[1].span.start);
}
Source pub fn dict_file(self, path: &str) -> Result<Self, KhamError>
pub fn dict_file(self, path: &str) -> Result<Self, KhamError>
Try to load a custom word list from a file path.
Only available when the std feature is enabled.
§Errors
Returns KhamError::DictLoadError if the file cannot be read.
§Example
use kham_core::Tokenizer;
let tok = Tokenizer::builder()
    .dict_file("my_words.txt")
    .expect("failed to load dict")
    .build();
Trait Implementations§
Source§impl Debug for TokenizerBuilder
impl Debug for TokenizerBuilder
Source§impl Default for TokenizerBuilder
impl Default for TokenizerBuilder
Source§fn default() -> TokenizerBuilder
fn default() -> TokenizerBuilder
Returns the “default value” for a type. Read more
Auto Trait Implementations§
impl Freeze for TokenizerBuilder
impl RefUnwindSafe for TokenizerBuilder
impl Send for TokenizerBuilder
impl Sync for TokenizerBuilder
impl Unpin for TokenizerBuilder
impl UnsafeUnpin for TokenizerBuilder
impl UnwindSafe for TokenizerBuilder
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more