pub struct Tokenizer { /* private fields */ }
Expand description
High-level tokenizer. Holds a compiled dictionary and segmentation options.
§Example
use kham_core::Tokenizer;
let tok = Tokenizer::new();
let tokens = tok.segment("กินข้าวกับปลา");
assert!(!tokens.is_empty());
Implementations§
Source§
impl Tokenizer
impl Tokenizer
Source
pub fn normalize(&self, text: &str) -> String
pub fn normalize(&self, text: &str) -> String
Normalize Thai text into canonical form.
This is a convenience wrapper around normalizer::normalize.
Because segment is zero-copy, normalization must happen before
segmentation. The caller owns the returned alloc::string::String and can then
borrow it for segment:
use kham_core::Tokenizer;
let tok = Tokenizer::new();
// Input with a doubled tone mark and decomposed Sara Am
let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
let normalized = tok.normalize(raw); // น้ำ composed, no dedup needed here
let tokens = tok.segment(&normalized); // tokens borrow `normalized`
assert!(!tokens.is_empty());
Source
pub fn builder() -> TokenizerBuilder
pub fn builder() -> TokenizerBuilder
Return a TokenizerBuilder for custom configuration.
§Example
use kham_core::Tokenizer;
// Use built-in dict (no extra words needed here)
let tok = Tokenizer::builder().build();
let tokens = tok.segment("สวัสดีชาวโลก");
assert!(!tokens.is_empty());
Source
pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>>
pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>>
Segment text into tokens.
Returns a Vec<Token<'_>> where every token’s text is a
zero-copy sub-slice of text.
Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass through unchanged. Thai spans are segmented with the newmm DAG algorithm constrained to TCC boundaries.
§Example
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::new();
// Mixed Thai + number + Thai
let tokens = tok.segment("ธนาคาร100แห่ง");
assert_eq!(tokens[1].text, "100");
assert_eq!(tokens[1].kind, TokenKind::Number);
Trait Implementations§
Auto Trait Implementations§
impl Freeze for Tokenizer
impl RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl UnsafeUnpin for Tokenizer
impl UnwindSafe for Tokenizer
Blanket Implementations§
Source§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§
fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more