pub struct Tokenizer { /* private fields */ }
High-level tokenizer. Holds a compiled dictionary and segmentation options.
§Example
use kham_core::Tokenizer;
let tok = Tokenizer::new();
let tokens = tok.segment("กินข้าวกับปลา");
assert!(!tokens.is_empty());
Implementations§
impl Tokenizer
pub fn normalize(&self, text: &str) -> String
Normalize Thai text into canonical form.
This is a convenience wrapper around normalizer::normalize.
Because segment is zero-copy, normalization must happen before segmentation. The caller owns the returned String and can then borrow it for segment:
use kham_core::Tokenizer;
let tok = Tokenizer::new();
// Input with decomposed Sara Am: Nikhahit (U+0E4D) + Sara Aa (U+0E32)
let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ◌้ + ◌ํ + า
let normalized = tok.normalize(raw); // Sara Am composed: yields "กินน้ำ"
let tokens = tok.segment(&normalized); // tokens borrow `normalized`
assert!(!tokens.is_empty());
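Since normalize just delegates, the method and the free function should agree byte-for-byte. A minimal sketch, assuming the normalizer module is reachable as kham_core::normalizer (the wrapper note above suggests it is, but the re-export is not shown on this page):
use kham_core::{normalizer, Tokenizer};
let tok = Tokenizer::new();
let raw = "\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // น + ◌้ + ◌ํ + า (decomposed น้ำ)
// Both paths should produce the same canonical string.
assert_eq!(tok.normalize(raw), normalizer::normalize(raw));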
pub fn builder() -> TokenizerBuilder
Return a TokenizerBuilder for custom configuration.
§Example
use kham_core::Tokenizer;
// Use built-in dict (no extra words needed here)
let tok = Tokenizer::builder().build();
let tokens = tok.segment("สวัสดีชาวโลก");
assert!(!tokens.is_empty());
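The comment above implies the builder can register extra dictionary words. A sketch of that flow; add_word is a hypothetical method name, since the TokenizerBuilder API is not documented on this page:
use kham_core::Tokenizer;
let tok = Tokenizer::builder()
    .add_word("เคเอฟซี") // hypothetical: register a word the built-in dict may lack
    .build();
let tokens = tok.segment("เคเอฟซีอร่อย");
assert!(!tokens.is_empty());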
pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>>
Segment text into tokens.
Returns a Vec<Token<'_>> where every token’s text is a
zero-copy sub-slice of text.
Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass through unchanged. Thai spans are segmented with the newmm DAG algorithm constrained to TCC boundaries.
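A quick sketch of the pass-through behavior; the TokenKind::Latin variant name is assumed from the span kinds listed above:
use kham_core::{TokenKind, Tokenizer};
let tok = Tokenizer::new();
// The Latin span survives as one untouched token; the Thai span is dictionary-segmented.
let tokens = tok.segment("Rustสนุก");
assert_eq!(tokens[0].text, "Rust");
assert_eq!(tokens[0].kind, TokenKind::Latin); // variant name assumed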
§Examples
use kham_core::{Tokenizer, TokenKind};
let tok = Tokenizer::new();
// Mixed Thai + number + Thai — number token lands at index 1
let tokens = tok.segment("ธนาคาร100แห่ง");
assert_eq!(tokens[1].text, "100");
assert_eq!(tokens[1].kind, TokenKind::Number);
Joining all token texts reconstructs the original string, up to whitespace: whitespace tokens are dropped by default, so the joined result omits any whitespace the input contained (the input below has none):
use kham_core::Tokenizer;
let tok = Tokenizer::new();
let text = "กินข้าวกับปลา";
let tokens = tok.segment(text);
let rebuilt: String = tokens.iter().map(|t| t.text).collect();
assert_eq!(rebuilt, text);
Every token carries both byte and char offsets into the original string:
use kham_core::Tokenizer;
let tok = Tokenizer::new();
let text = "ธนาคาร100แห่ง";
let tokens = tok.segment(text);
for t in &tokens {
// Byte span: valid UTF-8 slice
assert_eq!(&text[t.span.clone()], t.text);
// Char span: length matches Unicode scalar count
assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
}
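To make the zero-copy claim concrete, a sketch that verifies each token's text points into the original buffer, using only pointer arithmetic on the fields shown above:
use kham_core::Tokenizer;
let tok = Tokenizer::new();
let text = "กินข้าวกับปลา";
let tokens = tok.segment(text);
let start = text.as_ptr() as usize;
for t in &tokens {
    let p = t.text.as_ptr() as usize;
    // Each token slice must lie inside the original allocation: no copies were made.
    assert!(p >= start && p + t.text.len() <= start + text.len());
}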