kham_core/segmenter.rs
1//! DAG-based maximal matching segmenter (newmm algorithm).
2//!
//! The segmenter builds a directed acyclic graph (DAG) of candidate words over
//! the input text, using TCC boundaries as candidate split points, then finds
//! the path with the fewest unknown tokens (ties are broken by fewest tokens,
//! then most dictionary matches, then highest cumulative corpus frequency).
6//!
7//! ## Pipeline
8//!
9//! ```text
10//! raw text
11//! │
12//! ▼ (optional) Tokenizer::normalize() ← fixes tone dedup + Sara Am composition
13//! │
14//! ▼ pre_tokenize()
15//! [Thai span] [Number span] [Latin span] …
16//! │
17//! ▼ (Thai spans only) tcc_boundaries()
18//! TCC boundary positions: [0, b1, b2, …, len]
19//! │
20//! ▼ DP over boundary indices
21//! path of (start, end) pairs that maximises dict matches
22//! │
23//! ▼
24//! Vec<Token<'_>>
25//! ```
26//!
27//! ## Normalization and zero-copy
28//!
29//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
30//! the `&str` you pass in. This means segment() cannot internally normalize
31//! the text (normalization may reorder/remove characters, producing a new
32//! allocation with different byte offsets).
33//!
34//! For input that may contain สระลอย in wrong order, stacked tone marks, or
35//! decomposed Sara Am, use the two-step pattern:
36//!
37//! ```rust
38//! use kham_core::Tokenizer;
39//!
40//! let tok = Tokenizer::new();
41//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
42//! let tokens = tok.segment(&normalized); // tokens borrow `normalized`
43//! ```
44
45use alloc::vec;
46use alloc::vec::Vec;
47
48use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
49use crate::error::KhamError;
50use crate::freq::FreqMap;
51use crate::normalizer;
52use crate::pre_tokenizer::pre_tokenize;
53use crate::tcc::tcc_boundaries;
54use crate::token::{Token, TokenKind};
55
/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// Construct one with [`Tokenizer::new`] for the built-in defaults, or with
/// [`Tokenizer::builder`] for a custom configuration.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    // Dictionary queried for word prefixes while segmenting Thai spans.
    dict: Dict,
    // Word-frequency table used to score dictionary matches.
    freq: FreqMap,
    // When `false`, whitespace tokens are dropped from `segment` output.
    keep_whitespace: bool,
}
72
73impl Tokenizer {
74 /// Create a tokenizer with the built-in dictionary and TNC frequency table.
75 pub fn new() -> Self {
76 Self {
77 dict: builtin_dict(),
78 freq: FreqMap::builtin(),
79 keep_whitespace: false,
80 }
81 }
82
83 /// Normalise Thai text into canonical form.
84 ///
85 /// This is a convenience wrapper around [`normalizer::normalize`].
86 /// Because [`segment`] is zero-copy, normalization must happen **before**
87 /// segmentation. The caller owns the returned [`alloc::string::String`] and can then
88 /// borrow it for [`segment`]:
89 ///
90 /// ```rust
91 /// use kham_core::Tokenizer;
92 ///
93 /// let tok = Tokenizer::new();
94 /// // Input with a doubled tone mark and decomposed Sara Am
95 /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
96 /// let normalized = tok.normalize(raw); // น้ำ composed, no dedup needed here
97 /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
98 /// assert!(!tokens.is_empty());
99 /// ```
100 ///
101 /// [`segment`]: Tokenizer::segment
102 pub fn normalize(&self, text: &str) -> alloc::string::String {
103 normalizer::normalize(text)
104 }
105
106 /// Return a [`TokenizerBuilder`] for custom configuration.
107 ///
108 /// # Example
109 ///
110 /// ```rust
111 /// use kham_core::Tokenizer;
112 ///
113 /// // Use built-in dict (no extra words needed here)
114 /// let tok = Tokenizer::builder().build();
115 /// let tokens = tok.segment("สวัสดีชาวโลก");
116 /// assert!(!tokens.is_empty());
117 /// ```
118 pub fn builder() -> TokenizerBuilder {
119 TokenizerBuilder::default()
120 }
121
122 /// Segment `text` into tokens.
123 ///
124 /// Returns a `Vec<Token<'_>>` where every token's `text` is a
125 /// zero-copy sub-slice of `text`.
126 ///
127 /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
128 /// through unchanged. Thai spans are segmented with the newmm DAG
129 /// algorithm constrained to TCC boundaries.
130 ///
131 /// # Examples
132 ///
133 /// ```rust
134 /// use kham_core::{Tokenizer, TokenKind};
135 ///
136 /// let tok = Tokenizer::new();
137 /// // Mixed Thai + number + Thai — number token lands at index 1
138 /// let tokens = tok.segment("ธนาคาร100แห่ง");
139 /// assert_eq!(tokens[1].text, "100");
140 /// assert_eq!(tokens[1].kind, TokenKind::Number);
141 /// ```
142 ///
143 /// Joining all token texts reconstructs the original string (whitespace
144 /// is dropped by default, so the joined result omits whitespace):
145 ///
146 /// ```rust
147 /// use kham_core::Tokenizer;
148 ///
149 /// let tok = Tokenizer::new();
150 /// let text = "กินข้าวกับปลา";
151 /// let tokens = tok.segment(text);
152 /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
153 /// assert_eq!(rebuilt, text);
154 /// ```
155 ///
156 /// Every token carries both byte and char offsets into the original string:
157 ///
158 /// ```rust
159 /// use kham_core::Tokenizer;
160 ///
161 /// let tok = Tokenizer::new();
162 /// let text = "ธนาคาร100แห่ง";
163 /// let tokens = tok.segment(text);
164 /// for t in &tokens {
165 /// // Byte span: valid UTF-8 slice
166 /// assert_eq!(&text[t.span.clone()], t.text);
167 /// // Char span: length matches Unicode scalar count
168 /// assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
169 /// }
170 /// ```
171 pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
172 if text.is_empty() {
173 return Vec::new();
174 }
175
176 // Split into script-homogeneous spans. Non-Thai spans pass through;
177 // Thai spans go through the newmm DAG segmenter.
178 // Call normalize() first if the input may contain สระลอย in wrong
179 // order, stacked tone marks, or decomposed Sara Am.
180 let pre_tokens = pre_tokenize(text);
181
182 let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);
183
184 for token in pre_tokens {
185 match token.kind {
186 TokenKind::Thai => {
187 segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
188 }
189 TokenKind::Whitespace if !self.keep_whitespace => {
190 // Discard whitespace tokens unless keep_whitespace is set.
191 }
192 _ => {
193 result.push(token);
194 }
195 }
196 }
197
198 result
199 }
200
201 /// Segment `text` and return a [`TokenStream`] for incremental consumption.
202 ///
203 /// # Example
204 ///
205 /// ```rust
206 /// use kham_core::Tokenizer;
207 ///
208 /// let tok = Tokenizer::new();
209 /// let mut stream = tok.segment_stream("กินข้าวกับปลา");
210 /// while let Some(t) = stream.next_word() {
211 /// println!("{}", t.text);
212 /// }
213 /// ```
214 pub fn segment_stream<'t>(&self, text: &'t str) -> TokenStream<'t> {
215 TokenStream {
216 inner: self.segment(text).into_iter(),
217 }
218 }
219}
220
221// ---------------------------------------------------------------------------
222// TokenStream
223// ---------------------------------------------------------------------------
224
/// A streaming iterator over [`Token`]s produced by [`Tokenizer::segment_stream`].
///
/// Wraps the full `Vec<Token>` as an [`alloc::vec::IntoIter`]; segmentation
/// has already finished by the time the stream exists. The streaming API
/// lets callers consume tokens one at a time and filter by kind or
/// confidence without allocating a second collection.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder().keep_whitespace(true).build();
/// let mut stream = tok.segment_stream("กิน ข้าว");
/// // next_word() skips whitespace tokens.
/// while let Some(t) = stream.next_word() {
///     println!("{}", t.text);
/// }
/// ```
pub struct TokenStream<'t> {
    // Owning iterator over the already-segmented tokens.
    inner: alloc::vec::IntoIter<Token<'t>>,
}
246
247impl<'t> TokenStream<'t> {
248 /// Advance past [`TokenKind::Whitespace`] tokens and return the next
249 /// non-whitespace token, or `None` when the stream is exhausted.
250 pub fn next_word(&mut self) -> Option<Token<'t>> {
251 self.inner
252 .by_ref()
253 .find(|t| t.kind != TokenKind::Whitespace)
254 }
255
256 /// Advance past whitespace and [`TokenKind::Unknown`] tokens and return
257 /// the next token whose kind is neither, or `None` when exhausted.
258 pub fn next_known(&mut self) -> Option<Token<'t>> {
259 self.inner
260 .by_ref()
261 .find(|t| t.kind != TokenKind::Whitespace && t.kind != TokenKind::Unknown)
262 }
263
264 /// Advance past tokens with `confidence < min` and return the next
265 /// qualifying token, or `None` when the stream is exhausted.
266 pub fn next_above_confidence(&mut self, min: f32) -> Option<Token<'t>> {
267 self.inner.by_ref().find(|t| t.confidence >= min)
268 }
269}
270
271impl<'t> Iterator for TokenStream<'t> {
272 type Item = Token<'t>;
273
274 #[inline]
275 fn next(&mut self) -> Option<Token<'t>> {
276 self.inner.next()
277 }
278
279 #[inline]
280 fn size_hint(&self) -> (usize, Option<usize>) {
281 self.inner.size_hint()
282 }
283}
284
285// ---------------------------------------------------------------------------
286// newmm DAG segmentation — Thai spans only
287// ---------------------------------------------------------------------------
288
/// Lexicographic DP score attached to a TCC boundary position.
///
/// The derived `Ord` compares fields in declaration order, which encodes the
/// newmm preference directly:
/// 1. Fewest unknown tokens (`neg_unknowns` closer to zero compares greater).
/// 2. Fewest tokens overall (longer compounds beat their split components).
/// 3. Most dictionary matches.
/// 4. Highest cumulative TNC frequency as the last tiebreaker.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    /// Score of the empty path at the start of a span.
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    /// Score after extending the path with one dictionary word of corpus
    /// frequency `freq`.
    fn dict_edge(self, freq: u32) -> Self {
        let mut next = self;
        next.neg_tokens -= 1;
        next.dict_words += 1;
        next.freq_score += u64::from(freq);
        next
    }

    /// Score after extending the path with one unknown token.
    fn unknown_edge(self) -> Self {
        let mut next = self;
        next.neg_unknowns -= 1;
        next.neg_tokens -= 1;
        next
    }
}
329
330/// Output of the forward DP pass.
331struct DpTable {
332 /// Predecessor boundary index for backtracking.
333 from: Vec<usize>,
334 /// Whether the incoming edge at index `i` was a dictionary match.
335 is_dict: Vec<bool>,
336 /// TNC frequency of the winning dict edge that arrived at boundary `i`.
337 /// `0` for unknown edges or dict words with zero corpus frequency.
338 edge_freq: Vec<u32>,
339 /// Number of edges (dict + unknown fallback) that were considered when
340 /// trying to arrive at boundary `i`. Capped at 255.
341 competing: Vec<u8>,
342}
343
344/// Forward DP over TCC boundary indices for a single Thai slice.
345///
346/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
347fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
348 let nb = bounds.len();
349 let mut best: Vec<Option<DpScore>> = vec![None; nb];
350 let mut from = vec![0usize; nb];
351 let mut is_dict = vec![false; nb];
352 let mut edge_freq = vec![0u32; nb];
353 let mut competing = vec![0u8; nb];
354
355 best[0] = Some(DpScore::ZERO);
356
357 for i in 0..nb - 1 {
358 let score = match best[i] {
359 Some(s) => s,
360 None => continue,
361 };
362 let pos = bounds[i];
363 let remaining = &slice[pos..];
364
365 // Dictionary edges — all prefixes, not just the longest, so the DP
366 // can make a globally optimal choice rather than a greedy one.
367 for prefix in dict.prefixes(remaining) {
368 let end_pos = pos + prefix.len();
369 if let Ok(j) = bounds.binary_search(&end_pos) {
370 // Count every dict edge considered at this boundary.
371 competing[j] = competing[j].saturating_add(1);
372 let freq = freqs.get(prefix);
373 let candidate = Some(score.dict_edge(freq));
374 if candidate > best[j] {
375 best[j] = candidate;
376 from[j] = i;
377 is_dict[j] = true;
378 edge_freq[j] = freq;
379 }
380 }
381 }
382
383 // Fallback edge: advance one TCC as an unknown token.
384 let j = i + 1;
385 // Count the unknown fallback edge as a competing edge too.
386 competing[j] = competing[j].saturating_add(1);
387 let candidate = Some(score.unknown_edge());
388 if candidate > best[j] {
389 best[j] = candidate;
390 from[j] = i;
391 is_dict[j] = false;
392 edge_freq[j] = 0;
393 }
394 }
395
396 DpTable {
397 from,
398 is_dict,
399 edge_freq,
400 competing,
401 }
402}
403
/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
///
/// `from[i]` is the predecessor boundary index of boundary `i`. The returned
/// path starts at `0` and ends at `from.len() - 1`.
///
/// An empty `from` slice describes a table with no boundaries at all; in that
/// case the path is empty rather than a panic from `0 - 1` underflowing.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    if from.is_empty() {
        return Vec::new();
    }

    let mut path = Vec::with_capacity(from.len());
    let mut cur = from.len() - 1;
    path.push(cur);
    // Walk predecessor links until the start boundary is reached.
    while cur != 0 {
        cur = from[cur];
        path.push(cur);
    }
    // Links were collected end-to-start; callers want start-to-end.
    path.reverse();
    path
}
420
/// Compute the segmentation confidence for a single token boundary.
///
/// - `is_dict`: `true` when the winning edge into this boundary was a
///   dictionary match.
/// - `freq`: TNC corpus frequency of that winning edge (`0` for unknown edges
///   and for dictionary words missing from the frequency table).
/// - `competing`: how many edges (dict + unknown fallback) were tried as a
///   way of arriving at this boundary.
///
/// The result lies in `[0.0, 1.0]`:
/// - Unknown token → `0.0`
/// - Dict match with zero frequency → base `0.7`
/// - Dict match with nonzero frequency → base `1.0`
/// - The base is multiplied by an ambiguity factor: up to 1 edge → ×1.0,
///   2 edges → ×0.9, 3 edges → ×0.8, 4 or more → ×0.7
fn compute_confidence(is_dict: bool, freq: u32, competing: u8) -> f32 {
    if is_dict {
        let base: f32 = if freq == 0 { 0.7 } else { 1.0 };
        let ambiguity: f32 = if competing <= 1 {
            1.0
        } else if competing == 2 {
            0.9
        } else if competing == 3 {
            0.8
        } else {
            0.7
        };
        base * ambiguity
    } else {
        0.0
    }
}
447
448/// Segment a single Thai span using the newmm DAG algorithm and append tokens
449/// to `out`.
450///
451/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
452fn segment_thai<'t>(
453 dict: &Dict,
454 freqs: &FreqMap,
455 text: &'t str,
456 span: core::ops::Range<usize>,
457 out: &mut Vec<Token<'t>>,
458) {
459 let slice = &text[span.start..span.end];
460 let bounds = tcc_boundaries(slice);
461
462 if bounds.len() <= 1 {
463 return;
464 }
465
466 let dp = forward_dp(dict, freqs, slice, &bounds);
467 let path = backtrack_path(&dp.from);
468
469 // Char offset of span.start — computed once, then incremented per token.
470 let mut char_cursor = text[..span.start].chars().count();
471
472 for w in path.windows(2) {
473 let start_byte = span.start + bounds[w[0]];
474 let end_byte = span.start + bounds[w[1]];
475 let token_text = &text[start_byte..end_byte];
476 let char_start = char_cursor;
477 char_cursor += token_text.chars().count();
478 let kind = if dp.is_dict[w[1]] {
479 TokenKind::Thai
480 } else {
481 TokenKind::Unknown
482 };
483 let confidence =
484 compute_confidence(dp.is_dict[w[1]], dp.edge_freq[w[1]], dp.competing[w[1]]);
485 out.push(Token::new(
486 token_text,
487 start_byte..end_byte,
488 char_start..char_cursor,
489 kind,
490 confidence,
491 ));
492 }
493}
494
495// ---------------------------------------------------------------------------
496// Tokenizer trait impls
497// ---------------------------------------------------------------------------
498
499impl Default for Tokenizer {
500 fn default() -> Self {
501 Self::new()
502 }
503}
504
505// ---------------------------------------------------------------------------
506// TokenizerBuilder
507// ---------------------------------------------------------------------------
508
/// Builder for [`Tokenizer`].
///
/// Obtain one from [`Tokenizer::builder`], chain the configuration methods,
/// and finish with [`TokenizerBuilder::build`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    // Newline-separated extra words set by `dict_words` / `dict_file`;
    // `None` when no list was supplied.
    dict_words: Option<alloc::string::String>,
    // Newline-separated overlay words set by `dict_merge`; `None` when unset.
    dict_merge: Option<alloc::string::String>,
    // Copied into the built `Tokenizer`; defaults to `false`.
    keep_whitespace: bool,
}
526
527impl TokenizerBuilder {
528 /// Load an additional word list from a string (newline-separated words).
529 ///
530 /// Words are merged with the built-in dictionary.
531 ///
532 /// # Example
533 ///
534 /// ```rust
535 /// use kham_core::{Tokenizer, TokenKind};
536 ///
537 /// let tok = Tokenizer::builder()
538 /// .dict_words("ปัญญาประดิษฐ์\n")
539 /// .build();
540 /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
541 /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
542 /// ```
543 pub fn dict_words(mut self, words: &str) -> Self {
544 self.dict_words = Some(alloc::string::String::from(words));
545 self
546 }
547
548 /// Configure whether whitespace tokens are included in the output.
549 ///
550 /// Default: `false` (whitespace is discarded).
551 ///
552 /// # Example
553 ///
554 /// ```rust
555 /// use kham_core::{Tokenizer, TokenKind};
556 ///
557 /// let tok = Tokenizer::builder().keep_whitespace(true).build();
558 /// let tokens = tok.segment("กิน ข้าว");
559 /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
560 /// // Byte spans are contiguous when whitespace is kept
561 /// for w in tokens.windows(2) {
562 /// assert_eq!(w[0].span.end, w[1].span.start);
563 /// }
564 /// ```
565 /// Add extra words via a lightweight overlay — no trie rebuild.
566 ///
567 /// Words are stored in a sorted list alongside the pre-compiled trie.
568 /// This is O(k log k) in the number of custom words and avoids the O(N)
569 /// full trie rebuild that [`dict_words`](Self::dict_words) performs.
570 ///
571 /// Prefer `dict_merge` over `dict_words` when adding a small custom
572 /// vocabulary (e.g. domain-specific terms, product names).
573 ///
574 /// If both `dict_merge` and `dict_words` are called, `dict_words` takes
575 /// precedence (it performs a full rebuild that subsumes any overlay).
576 ///
577 /// # Example
578 ///
579 /// ```rust
580 /// use kham_core::{Tokenizer, TokenKind};
581 ///
582 /// let tok = Tokenizer::builder()
583 /// .dict_merge("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n")
584 /// .build();
585 /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
586 /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
587 /// ```
588 pub fn dict_merge(mut self, words: &str) -> Self {
589 self.dict_merge = Some(alloc::string::String::from(words));
590 self
591 }
592
593 /// Configure whether whitespace tokens are included in the output.
594 ///
595 /// Default: `false` (whitespace is discarded).
596 ///
597 /// # Example
598 ///
599 /// ```rust
600 /// use kham_core::{Tokenizer, TokenKind};
601 ///
602 /// let tok = Tokenizer::builder().keep_whitespace(true).build();
603 /// let tokens = tok.segment("กิน ข้าว");
604 /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
605 /// // Byte spans are contiguous when whitespace is kept
606 /// for w in tokens.windows(2) {
607 /// assert_eq!(w[0].span.end, w[1].span.start);
608 /// }
609 /// ```
610 pub fn keep_whitespace(mut self, keep: bool) -> Self {
611 self.keep_whitespace = keep;
612 self
613 }
614
615 /// Consume the builder and return a configured [`Tokenizer`].
616 pub fn build(self) -> Tokenizer {
617 let dict = if let Some(extra) = &self.dict_words {
618 // Full rebuild path: merges BUILTIN_WORDS + custom words into a new trie.
619 let mut combined = alloc::string::String::from(BUILTIN_WORDS);
620 combined.push('\n');
621 combined.push_str(extra);
622 Dict::from_word_list(&combined)
623 } else if let Some(overlay) = &self.dict_merge {
624 // Fast overlay path: load pre-compiled binary, attach small sorted list.
625 builtin_dict().with_overlay(overlay)
626 } else {
627 // Default path: load from pre-compiled binary — O(S) copy.
628 builtin_dict()
629 };
630 Tokenizer {
631 dict,
632 freq: FreqMap::builtin(),
633 keep_whitespace: self.keep_whitespace,
634 }
635 }
636
637 /// Try to load a custom word list from a file path.
638 ///
639 /// Only available when the `std` feature is enabled.
640 ///
641 /// # Errors
642 ///
643 /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
644 ///
645 /// # Example
646 ///
647 /// ```rust,no_run
648 /// use kham_core::Tokenizer;
649 ///
650 /// let tok = Tokenizer::builder()
651 /// .dict_file("my_words.txt")
652 /// .expect("failed to load dict")
653 /// .build();
654 /// ```
655 #[cfg(feature = "std")]
656 pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
657 extern crate std;
658 let content = std::fs::read_to_string(path)
659 .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
660 Ok(self.dict_words(&content))
661 }
662}
663
664// ---------------------------------------------------------------------------
665// Tests
666// ---------------------------------------------------------------------------
667
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizer with the built-in dictionary and default options.
    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Classic CLAUDE.md example
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_cover_input_excluding_whitespace() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
        // The input has no whitespace, so the tokens must cover all of it.
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, text);
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข้ — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้ (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    //   1. neg_unknowns — fewer unknowns is strictly better
    //   2. neg_tokens   — fewer tokens (prefer longer compounds over split components)
    //   3. dict_words   — more dictionary matches breaks token-count ties
    //   4. freq_score   — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // A path with no unknowns beats one with unknowns regardless of other fields.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_increments_dict_words_and_neg_tokens() {
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_decrements_neg_unknowns_and_neg_tokens() {
        // An unknown edge costs one unknown AND one token; the dictionary
        // counters stay untouched.
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_khao_lok() {
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }

    // ── confidence ────────────────────────────────────────────────────────────

    #[test]
    fn confidence_unknown_token_is_zero() {
        // Garbage Thai that is NOT in the dict. Every Unknown token that comes
        // out (not just the first one) must carry confidence 0.0.
        let tokens = tok().segment("กขคงจฉ");
        for u in tokens.iter().filter(|t| t.kind == TokenKind::Unknown) {
            assert_eq!(
                u.confidence, 0.0,
                "Unknown token {:?} must have confidence 0.0",
                u.text
            );
        }
    }

    #[test]
    fn confidence_dict_word_is_positive() {
        // กิน, ข้าว, ปลา are all in the dict and should have confidence > 0.0
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            if t.kind == TokenKind::Thai {
                assert!(
                    t.confidence > 0.0,
                    "dict Thai token {:?} must have confidence > 0",
                    t.text
                );
            }
        }
    }

    #[test]
    fn confidence_non_thai_tokens_are_1() {
        // Latin, Number, Emoji tokens always have confidence 1.0
        let tokens = tok().segment("hello 123 😀");
        for t in &tokens {
            assert_eq!(
                t.confidence, 1.0,
                "non-Thai token {:?} must have confidence 1.0",
                t.text
            );
        }
    }

    #[test]
    fn confidence_range_valid() {
        // Confidence must always be in [0.0, 1.0]
        let texts = &["กินข้าวกับปลา", "สวัสดีครับ", "hello กรุงเทพ 2024 😀", "กขคง"];
        for text in texts {
            for t in tok().segment(text) {
                assert!(
                    (0.0..=1.0).contains(&t.confidence),
                    "token {:?} confidence {} out of range",
                    t.text,
                    t.confidence
                );
            }
        }
    }

    // ── TokenStream ───────────────────────────────────────────────────────────

    #[test]
    fn segment_stream_yields_same_as_segment() {
        let t = tok();
        let text = "กินข้าวกับปลา";
        let direct: alloc::vec::Vec<_> = t.segment(text);
        let streamed: alloc::vec::Vec<_> = t.segment_stream(text).collect();
        assert_eq!(direct.len(), streamed.len());
        for (a, b) in direct.iter().zip(streamed.iter()) {
            assert_eq!(a.text, b.text);
            assert_eq!(a.kind, b.kind);
            assert_eq!(a.span, b.span);
        }
    }

    #[test]
    fn next_word_skips_whitespace() {
        let t = Tokenizer::builder().keep_whitespace(true).build();
        let text = "กิน ข้าว ปลา";
        let expected = t
            .segment(text)
            .iter()
            .filter(|tok| tok.kind != TokenKind::Whitespace)
            .count();
        let mut seen = 0;
        let mut stream = t.segment_stream(text);
        while let Some(tok) = stream.next_word() {
            assert_ne!(
                tok.kind,
                TokenKind::Whitespace,
                "next_word() must not return a whitespace token"
            );
            seen += 1;
        }
        // Guards against the loop passing vacuously or dropping real words.
        assert_eq!(seen, expected);
    }

    #[test]
    fn next_known_skips_unknown() {
        let t = tok();
        // Individual bare consonants unlikely to be dict words → Unknown tokens
        let text = "กขค";
        let expected = t
            .segment(text)
            .iter()
            .filter(|tok| tok.kind != TokenKind::Whitespace && tok.kind != TokenKind::Unknown)
            .count();
        let mut seen = 0;
        let mut stream = t.segment_stream(text);
        while let Some(tok) = stream.next_known() {
            assert_ne!(
                tok.kind,
                TokenKind::Unknown,
                "next_known() must not return an Unknown token"
            );
            assert_ne!(
                tok.kind,
                TokenKind::Whitespace,
                "next_known() must not return a Whitespace token"
            );
            seen += 1;
        }
        assert_eq!(seen, expected);
    }

    #[test]
    fn next_above_confidence_filters_low() {
        let t = tok();
        let text = "กินข้าวกับปลา";
        let threshold = 0.8_f32;
        let expected = t
            .segment(text)
            .iter()
            .filter(|tok| tok.confidence >= threshold)
            .count();
        let mut seen = 0;
        let mut stream = t.segment_stream(text);
        while let Some(tok) = stream.next_above_confidence(threshold) {
            assert!(
                tok.confidence >= threshold,
                "next_above_confidence({threshold}) returned token {:?} with confidence {}",
                tok.text,
                tok.confidence
            );
            seen += 1;
        }
        assert_eq!(seen, expected);
    }
}
1129}