kham_core/
segmenter.rs

1//! DAG-based maximal matching segmenter (newmm algorithm).
2//!
//! The segmenter builds a directed acyclic graph (DAG) of candidate words over
//! the input text using TCC boundaries as candidate split points, then finds
//! the path with the fewest unknown tokens (ties are broken by fewer tokens,
//! then more dictionary matches, then higher cumulative corpus frequency).
6//!
7//! ## Pipeline
8//!
9//! ```text
10//! raw text
11//!   │
12//!   ▼  (optional) Tokenizer::normalize()   ← fixes tone dedup + Sara Am composition
13//!   │
14//!   ▼  pre_tokenize()
15//! [Thai span] [Number span] [Latin span] …
16//!   │
17//!   ▼  (Thai spans only) tcc_boundaries()
18//! TCC boundary positions: [0, b1, b2, …, len]
19//!   │
20//!   ▼  DP over boundary indices
21//! path of (start, end) pairs that maximises dict matches
22//!   │
23//!   ▼
24//! Vec<Token<'_>>
25//! ```
26//!
27//! ## Normalization and zero-copy
28//!
29//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
30//! the `&str` you pass in. This means segment() cannot internally normalize
31//! the text (normalization may reorder/remove characters, producing a new
32//! allocation with different byte offsets).
33//!
34//! For input that may contain สระลอย in wrong order, stacked tone marks, or
35//! decomposed Sara Am, use the two-step pattern:
36//!
37//! ```rust
38//! use kham_core::Tokenizer;
39//!
40//! let tok = Tokenizer::new();
41//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
42//! let tokens = tok.segment(&normalized);       // tokens borrow `normalized`
43//! ```
44
45use alloc::vec;
46use alloc::vec::Vec;
47
48use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
49use crate::error::KhamError;
50use crate::freq::FreqMap;
51use crate::normalizer;
52use crate::pre_tokenizer::pre_tokenize;
53use crate::tcc::tcc_boundaries;
54use crate::token::{Token, TokenKind};
55
56/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
57///
58/// # Example
59///
60/// ```rust
61/// use kham_core::Tokenizer;
62///
63/// let tok = Tokenizer::new();
64/// let tokens = tok.segment("กินข้าวกับปลา");
65/// assert!(!tokens.is_empty());
66/// ```
67pub struct Tokenizer {
68    dict: Dict,
69    freq: FreqMap,
70    keep_whitespace: bool,
71}
72
73impl Tokenizer {
74    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
75    pub fn new() -> Self {
76        Self {
77            dict: builtin_dict(),
78            freq: FreqMap::builtin(),
79            keep_whitespace: false,
80        }
81    }
82
83    /// Normalise Thai text into canonical form.
84    ///
85    /// This is a convenience wrapper around [`normalizer::normalize`].
86    /// Because [`segment`] is zero-copy, normalization must happen **before**
87    /// segmentation. The caller owns the returned [`alloc::string::String`] and can then
88    /// borrow it for [`segment`]:
89    ///
90    /// ```rust
91    /// use kham_core::Tokenizer;
92    ///
93    /// let tok = Tokenizer::new();
94    /// // Input with a doubled tone mark and decomposed Sara Am
95    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
96    /// let normalized = tok.normalize(raw); // น้ำ composed, no dedup needed here
97    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
98    /// assert!(!tokens.is_empty());
99    /// ```
100    ///
101    /// [`segment`]: Tokenizer::segment
102    pub fn normalize(&self, text: &str) -> alloc::string::String {
103        normalizer::normalize(text)
104    }
105
106    /// Return a [`TokenizerBuilder`] for custom configuration.
107    ///
108    /// # Example
109    ///
110    /// ```rust
111    /// use kham_core::Tokenizer;
112    ///
113    /// // Use built-in dict (no extra words needed here)
114    /// let tok = Tokenizer::builder().build();
115    /// let tokens = tok.segment("สวัสดีชาวโลก");
116    /// assert!(!tokens.is_empty());
117    /// ```
118    pub fn builder() -> TokenizerBuilder {
119        TokenizerBuilder::default()
120    }
121
122    /// Segment `text` into tokens.
123    ///
124    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
125    /// zero-copy sub-slice of `text`.
126    ///
127    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
128    /// through unchanged. Thai spans are segmented with the newmm DAG
129    /// algorithm constrained to TCC boundaries.
130    ///
131    /// # Examples
132    ///
133    /// ```rust
134    /// use kham_core::{Tokenizer, TokenKind};
135    ///
136    /// let tok = Tokenizer::new();
137    /// // Mixed Thai + number + Thai — number token lands at index 1
138    /// let tokens = tok.segment("ธนาคาร100แห่ง");
139    /// assert_eq!(tokens[1].text, "100");
140    /// assert_eq!(tokens[1].kind, TokenKind::Number);
141    /// ```
142    ///
143    /// Joining all token texts reconstructs the original string (whitespace
144    /// is dropped by default, so the joined result omits whitespace):
145    ///
146    /// ```rust
147    /// use kham_core::Tokenizer;
148    ///
149    /// let tok = Tokenizer::new();
150    /// let text = "กินข้าวกับปลา";
151    /// let tokens = tok.segment(text);
152    /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
153    /// assert_eq!(rebuilt, text);
154    /// ```
155    ///
156    /// Every token carries both byte and char offsets into the original string:
157    ///
158    /// ```rust
159    /// use kham_core::Tokenizer;
160    ///
161    /// let tok = Tokenizer::new();
162    /// let text = "ธนาคาร100แห่ง";
163    /// let tokens = tok.segment(text);
164    /// for t in &tokens {
165    ///     // Byte span: valid UTF-8 slice
166    ///     assert_eq!(&text[t.span.clone()], t.text);
167    ///     // Char span: length matches Unicode scalar count
168    ///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
169    /// }
170    /// ```
171    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
172        if text.is_empty() {
173            return Vec::new();
174        }
175
176        // Split into script-homogeneous spans. Non-Thai spans pass through;
177        // Thai spans go through the newmm DAG segmenter.
178        // Call normalize() first if the input may contain สระลอย in wrong
179        // order, stacked tone marks, or decomposed Sara Am.
180        let pre_tokens = pre_tokenize(text);
181
182        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);
183
184        for token in pre_tokens {
185            match token.kind {
186                TokenKind::Thai => {
187                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
188                }
189                TokenKind::Whitespace if !self.keep_whitespace => {
190                    // Discard whitespace tokens unless keep_whitespace is set.
191                }
192                _ => {
193                    result.push(token);
194                }
195            }
196        }
197
198        result
199    }
200
201    /// Segment `text` and return a [`TokenStream`] for incremental consumption.
202    ///
203    /// # Example
204    ///
205    /// ```rust
206    /// use kham_core::Tokenizer;
207    ///
208    /// let tok = Tokenizer::new();
209    /// let mut stream = tok.segment_stream("กินข้าวกับปลา");
210    /// while let Some(t) = stream.next_word() {
211    ///     println!("{}", t.text);
212    /// }
213    /// ```
214    pub fn segment_stream<'t>(&self, text: &'t str) -> TokenStream<'t> {
215        TokenStream {
216            inner: self.segment(text).into_iter(),
217        }
218    }
219}
220
221// ---------------------------------------------------------------------------
222// TokenStream
223// ---------------------------------------------------------------------------
224
225/// A streaming iterator over [`Token`]s produced by [`Tokenizer::segment_stream`].
226///
227/// Wraps the full `Vec<Token>` as an [`alloc::vec::IntoIter`]; the streaming
228/// API lets callers consume tokens one at a time and filter by kind or
229/// confidence without allocating a second collection.
230///
231/// # Example
232///
233/// ```rust
234/// use kham_core::Tokenizer;
235///
236/// let tok = Tokenizer::builder().keep_whitespace(true).build();
237/// let mut stream = tok.segment_stream("กิน ข้าว");
238/// // next_word() skips whitespace tokens.
239/// while let Some(t) = stream.next_word() {
240///     println!("{}", t.text);
241/// }
242/// ```
243pub struct TokenStream<'t> {
244    inner: alloc::vec::IntoIter<Token<'t>>,
245}
246
247impl<'t> TokenStream<'t> {
248    /// Advance past [`TokenKind::Whitespace`] tokens and return the next
249    /// non-whitespace token, or `None` when the stream is exhausted.
250    pub fn next_word(&mut self) -> Option<Token<'t>> {
251        self.inner
252            .by_ref()
253            .find(|t| t.kind != TokenKind::Whitespace)
254    }
255
256    /// Advance past whitespace and [`TokenKind::Unknown`] tokens and return
257    /// the next token whose kind is neither, or `None` when exhausted.
258    pub fn next_known(&mut self) -> Option<Token<'t>> {
259        self.inner
260            .by_ref()
261            .find(|t| t.kind != TokenKind::Whitespace && t.kind != TokenKind::Unknown)
262    }
263
264    /// Advance past tokens with `confidence < min` and return the next
265    /// qualifying token, or `None` when the stream is exhausted.
266    pub fn next_above_confidence(&mut self, min: f32) -> Option<Token<'t>> {
267        self.inner.by_ref().find(|t| t.confidence >= min)
268    }
269}
270
271impl<'t> Iterator for TokenStream<'t> {
272    type Item = Token<'t>;
273
274    #[inline]
275    fn next(&mut self) -> Option<Token<'t>> {
276        self.inner.next()
277    }
278
279    #[inline]
280    fn size_hint(&self) -> (usize, Option<usize>) {
281        self.inner.size_hint()
282    }
283}
284
285// ---------------------------------------------------------------------------
286// newmm DAG segmentation — Thai spans only
287// ---------------------------------------------------------------------------
288
/// Score attached to a TCC boundary during the forward DP.
///
/// The derived `Ord` compares fields in declaration order, which encodes the
/// newmm preference directly:
/// 1. Fewer unknown tokens (`neg_unknowns` closer to zero is greater).
/// 2. Fewer tokens overall (longer compounds beat their split components).
/// 3. More dictionary matches.
/// 4. Higher cumulative TNC frequency, as the last tiebreaker.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    /// Negated count of unknown tokens on the path.
    neg_unknowns: i32,
    /// Negated count of all tokens on the path.
    neg_tokens: i32,
    /// Number of dictionary-matched tokens on the path.
    dict_words: i32,
    /// Sum of the frequencies of the dictionary-matched tokens.
    freq_score: u64,
}

impl DpScore {
    /// Score of the empty path at the first boundary.
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    /// Score after extending the path with a dictionary word of frequency
    /// `freq`: one more token, one more dictionary match, `freq` added to the
    /// running frequency total.
    fn dict_edge(mut self, freq: u32) -> Self {
        self.neg_tokens -= 1;
        self.dict_words += 1;
        self.freq_score += u64::from(freq);
        self
    }

    /// Score after extending the path with an unknown token: one more token
    /// and one more unknown; dictionary count and frequency are untouched.
    fn unknown_edge(mut self) -> Self {
        self.neg_unknowns -= 1;
        self.neg_tokens -= 1;
        self
    }
}
329
/// Result of the forward DP pass: four parallel vectors indexed by TCC
/// boundary index.
struct DpTable {
    /// `from[i]` is the boundary index the winning edge into `i` started at;
    /// followed backwards to recover the path.
    from: Vec<usize>,
    /// `is_dict[i]` is `true` when the winning edge into `i` was a dictionary
    /// match.
    is_dict: Vec<bool>,
    /// `edge_freq[i]` is the TNC frequency of the winning dictionary edge
    /// into `i`; `0` for unknown edges and for dictionary words with zero
    /// corpus frequency.
    edge_freq: Vec<u32>,
    /// `competing[i]` counts the edges (dictionary plus unknown fallback)
    /// considered for arriving at `i`, saturating at 255.
    competing: Vec<u8>,
}
343
344/// Forward DP over TCC boundary indices for a single Thai slice.
345///
346/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
347fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
348    let nb = bounds.len();
349    let mut best: Vec<Option<DpScore>> = vec![None; nb];
350    let mut from = vec![0usize; nb];
351    let mut is_dict = vec![false; nb];
352    let mut edge_freq = vec![0u32; nb];
353    let mut competing = vec![0u8; nb];
354
355    best[0] = Some(DpScore::ZERO);
356
357    for i in 0..nb - 1 {
358        let score = match best[i] {
359            Some(s) => s,
360            None => continue,
361        };
362        let pos = bounds[i];
363        let remaining = &slice[pos..];
364
365        // Dictionary edges — all prefixes, not just the longest, so the DP
366        // can make a globally optimal choice rather than a greedy one.
367        for prefix in dict.prefixes(remaining) {
368            let end_pos = pos + prefix.len();
369            if let Ok(j) = bounds.binary_search(&end_pos) {
370                // Count every dict edge considered at this boundary.
371                competing[j] = competing[j].saturating_add(1);
372                let freq = freqs.get(prefix);
373                let candidate = Some(score.dict_edge(freq));
374                if candidate > best[j] {
375                    best[j] = candidate;
376                    from[j] = i;
377                    is_dict[j] = true;
378                    edge_freq[j] = freq;
379                }
380            }
381        }
382
383        // Fallback edge: advance one TCC as an unknown token.
384        let j = i + 1;
385        // Count the unknown fallback edge as a competing edge too.
386        competing[j] = competing[j].saturating_add(1);
387        let candidate = Some(score.unknown_edge());
388        if candidate > best[j] {
389            best[j] = candidate;
390            from[j] = i;
391            is_dict[j] = false;
392            edge_freq[j] = 0;
393        }
394    }
395
396    DpTable {
397        from,
398        is_dict,
399        edge_freq,
400        competing,
401    }
402}
403
/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to `0`, then reversing.
///
/// `from[i]` is the predecessor of boundary `i`. The returned path starts at
/// `0` and ends at `from.len() - 1`. An empty `from` yields an empty path
/// instead of underflowing on `from.len() - 1`.
///
/// The walk only terminates if the predecessor chain from the last index
/// eventually reaches `0`.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    // Guard the empty table: there is no last boundary to start from.
    let last = match from.len().checked_sub(1) {
        Some(last) => last,
        None => return Vec::new(),
    };

    let mut path = Vec::with_capacity(from.len());
    let mut cur = last;
    loop {
        path.push(cur);
        if cur == 0 {
            break;
        }
        cur = from[cur];
    }
    // Collected end-to-start; callers want start-to-end.
    path.reverse();
    path
}
420
/// Confidence assigned to a token, derived from the edge that won at its end
/// boundary.
///
/// - `is_dict`: `true` when the winning edge was a dictionary match.
/// - `freq`: TNC corpus frequency of that edge (`0` for unknown edges and for
///   dictionary words missing from the frequency table).
/// - `competing`: number of edges (dictionary plus unknown fallback) that
///   were considered for this boundary.
///
/// The result lies in `[0.0, 1.0]`:
/// - unknown token → `0.0`
/// - dictionary match with zero frequency → base `0.7`
/// - dictionary match with nonzero frequency → base `1.0`
/// - the base is multiplied by an ambiguity factor: up to 1 edge → ×1.0,
///   2 → ×0.9, 3 → ×0.8, 4 or more → ×0.7
fn compute_confidence(is_dict: bool, freq: u32, competing: u8) -> f32 {
    if is_dict {
        let base: f32 = if freq == 0 { 0.7 } else { 1.0 };
        let ambiguity: f32 = match competing {
            0..=1 => 1.0,
            2 => 0.9,
            3 => 0.8,
            _ => 0.7,
        };
        base * ambiguity
    } else {
        0.0
    }
}
447
448/// Segment a single Thai span using the newmm DAG algorithm and append tokens
449/// to `out`.
450///
451/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
452fn segment_thai<'t>(
453    dict: &Dict,
454    freqs: &FreqMap,
455    text: &'t str,
456    span: core::ops::Range<usize>,
457    out: &mut Vec<Token<'t>>,
458) {
459    let slice = &text[span.start..span.end];
460    let bounds = tcc_boundaries(slice);
461
462    if bounds.len() <= 1 {
463        return;
464    }
465
466    let dp = forward_dp(dict, freqs, slice, &bounds);
467    let path = backtrack_path(&dp.from);
468
469    // Char offset of span.start — computed once, then incremented per token.
470    let mut char_cursor = text[..span.start].chars().count();
471
472    for w in path.windows(2) {
473        let start_byte = span.start + bounds[w[0]];
474        let end_byte = span.start + bounds[w[1]];
475        let token_text = &text[start_byte..end_byte];
476        let char_start = char_cursor;
477        char_cursor += token_text.chars().count();
478        let kind = if dp.is_dict[w[1]] {
479            TokenKind::Thai
480        } else {
481            TokenKind::Unknown
482        };
483        let confidence =
484            compute_confidence(dp.is_dict[w[1]], dp.edge_freq[w[1]], dp.competing[w[1]]);
485        out.push(Token::new(
486            token_text,
487            start_byte..end_byte,
488            char_start..char_cursor,
489            kind,
490            confidence,
491        ));
492    }
493}
494
495// ---------------------------------------------------------------------------
496// Tokenizer trait impls
497// ---------------------------------------------------------------------------
498
499impl Default for Tokenizer {
500    fn default() -> Self {
501        Self::new()
502    }
503}
504
505// ---------------------------------------------------------------------------
506// TokenizerBuilder
507// ---------------------------------------------------------------------------
508
/// Step-by-step configuration for a [`Tokenizer`].
///
/// Obtain one from [`Tokenizer::builder`], chain the setters, and finish with
/// `build()`.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    /// Extra newline-separated words set via `dict_words`; when present,
    /// `build()` compiles a new dictionary from the built-in list plus these.
    dict_words: Option<alloc::string::String>,
    /// Extra newline-separated words set via `dict_merge`; used by `build()`
    /// as an overlay on the built-in dictionary when `dict_words` is unset.
    dict_merge: Option<alloc::string::String>,
    /// Whether whitespace tokens are kept in the output (default `false`).
    keep_whitespace: bool,
}
526
527impl TokenizerBuilder {
528    /// Load an additional word list from a string (newline-separated words).
529    ///
530    /// Words are merged with the built-in dictionary.
531    ///
532    /// # Example
533    ///
534    /// ```rust
535    /// use kham_core::{Tokenizer, TokenKind};
536    ///
537    /// let tok = Tokenizer::builder()
538    ///     .dict_words("ปัญญาประดิษฐ์\n")
539    ///     .build();
540    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
541    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
542    /// ```
543    pub fn dict_words(mut self, words: &str) -> Self {
544        self.dict_words = Some(alloc::string::String::from(words));
545        self
546    }
547
548    /// Configure whether whitespace tokens are included in the output.
549    ///
550    /// Default: `false` (whitespace is discarded).
551    ///
552    /// # Example
553    ///
554    /// ```rust
555    /// use kham_core::{Tokenizer, TokenKind};
556    ///
557    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
558    /// let tokens = tok.segment("กิน ข้าว");
559    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
560    /// // Byte spans are contiguous when whitespace is kept
561    /// for w in tokens.windows(2) {
562    ///     assert_eq!(w[0].span.end, w[1].span.start);
563    /// }
564    /// ```
565    /// Add extra words via a lightweight overlay — no trie rebuild.
566    ///
567    /// Words are stored in a sorted list alongside the pre-compiled trie.
568    /// This is O(k log k) in the number of custom words and avoids the O(N)
569    /// full trie rebuild that [`dict_words`](Self::dict_words) performs.
570    ///
571    /// Prefer `dict_merge` over `dict_words` when adding a small custom
572    /// vocabulary (e.g. domain-specific terms, product names).
573    ///
574    /// If both `dict_merge` and `dict_words` are called, `dict_words` takes
575    /// precedence (it performs a full rebuild that subsumes any overlay).
576    ///
577    /// # Example
578    ///
579    /// ```rust
580    /// use kham_core::{Tokenizer, TokenKind};
581    ///
582    /// let tok = Tokenizer::builder()
583    ///     .dict_merge("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n")
584    ///     .build();
585    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
586    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
587    /// ```
588    pub fn dict_merge(mut self, words: &str) -> Self {
589        self.dict_merge = Some(alloc::string::String::from(words));
590        self
591    }
592
593    /// Configure whether whitespace tokens are included in the output.
594    ///
595    /// Default: `false` (whitespace is discarded).
596    ///
597    /// # Example
598    ///
599    /// ```rust
600    /// use kham_core::{Tokenizer, TokenKind};
601    ///
602    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
603    /// let tokens = tok.segment("กิน ข้าว");
604    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
605    /// // Byte spans are contiguous when whitespace is kept
606    /// for w in tokens.windows(2) {
607    ///     assert_eq!(w[0].span.end, w[1].span.start);
608    /// }
609    /// ```
610    pub fn keep_whitespace(mut self, keep: bool) -> Self {
611        self.keep_whitespace = keep;
612        self
613    }
614
615    /// Consume the builder and return a configured [`Tokenizer`].
616    pub fn build(self) -> Tokenizer {
617        let dict = if let Some(extra) = &self.dict_words {
618            // Full rebuild path: merges BUILTIN_WORDS + custom words into a new trie.
619            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
620            combined.push('\n');
621            combined.push_str(extra);
622            Dict::from_word_list(&combined)
623        } else if let Some(overlay) = &self.dict_merge {
624            // Fast overlay path: load pre-compiled binary, attach small sorted list.
625            builtin_dict().with_overlay(overlay)
626        } else {
627            // Default path: load from pre-compiled binary — O(S) copy.
628            builtin_dict()
629        };
630        Tokenizer {
631            dict,
632            freq: FreqMap::builtin(),
633            keep_whitespace: self.keep_whitespace,
634        }
635    }
636
637    /// Try to load a custom word list from a file path.
638    ///
639    /// Only available when the `std` feature is enabled.
640    ///
641    /// # Errors
642    ///
643    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
644    ///
645    /// # Example
646    ///
647    /// ```rust,no_run
648    /// use kham_core::Tokenizer;
649    ///
650    /// let tok = Tokenizer::builder()
651    ///     .dict_file("my_words.txt")
652    ///     .expect("failed to load dict")
653    ///     .build();
654    /// ```
655    #[cfg(feature = "std")]
656    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
657        extern crate std;
658        let content = std::fs::read_to_string(path)
659            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
660        Ok(self.dict_words(&content))
661    }
662}
663
664// ---------------------------------------------------------------------------
665// Tests
666// ---------------------------------------------------------------------------
667
668#[cfg(test)]
669mod tests {
670    use super::*;
671
672    fn tok() -> Tokenizer {
673        Tokenizer::new()
674    }
675
676    // ── basic smoke tests ────────────────────────────────────────────────────
677
678    #[test]
679    fn empty_input() {
680        assert!(tok().segment("").is_empty());
681    }
682
683    #[test]
684    fn pure_latin_passthrough() {
685        let tokens = tok().segment("hello");
686        assert_eq!(tokens.len(), 1);
687        assert_eq!(tokens[0].text, "hello");
688        assert_eq!(tokens[0].kind, TokenKind::Latin);
689    }
690
691    #[test]
692    fn pure_number_passthrough() {
693        let tokens = tok().segment("12345");
694        assert_eq!(tokens.len(), 1);
695        assert_eq!(tokens[0].text, "12345");
696        assert_eq!(tokens[0].kind, TokenKind::Number);
697    }
698
699    #[test]
700    fn whitespace_dropped_by_default() {
701        let tokens = tok().segment("กิน ข้าว");
702        for t in &tokens {
703            assert_ne!(t.kind, TokenKind::Whitespace);
704        }
705    }
706
707    #[test]
708    fn whitespace_kept_when_requested() {
709        let tokens = Tokenizer::builder()
710            .keep_whitespace(true)
711            .build()
712            .segment("กิน ข้าว");
713        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
714    }
715
716    // ── Thai segmentation ────────────────────────────────────────────────────
717
718    #[test]
719    fn gin_khao_gap_pla() {
720        // "กินข้าวกับปลา" — all words must be in the built-in dict
721        let tokens = tok().segment("กินข้าวกับปลา");
722        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
723        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
724        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
725        // Reconstructing must yield the original string
726        assert_eq!(words.join(""), "กินข้าวกับปลา");
727    }
728
729    #[test]
730    fn mixed_thai_number_thai() {
731        // Classic CLAUDE.md example
732        let tokens = tok().segment("ธนาคาร100แห่ง");
733        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
734        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
735        // "100" must survive as a Number token
736        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
737        assert!(num.is_some());
738        assert_eq!(num.unwrap().text, "100");
739    }
740
741    #[test]
742    fn mixed_thai_latin() {
743        let tokens = tok().segment("สวัสดี hello");
744        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
745        // Whitespace dropped by default
746        assert_eq!(rebuilt, "สวัสดีhello");
747        assert!(tokens
748            .iter()
749            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
750    }
751
752    // ── span / byte-offset invariants ────────────────────────────────────────
753
754    #[test]
755    fn spans_cover_input_excluding_whitespace() {
756        let text = "กินข้าว123hello";
757        let tokens = tok().segment(text);
758        // Every span must be a valid UTF-8 slice of `text`.
759        for t in &tokens {
760            assert_eq!(&text[t.span.clone()], t.text);
761            assert!(text.is_char_boundary(t.span.start));
762            assert!(text.is_char_boundary(t.span.end));
763        }
764    }
765
766    #[test]
767    fn adjacent_spans_are_contiguous() {
768        let text = "กินข้าวกับปลา";
769        let tokens = Tokenizer::builder()
770            .keep_whitespace(true)
771            .build()
772            .segment(text);
773        for w in tokens.windows(2) {
774            assert_eq!(
775                w[0].span.end, w[1].span.start,
776                "gap between {:?} and {:?}",
777                w[0], w[1]
778            );
779        }
780    }
781
782    #[test]
783    fn no_empty_tokens() {
784        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
785        for t in &tokens {
786            assert!(!t.text.is_empty());
787        }
788    }
789
790    // ── custom dictionary ─────────────────────────────────────────────────────
791
792    #[test]
793    fn custom_dict_word_is_matched() {
794        // Use a nonsense word that is not in the built-in dictionary and cannot
795        // be decomposed into subwords — ensures the custom dict is actually used.
796        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
797        let tokens = tok.segment("กขคงจฉ");
798        let thai: Vec<&str> = tokens
799            .iter()
800            .filter(|t| t.kind == TokenKind::Thai)
801            .map(|t| t.text)
802            .collect();
803        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
804    }
805
806    // ── normalize then segment ────────────────────────────────────────────────
807
808    #[test]
809    fn normalize_deduplicates_tone_before_segment() {
810        // กินข้าว with a doubled tone mark on ข้ — normalize fixes it, segment proceeds.
811        let t = tok();
812        // Insert a doubled tone on ข: ข + อ้ + อ้  (ข้้)
813        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
814        let normalized = t.normalize(raw);
815        let tokens = t.segment(&normalized);
816        assert!(!tokens.is_empty());
817        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
818        assert_eq!(rebuilt, normalized);
819    }
820
821    #[test]
822    fn normalize_clean_input_is_identity() {
823        // normalize() on already-clean text should not change it.
824        let t = tok();
825        let clean = "กินข้าวกับปลา";
826        assert_eq!(t.normalize(clean), clean);
827    }
828
829    #[test]
830    fn segment_without_normalize_on_clean_input() {
831        // segment() alone is sufficient when input is already canonical.
832        let tokens = tok().segment("กินข้าวกับปลา");
833        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
834        assert_eq!(rebuilt, "กินข้าวกับปลา");
835    }
836
837    // ── DpScore ordering ──────────────────────────────────────────────────────
838    //
839    // The score is a 4-field lexicographic key:
840    //   1. neg_unknowns  — fewer unknowns is strictly better
841    //   2. neg_tokens    — fewer tokens (prefer longer compounds over split components)
842    //   3. dict_words    — more dictionary matches breaks token-count ties
843    //   4. freq_score    — higher cumulative TNC frequency as the final tiebreaker
844
845    #[test]
846    fn dp_score_fewer_unknowns_is_primary() {
847        // A path with no unknowns beats one with unknowns regardless of other fields.
848        let no_unknown = DpScore::ZERO;
849        let one_unknown = DpScore::ZERO.unknown_edge();
850        assert!(no_unknown > one_unknown);
851    }
852
853    #[test]
854    fn dp_score_fewer_tokens_beats_more_dict_words() {
855        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
856        // beats เดิน+ทาง (2 tokens, 2 matches).
857        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
858        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
859        assert!(compound > split);
860    }
861
862    #[test]
863    fn dp_score_higher_freq_breaks_token_tie() {
864        // Same unknowns and token count; higher TNC freq wins.
865        let low_freq = DpScore::ZERO.dict_edge(10);
866        let high_freq = DpScore::ZERO.dict_edge(100);
867        assert!(high_freq > low_freq);
868    }
869
870    #[test]
871    fn dp_score_fewer_tokens_beats_higher_freq() {
872        // Fewer tokens wins even when the competing path has higher TNC frequency.
873        let high_freq_more_tokens = DpScore {
874            neg_unknowns: 0,
875            neg_tokens: -2,
876            dict_words: 1,
877            freq_score: 200,
878        };
879        let low_freq_fewer_tokens = DpScore {
880            neg_unknowns: 0,
881            neg_tokens: -1,
882            dict_words: 1,
883            freq_score: 100,
884        };
885        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
886    }
887
888    #[test]
889    fn dp_score_more_dict_words_breaks_token_tie() {
890        // Same unknowns and token count; more dict matches wins.
891        let fewer_dict = DpScore {
892            neg_unknowns: 0,
893            neg_tokens: -2,
894            dict_words: 1,
895            freq_score: 0,
896        };
897        let more_dict = DpScore {
898            neg_unknowns: 0,
899            neg_tokens: -2,
900            dict_words: 2,
901            freq_score: 0,
902        };
903        assert!(more_dict > fewer_dict);
904    }
905
906    #[test]
907    fn dict_edge_accumulates_freq_score() {
908        let after_one = DpScore::ZERO.dict_edge(50);
909        let after_two = after_one.dict_edge(30);
910        assert_eq!(after_one.freq_score, 50);
911        assert_eq!(after_two.freq_score, 80);
912    }
913
914    #[test]
915    fn dict_edge_increments_dict_words_and_neg_tokens() {
916        let s = DpScore::ZERO.dict_edge(0);
917        assert_eq!(s.dict_words, 1);
918        assert_eq!(s.neg_tokens, -1);
919        assert_eq!(s.neg_unknowns, 0);
920    }
921
922    #[test]
923    fn unknown_edge_increments_neg_unknowns_only() {
924        let s = DpScore::ZERO.unknown_edge();
925        assert_eq!(s.neg_unknowns, -1);
926        assert_eq!(s.neg_tokens, -1);
927        assert_eq!(s.dict_words, 0);
928        assert_eq!(s.freq_score, 0);
929    }
930
931    #[test]
932    fn unknown_edge_does_not_contribute_freq() {
933        let s = DpScore::ZERO.unknown_edge().unknown_edge();
934        assert_eq!(s.freq_score, 0);
935    }
936
937    // ── char_span invariants ──────────────────────────────────────────────────
938
939    #[test]
940    fn char_span_len_equals_char_count() {
941        let tokens = tok().segment("กินข้าวกับปลา");
942        for t in &tokens {
943            assert_eq!(
944                t.char_span.end - t.char_span.start,
945                t.text.chars().count(),
946                "char_span length mismatch for {:?}",
947                t.text
948            );
949        }
950    }
951
952    #[test]
953    fn char_spans_are_contiguous() {
954        let tokens = Tokenizer::builder()
955            .keep_whitespace(true)
956            .build()
957            .segment("กินข้าว 100 hello");
958        for w in tokens.windows(2) {
959            assert_eq!(
960                w[0].char_span.end, w[1].char_span.start,
961                "char_span gap between {:?} and {:?}",
962                w[0].text, w[1].text
963            );
964        }
965    }
966
967    #[test]
968    fn char_span_for_mixed_script() {
969        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
970        let tokens = tok().segment("ธนาคาร100แห่ง");
971        assert_eq!(tokens[0].char_span, 0..6);
972        assert_eq!(tokens[1].char_span, 6..9);
973        assert_eq!(tokens[2].char_span, 9..13);
974    }
975
976    #[test]
977    fn char_span_accounts_for_multibyte_chars() {
978        // Each Thai codepoint is 3 bytes but 1 char.
979        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
980        let tokens = tok().segment("กิน");
981        assert_eq!(tokens[0].span, 0..9);
982        assert_eq!(tokens[0].char_span, 0..3);
983    }
984
985    #[test]
986    fn char_span_emoji_is_single_char() {
987        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
988        let tokens = tok().segment("😀");
989        assert_eq!(tokens[0].char_len(), 1);
990        assert_eq!(tokens[0].byte_len(), 4);
991    }
992
993    // ── edge cases ────────────────────────────────────────────────────────────
994
995    #[test]
996    fn single_thai_char() {
997        let tokens = tok().segment("ก");
998        assert_eq!(tokens.len(), 1);
999        assert_eq!(tokens[0].text, "ก");
1000    }
1001
1002    #[test]
1003    fn sawasdee_khao_lok() {
1004        let tokens = tok().segment("สวัสดีชาวโลก");
1005        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
1006        assert_eq!(rebuilt, "สวัสดีชาวโลก");
1007    }
1008
1009    // ── confidence ────────────────────────────────────────────────────────────
1010
1011    #[test]
1012    fn confidence_unknown_token_is_zero() {
1013        // A token not in the dict should get confidence 0.0
1014        let tokens = tok().segment("กขคงจฉ"); // garbage Thai that is NOT in the dict
1015                                              // There should be at least one Unknown token with confidence 0.0
1016        let unknown = tokens.iter().find(|t| t.kind == TokenKind::Unknown);
1017        if let Some(u) = unknown {
1018            assert_eq!(u.confidence, 0.0, "Unknown token must have confidence 0.0");
1019        }
1020    }
1021
1022    #[test]
1023    fn confidence_dict_word_is_positive() {
1024        // กิน, ข้าว, ปลา are all in the dict and should have confidence > 0.0
1025        let tokens = tok().segment("กินข้าวกับปลา");
1026        for t in &tokens {
1027            if t.kind == TokenKind::Thai {
1028                assert!(
1029                    t.confidence > 0.0,
1030                    "dict Thai token {:?} must have confidence > 0",
1031                    t.text
1032                );
1033            }
1034        }
1035    }
1036
1037    #[test]
1038    fn confidence_non_thai_tokens_are_1() {
1039        // Latin, Number, Emoji tokens always have confidence 1.0
1040        let tokens = tok().segment("hello 123 😀");
1041        for t in &tokens {
1042            assert_eq!(
1043                t.confidence, 1.0,
1044                "non-Thai token {:?} must have confidence 1.0",
1045                t.text
1046            );
1047        }
1048    }
1049
1050    #[test]
1051    fn confidence_range_valid() {
1052        // Confidence must always be in [0.0, 1.0]
1053        let texts = &["กินข้าวกับปลา", "สวัสดีครับ", "hello กรุงเทพ 2024 😀", "กขคง"];
1054        for text in texts {
1055            for t in tok().segment(text) {
1056                assert!(
1057                    (0.0..=1.0).contains(&t.confidence),
1058                    "token {:?} confidence {} out of range",
1059                    t.text,
1060                    t.confidence
1061                );
1062            }
1063        }
1064    }
1065
1066    // ── TokenStream ───────────────────────────────────────────────────────────
1067
1068    #[test]
1069    fn segment_stream_yields_same_as_segment() {
1070        let t = tok();
1071        let text = "กินข้าวกับปลา";
1072        let direct: alloc::vec::Vec<_> = t.segment(text);
1073        let streamed: alloc::vec::Vec<_> = t.segment_stream(text).collect();
1074        assert_eq!(direct.len(), streamed.len());
1075        for (a, b) in direct.iter().zip(streamed.iter()) {
1076            assert_eq!(a.text, b.text);
1077            assert_eq!(a.kind, b.kind);
1078            assert_eq!(a.span, b.span);
1079        }
1080    }
1081
1082    #[test]
1083    fn next_word_skips_whitespace() {
1084        let t = Tokenizer::builder().keep_whitespace(true).build();
1085        let mut stream = t.segment_stream("กิน ข้าว ปลา");
1086        while let Some(tok) = stream.next_word() {
1087            assert_ne!(
1088                tok.kind,
1089                TokenKind::Whitespace,
1090                "next_word() must not return a whitespace token"
1091            );
1092        }
1093    }
1094
1095    #[test]
1096    fn next_known_skips_unknown() {
1097        let t = tok();
1098        // Individual bare consonants unlikely to be dict words → Unknown tokens
1099        let mut stream = t.segment_stream("กขค");
1100        while let Some(tok) = stream.next_known() {
1101            assert_ne!(
1102                tok.kind,
1103                TokenKind::Unknown,
1104                "next_known() must not return an Unknown token"
1105            );
1106            assert_ne!(
1107                tok.kind,
1108                TokenKind::Whitespace,
1109                "next_known() must not return a Whitespace token"
1110            );
1111        }
1112    }
1113
1114    #[test]
1115    fn next_above_confidence_filters_low() {
1116        let t = tok();
1117        let text = "กินข้าวกับปลา";
1118        let threshold = 0.8_f32;
1119        let mut stream = t.segment_stream(text);
1120        while let Some(tok) = stream.next_above_confidence(threshold) {
1121            assert!(
1122                tok.confidence >= threshold,
1123                "next_above_confidence({threshold}) returned token {:?} with confidence {}",
1124                tok.text,
1125                tok.confidence
1126            );
1127        }
1128    }
1129}
kham_core/segmenter.rs

kham_core/
segmenter.rs