kham_core/
segmenter.rs

1//! DAG-based maximal matching segmenter (newmm algorithm).
2//!
3//! The segmenter builds a Directed Acyclic Word Graph (DAWG) over the input
4//! text using TCC boundaries as candidate split points, then finds the path
5//! that maximises the number of dictionary matches (fewest unknown tokens).
6//!
7//! ## Pipeline
8//!
9//! ```text
10//! raw text
11//!   │
12//!   ▼  (optional) Tokenizer::normalize()   ← fixes tone dedup + Sara Am composition
13//!   │
14//!   ▼  pre_tokenize()
15//! [Thai span] [Number span] [Latin span] …
16//!   │
17//!   ▼  (Thai spans only) tcc_boundaries()
18//! TCC boundary positions: [0, b1, b2, …, len]
19//!   │
20//!   ▼  DP over boundary indices
21//! path of (start, end) pairs that maximises dict matches
22//!   │
23//!   ▼
24//! Vec<Token<'_>>
25//! ```
26//!
27//! ## Normalization and zero-copy
28//!
29//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
30//! the `&str` you pass in. This means segment() cannot internally normalize
31//! the text (normalization may reorder/remove characters, producing a new
32//! allocation with different byte offsets).
33//!
34//! For input that may contain สระลอย in wrong order, stacked tone marks, or
35//! decomposed Sara Am, use the two-step pattern:
36//!
37//! ```rust
38//! use kham_core::Tokenizer;
39//!
40//! let tok = Tokenizer::new();
41//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
42//! let tokens = tok.segment(&normalized);       // tokens borrow `normalized`
43//! ```
44
45use alloc::vec;
46use alloc::vec::Vec;
47
48use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
49use crate::error::KhamError;
50use crate::freq::FreqMap;
51use crate::normalizer;
52use crate::pre_tokenizer::pre_tokenize;
53use crate::tcc::tcc_boundaries;
54use crate::token::{Token, TokenKind};
55
/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// Create one with [`Tokenizer::new`] for the built-in defaults, or with
/// [`Tokenizer::builder`] to customise the dictionary or whitespace handling.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    /// Dictionary queried for candidate words while segmenting Thai spans.
    dict: Dict,
    /// Word-frequency table; looked-up values feed the DP tie-breaking score.
    freq: FreqMap,
    /// When `false`, whitespace tokens are dropped from `segment` output.
    keep_whitespace: bool,
}
72
73impl Tokenizer {
74    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
75    pub fn new() -> Self {
76        Self {
77            dict: builtin_dict(),
78            freq: FreqMap::builtin(),
79            keep_whitespace: false,
80        }
81    }
82
83    /// Normalise Thai text into canonical form.
84    ///
85    /// This is a convenience wrapper around [`normalizer::normalize`].
86    /// Because [`segment`] is zero-copy, normalization must happen **before**
87    /// segmentation. The caller owns the returned [`alloc::string::String`] and can then
88    /// borrow it for [`segment`]:
89    ///
90    /// ```rust
91    /// use kham_core::Tokenizer;
92    ///
93    /// let tok = Tokenizer::new();
94    /// // Input with a doubled tone mark and decomposed Sara Am
95    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
96    /// let normalized = tok.normalize(raw); // น้ำ composed, no dedup needed here
97    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
98    /// assert!(!tokens.is_empty());
99    /// ```
100    ///
101    /// [`segment`]: Tokenizer::segment
102    pub fn normalize(&self, text: &str) -> alloc::string::String {
103        normalizer::normalize(text)
104    }
105
106    /// Return a [`TokenizerBuilder`] for custom configuration.
107    ///
108    /// # Example
109    ///
110    /// ```rust
111    /// use kham_core::Tokenizer;
112    ///
113    /// // Use built-in dict (no extra words needed here)
114    /// let tok = Tokenizer::builder().build();
115    /// let tokens = tok.segment("สวัสดีชาวโลก");
116    /// assert!(!tokens.is_empty());
117    /// ```
118    pub fn builder() -> TokenizerBuilder {
119        TokenizerBuilder::default()
120    }
121
122    /// Segment `text` into tokens.
123    ///
124    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
125    /// zero-copy sub-slice of `text`.
126    ///
127    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
128    /// through unchanged. Thai spans are segmented with the newmm DAG
129    /// algorithm constrained to TCC boundaries.
130    ///
131    /// # Examples
132    ///
133    /// ```rust
134    /// use kham_core::{Tokenizer, TokenKind};
135    ///
136    /// let tok = Tokenizer::new();
137    /// // Mixed Thai + number + Thai — number token lands at index 1
138    /// let tokens = tok.segment("ธนาคาร100แห่ง");
139    /// assert_eq!(tokens[1].text, "100");
140    /// assert_eq!(tokens[1].kind, TokenKind::Number);
141    /// ```
142    ///
143    /// Joining all token texts reconstructs the original string (whitespace
144    /// is dropped by default, so the joined result omits whitespace):
145    ///
146    /// ```rust
147    /// use kham_core::Tokenizer;
148    ///
149    /// let tok = Tokenizer::new();
150    /// let text = "กินข้าวกับปลา";
151    /// let tokens = tok.segment(text);
152    /// let rebuilt: String = tokens.iter().map(|t| t.text).collect();
153    /// assert_eq!(rebuilt, text);
154    /// ```
155    ///
156    /// Every token carries both byte and char offsets into the original string:
157    ///
158    /// ```rust
159    /// use kham_core::Tokenizer;
160    ///
161    /// let tok = Tokenizer::new();
162    /// let text = "ธนาคาร100แห่ง";
163    /// let tokens = tok.segment(text);
164    /// for t in &tokens {
165    ///     // Byte span: valid UTF-8 slice
166    ///     assert_eq!(&text[t.span.clone()], t.text);
167    ///     // Char span: length matches Unicode scalar count
168    ///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
169    /// }
170    /// ```
171    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
172        if text.is_empty() {
173            return Vec::new();
174        }
175
176        // Split into script-homogeneous spans. Non-Thai spans pass through;
177        // Thai spans go through the newmm DAG segmenter.
178        // Call normalize() first if the input may contain สระลอย in wrong
179        // order, stacked tone marks, or decomposed Sara Am.
180        let pre_tokens = pre_tokenize(text);
181
182        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);
183
184        for token in pre_tokens {
185            match token.kind {
186                TokenKind::Thai => {
187                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
188                }
189                TokenKind::Whitespace if !self.keep_whitespace => {
190                    // Discard whitespace tokens unless keep_whitespace is set.
191                }
192                _ => {
193                    result.push(token);
194                }
195            }
196        }
197
198        result
199    }
200}
201
202// ---------------------------------------------------------------------------
203// newmm DAG segmentation — Thai spans only
204// ---------------------------------------------------------------------------
205
/// Score attached to a TCC boundary during the DP, compared lexicographically.
///
/// The derived `Ord` compares fields in declaration order, which encodes the
/// newmm preference:
/// 1. Fewest unknown tokens (`neg_unknowns` closer to zero is greater).
/// 2. Fewest tokens overall (longer compounds beat their split components).
/// 3. Most dictionary matches.
/// 4. Highest cumulative TNC frequency, as the last tiebreaker.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    /// Score of the empty path (no tokens emitted yet).
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    /// Score after appending one dictionary word with frequency `freq`:
    /// one more token, one more dictionary match, `freq` added to the total.
    fn dict_edge(self, freq: u32) -> Self {
        let mut next = self;
        next.neg_tokens -= 1;
        next.dict_words += 1;
        next.freq_score += u64::from(freq);
        next
    }

    /// Score after appending one unknown token: one more unknown and one
    /// more token; dictionary count and frequency are left as they were.
    fn unknown_edge(self) -> Self {
        let mut next = self;
        next.neg_unknowns -= 1;
        next.neg_tokens -= 1;
        next
    }
}
246
/// Output of the forward DP pass.
///
/// Both vectors have one slot per TCC boundary index, i.e. one per entry of
/// the `bounds` slice given to `forward_dp`.
struct DpTable {
    /// Predecessor boundary index for backtracking.
    ///
    /// Every slot starts at `0` and is overwritten each time a strictly
    /// better-scoring edge into that boundary is found.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match.
    ///
    /// `false` when the best incoming edge is the one-TCC fallback (unknown
    /// token), and for slots that were never updated.
    is_dict: Vec<bool>,
}
254
255/// Forward DP over TCC boundary indices for a single Thai slice.
256///
257/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
258fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
259    let nb = bounds.len();
260    let mut best: Vec<Option<DpScore>> = vec![None; nb];
261    let mut from = vec![0usize; nb];
262    let mut is_dict = vec![false; nb];
263
264    best[0] = Some(DpScore::ZERO);
265
266    for i in 0..nb - 1 {
267        let score = match best[i] {
268            Some(s) => s,
269            None => continue,
270        };
271        let pos = bounds[i];
272        let remaining = &slice[pos..];
273
274        // Dictionary edges — all prefixes, not just the longest, so the DP
275        // can make a globally optimal choice rather than a greedy one.
276        for prefix in dict.prefixes(remaining) {
277            let end_pos = pos + prefix.len();
278            if let Ok(j) = bounds.binary_search(&end_pos) {
279                let freq = freqs.get(prefix);
280                let candidate = Some(score.dict_edge(freq));
281                if candidate > best[j] {
282                    best[j] = candidate;
283                    from[j] = i;
284                    is_dict[j] = true;
285                }
286            }
287        }
288
289        // Fallback edge: advance one TCC as an unknown token.
290        let j = i + 1;
291        let candidate = Some(score.unknown_edge());
292        if candidate > best[j] {
293            best[j] = candidate;
294            from[j] = i;
295            is_dict[j] = false;
296        }
297    }
298
299    DpTable { from, is_dict }
300}
301
/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
///
/// `from[i]` is the predecessor of boundary index `i`. The returned path
/// starts at `0` and ends at `from.len() - 1`. An empty `from` yields an
/// empty path instead of underflowing on `len() - 1`.
///
/// Every predecessor is expected to be strictly smaller than its node;
/// this is checked in debug builds, since a violation would otherwise make
/// the walk loop forever.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let mut cur = match from.len().checked_sub(1) {
        Some(last) => last,
        // No boundaries at all: nothing to walk.
        None => return Vec::new(),
    };

    let mut path = Vec::with_capacity(from.len());
    path.push(cur);
    while cur != 0 {
        let prev = from[cur];
        debug_assert!(prev < cur, "predecessor index must precede its node");
        cur = prev;
        path.push(cur);
    }
    path.reverse();
    path
}
318
319/// Segment a single Thai span using the newmm DAG algorithm and append tokens
320/// to `out`.
321///
322/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
323fn segment_thai<'t>(
324    dict: &Dict,
325    freqs: &FreqMap,
326    text: &'t str,
327    span: core::ops::Range<usize>,
328    out: &mut Vec<Token<'t>>,
329) {
330    let slice = &text[span.start..span.end];
331    let bounds = tcc_boundaries(slice);
332
333    if bounds.len() <= 1 {
334        return;
335    }
336
337    let dp = forward_dp(dict, freqs, slice, &bounds);
338    let path = backtrack_path(&dp.from);
339
340    // Char offset of span.start — computed once, then incremented per token.
341    let mut char_cursor = text[..span.start].chars().count();
342
343    for w in path.windows(2) {
344        let start_byte = span.start + bounds[w[0]];
345        let end_byte = span.start + bounds[w[1]];
346        let token_text = &text[start_byte..end_byte];
347        let char_start = char_cursor;
348        char_cursor += token_text.chars().count();
349        let kind = if dp.is_dict[w[1]] {
350            TokenKind::Thai
351        } else {
352            TokenKind::Unknown
353        };
354        out.push(Token::new(
355            token_text,
356            start_byte..end_byte,
357            char_start..char_cursor,
358            kind,
359        ));
360    }
361}
362
363// ---------------------------------------------------------------------------
364// Tokenizer trait impls
365// ---------------------------------------------------------------------------
366
367impl Default for Tokenizer {
368    fn default() -> Self {
369        Self::new()
370    }
371}
372
373// ---------------------------------------------------------------------------
374// TokenizerBuilder
375// ---------------------------------------------------------------------------
376
/// Builder for [`Tokenizer`].
///
/// Obtain one from [`Tokenizer::builder`], chain the configuration methods,
/// then call `build()` to get the configured [`Tokenizer`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    /// Word list most recently passed to `dict_words` (or loaded by
    /// `dict_file`); `None` if never set.
    dict_words: Option<alloc::string::String>,
    /// Word list most recently passed to `dict_merge`; `None` if never set.
    dict_merge: Option<alloc::string::String>,
    /// Whether the built tokenizer keeps whitespace tokens. Defaults to `false`.
    keep_whitespace: bool,
}
394
395impl TokenizerBuilder {
396    /// Load an additional word list from a string (newline-separated words).
397    ///
398    /// Words are merged with the built-in dictionary.
399    ///
400    /// # Example
401    ///
402    /// ```rust
403    /// use kham_core::{Tokenizer, TokenKind};
404    ///
405    /// let tok = Tokenizer::builder()
406    ///     .dict_words("ปัญญาประดิษฐ์\n")
407    ///     .build();
408    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
409    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
410    /// ```
411    pub fn dict_words(mut self, words: &str) -> Self {
412        self.dict_words = Some(alloc::string::String::from(words));
413        self
414    }
415
416    /// Configure whether whitespace tokens are included in the output.
417    ///
418    /// Default: `false` (whitespace is discarded).
419    ///
420    /// # Example
421    ///
422    /// ```rust
423    /// use kham_core::{Tokenizer, TokenKind};
424    ///
425    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
426    /// let tokens = tok.segment("กิน ข้าว");
427    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
428    /// // Byte spans are contiguous when whitespace is kept
429    /// for w in tokens.windows(2) {
430    ///     assert_eq!(w[0].span.end, w[1].span.start);
431    /// }
432    /// ```
433    /// Add extra words via a lightweight overlay — no trie rebuild.
434    ///
435    /// Words are stored in a sorted list alongside the pre-compiled trie.
436    /// This is O(k log k) in the number of custom words and avoids the O(N)
437    /// full trie rebuild that [`dict_words`](Self::dict_words) performs.
438    ///
439    /// Prefer `dict_merge` over `dict_words` when adding a small custom
440    /// vocabulary (e.g. domain-specific terms, product names).
441    ///
442    /// If both `dict_merge` and `dict_words` are called, `dict_words` takes
443    /// precedence (it performs a full rebuild that subsumes any overlay).
444    ///
445    /// # Example
446    ///
447    /// ```rust
448    /// use kham_core::{Tokenizer, TokenKind};
449    ///
450    /// let tok = Tokenizer::builder()
451    ///     .dict_merge("ปัญญาประดิษฐ์\nโปรแกรมเมอร์\n")
452    ///     .build();
453    /// let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
454    /// assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์" && t.kind == TokenKind::Thai));
455    /// ```
456    pub fn dict_merge(mut self, words: &str) -> Self {
457        self.dict_merge = Some(alloc::string::String::from(words));
458        self
459    }
460
461    /// Configure whether whitespace tokens are included in the output.
462    ///
463    /// Default: `false` (whitespace is discarded).
464    ///
465    /// # Example
466    ///
467    /// ```rust
468    /// use kham_core::{Tokenizer, TokenKind};
469    ///
470    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
471    /// let tokens = tok.segment("กิน ข้าว");
472    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
473    /// // Byte spans are contiguous when whitespace is kept
474    /// for w in tokens.windows(2) {
475    ///     assert_eq!(w[0].span.end, w[1].span.start);
476    /// }
477    /// ```
478    pub fn keep_whitespace(mut self, keep: bool) -> Self {
479        self.keep_whitespace = keep;
480        self
481    }
482
483    /// Consume the builder and return a configured [`Tokenizer`].
484    pub fn build(self) -> Tokenizer {
485        let dict = if let Some(extra) = &self.dict_words {
486            // Full rebuild path: merges BUILTIN_WORDS + custom words into a new trie.
487            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
488            combined.push('\n');
489            combined.push_str(extra);
490            Dict::from_word_list(&combined)
491        } else if let Some(overlay) = &self.dict_merge {
492            // Fast overlay path: load pre-compiled binary, attach small sorted list.
493            builtin_dict().with_overlay(overlay)
494        } else {
495            // Default path: load from pre-compiled binary — O(S) copy.
496            builtin_dict()
497        };
498        Tokenizer {
499            dict,
500            freq: FreqMap::builtin(),
501            keep_whitespace: self.keep_whitespace,
502        }
503    }
504
505    /// Try to load a custom word list from a file path.
506    ///
507    /// Only available when the `std` feature is enabled.
508    ///
509    /// # Errors
510    ///
511    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
512    ///
513    /// # Example
514    ///
515    /// ```rust,no_run
516    /// use kham_core::Tokenizer;
517    ///
518    /// let tok = Tokenizer::builder()
519    ///     .dict_file("my_words.txt")
520    ///     .expect("failed to load dict")
521    ///     .build();
522    /// ```
523    #[cfg(feature = "std")]
524    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
525        extern crate std;
526        let content = std::fs::read_to_string(path)
527            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
528        Ok(self.dict_words(&content))
529    }
530}
531
532// ---------------------------------------------------------------------------
533// Tests
534// ---------------------------------------------------------------------------
535
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizer with default settings, shared by most tests below.
    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Canonical mixed-script example: Thai + digits + Thai.
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_cover_input_excluding_whitespace() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        // Note: this checks span validity per token; it does not itself
        // assert that the spans together cover the whole input.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        // Whitespace is kept so that no gaps are expected between tokens.
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข้ — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้  (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        // Tokens must reassemble into the normalized text, not the raw input.
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    //   1. neg_unknowns  — fewer unknowns is strictly better
    //   2. neg_tokens    — fewer tokens (prefer longer compounds over split components)
    //   3. dict_words    — more dictionary matches breaks token-count ties
    //   4. freq_score    — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // A path with no unknowns beats one with unknowns regardless of other fields.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        // Frequencies add up across consecutive dictionary edges.
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_increments_dict_words_and_neg_tokens() {
        // One dictionary edge: dict_words goes up by one, while neg_tokens
        // goes DOWN by one (it stores the negated token count).
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_increments_neg_unknowns_only() {
        // Despite the `_only` in the name, an unknown edge also counts as a
        // token: both neg_unknowns and neg_tokens drop by one. Only
        // dict_words and freq_score stay untouched.
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        // Whitespace is kept so that char spans should abut with no gaps.
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_khao_lok() {
        // Round-trip check: tokens must reassemble into the input.
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }
}
kham_core/segmenter.rs

kham_core/
segmenter.rs