kham_core/segmenter.rs
//! DAG-based maximal matching segmenter (newmm algorithm).
//!
//! The segmenter builds a directed acyclic graph (DAG) over the input text
//! using TCC boundaries as candidate split points, then finds the path that
//! maximises the number of dictionary matches (fewest unknown tokens).
//!
//! ## Pipeline
//!
//! ```text
//! raw text
//!    │
//!    ▼ (optional) Tokenizer::normalize()  ← fixes tone dedup + Sara Am composition
//!    │
//!    ▼ pre_tokenize()
//! [Thai span] [Number span] [Latin span] …
//!    │
//!    ▼ (Thai spans only) tcc_boundaries()
//! TCC boundary positions: [0, b1, b2, …, len]
//!    │
//!    ▼ DP over boundary indices
//! path of (start, end) pairs that maximises dict matches
//!    │
//!    ▼
//! Vec<Token<'_>>
//! ```
//!
//! ## Normalization and zero-copy
//!
//! [`Tokenizer::segment`] is zero-copy: every [`Token`] borrows directly from
//! the `&str` you pass in. This means `segment()` cannot internally normalize
//! the text: normalization may reorder or remove characters, producing a new
//! allocation with different byte offsets.
//!
//! For input that may contain misordered leading vowels (สระลอย), stacked
//! tone marks, or decomposed Sara Am, use the two-step pattern:
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let normalized = tok.normalize("กเินข้าว"); // fix any encoding issues
//! let tokens = tok.segment(&normalized);      // tokens borrow `normalized`
//! ```

use alloc::vec;
use alloc::vec::Vec;

use crate::dict::{builtin_dict, Dict, BUILTIN_WORDS};
#[cfg(feature = "std")]
use crate::error::KhamError;
use crate::freq::FreqMap;
use crate::normalizer;
use crate::pre_tokenizer::pre_tokenize;
use crate::tcc::tcc_boundaries;
use crate::token::{Token, TokenKind};

/// High-level tokenizer. Holds a compiled dictionary and segmentation options.
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::new();
/// let tokens = tok.segment("กินข้าวกับปลา");
/// assert!(!tokens.is_empty());
/// ```
pub struct Tokenizer {
    dict: Dict,
    freq: FreqMap,
    keep_whitespace: bool,
}

impl Tokenizer {
    /// Create a tokenizer with the built-in dictionary and TNC frequency table.
    pub fn new() -> Self {
        Self {
            dict: builtin_dict(),
            freq: FreqMap::builtin(),
            keep_whitespace: false,
        }
    }

    /// Normalise Thai text into canonical form.
    ///
    /// This is a convenience wrapper around [`normalizer::normalize`].
    /// Because [`segment`] is zero-copy, normalization must happen **before**
    /// segmentation. The caller owns the returned [`alloc::string::String`]
    /// and can then borrow it for [`segment`]:
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::new();
    /// // Input with a doubled tone mark and decomposed Sara Am
    /// let raw = "\u{0E01}\u{0E34}\u{0E19}\u{0E19}\u{0E49}\u{0E4D}\u{0E32}"; // กิน + น + ้ + อํ + อา
    /// let normalized = tok.normalize(raw);   // น้ำ composed, no dedup needed here
    /// let tokens = tok.segment(&normalized); // tokens borrow `normalized`
    /// assert!(!tokens.is_empty());
    /// ```
    ///
    /// [`segment`]: Tokenizer::segment
    pub fn normalize(&self, text: &str) -> alloc::string::String {
        normalizer::normalize(text)
    }

    /// Return a [`TokenizerBuilder`] for custom configuration.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::Tokenizer;
    ///
    /// // Use built-in dict (no extra words needed here)
    /// let tok = Tokenizer::builder().build();
    /// let tokens = tok.segment("สวัสดีชาวโลก");
    /// assert!(!tokens.is_empty());
    /// ```
    pub fn builder() -> TokenizerBuilder {
        TokenizerBuilder::default()
    }

    /// Segment `text` into tokens.
    ///
    /// Returns a `Vec<Token<'_>>` where every token's `text` is a
    /// zero-copy sub-slice of `text`.
    ///
    /// Non-Thai spans (Latin, Number, Whitespace, Emoji, Punctuation) pass
    /// through unchanged. Thai spans are segmented with the newmm DAG
    /// algorithm constrained to TCC boundaries.
    ///
    /// # Example
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::new();
    /// // Mixed Thai + number + Thai
    /// let tokens = tok.segment("ธนาคาร100แห่ง");
    /// assert_eq!(tokens[1].text, "100");
    /// assert_eq!(tokens[1].kind, TokenKind::Number);
    /// ```
    pub fn segment<'t>(&self, text: &'t str) -> Vec<Token<'t>> {
        if text.is_empty() {
            return Vec::new();
        }

        // Split into script-homogeneous spans. Non-Thai spans pass through;
        // Thai spans go through the newmm DAG segmenter.
        // Call normalize() first if the input may contain misordered leading
        // vowels (สระลอย), stacked tone marks, or decomposed Sara Am.
        let pre_tokens = pre_tokenize(text);

        let mut result: Vec<Token<'t>> = Vec::with_capacity(pre_tokens.len() * 2);

        for token in pre_tokens {
            match token.kind {
                TokenKind::Thai => {
                    segment_thai(&self.dict, &self.freq, text, token.span, &mut result);
                }
                TokenKind::Whitespace if !self.keep_whitespace => {
                    // Discard whitespace tokens unless keep_whitespace is set.
                }
                _ => {
                    result.push(token);
                }
            }
        }

        result
    }
}

// ---------------------------------------------------------------------------
// newmm DAG segmentation — Thai spans only
// ---------------------------------------------------------------------------

/// Lexicographic DP score for a TCC boundary position.
///
/// Fields are ordered so that `Ord` naturally expresses the newmm preference:
/// 1. Minimise unknowns (fewer unknowns → `neg_unknowns` less negative → greater).
/// 2. Minimise total token count (prefer longer compounds over split components).
/// 3. Maximise dictionary matches.
/// 4. Maximise cumulative TNC frequency as the final tiebreaker.
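///
/// For example (cf. the `DpScore` tests below): segmenting เดินทาง as one
/// compound token scores `(0, -1, 1, f)` against `(0, -2, 2, f')` for
/// เดิน + ทาง, so the compound wins on the second field regardless of
/// frequency.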
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DpScore {
    neg_unknowns: i32,
    neg_tokens: i32,
    dict_words: i32,
    freq_score: u64,
}

impl DpScore {
    const ZERO: Self = Self {
        neg_unknowns: 0,
        neg_tokens: 0,
        dict_words: 0,
        freq_score: 0,
    };

    fn dict_edge(self, freq: u32) -> Self {
        Self {
            dict_words: self.dict_words + 1,
            freq_score: self.freq_score + freq as u64,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }

    fn unknown_edge(self) -> Self {
        Self {
            neg_unknowns: self.neg_unknowns - 1,
            neg_tokens: self.neg_tokens - 1,
            ..self
        }
    }
}

/// Output of the forward DP pass.
struct DpTable {
    /// Predecessor boundary index for backtracking.
    from: Vec<usize>,
    /// Whether the incoming edge at index `i` was a dictionary match.
    is_dict: Vec<bool>,
}

/// Forward DP over TCC boundary indices for a single Thai slice.
///
/// `bounds` must be the output of [`tcc_boundaries`] for `slice`.
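///
/// A sketch of the recurrence (illustrative pseudocode, not the literal body):
///
/// ```text
/// best[j] = max of:
///     best[i].dict_edge(freq(w))  for each dict word w spanning bounds[i]..bounds[j]
///     best[i].unknown_edge()      for j == i + 1 (one-TCC fallback)
/// ```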
fn forward_dp(dict: &Dict, freqs: &FreqMap, slice: &str, bounds: &[usize]) -> DpTable {
    let nb = bounds.len();
    let mut best: Vec<Option<DpScore>> = vec![None; nb];
    let mut from = vec![0usize; nb];
    let mut is_dict = vec![false; nb];

    best[0] = Some(DpScore::ZERO);

    for i in 0..nb - 1 {
        let score = match best[i] {
            Some(s) => s,
            None => continue,
        };
        let pos = bounds[i];
        let remaining = &slice[pos..];

        // Dictionary edges — all prefixes, not just the longest, so the DP
        // can make a globally optimal choice rather than a greedy one.
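        // e.g. (illustrative) with remaining = "ข้าวกับปลา", every dictionary
        // word that is a prefix of the remaining text is tried, such as "ข้าว".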
        for prefix in dict.prefixes(remaining) {
            let end_pos = pos + prefix.len();
            if let Ok(j) = bounds.binary_search(&end_pos) {
                let freq = freqs.get(prefix);
                let candidate = Some(score.dict_edge(freq));
                if candidate > best[j] {
                    best[j] = candidate;
                    from[j] = i;
                    is_dict[j] = true;
                }
            }
        }

        // Fallback edge: advance one TCC as an unknown token.
        let j = i + 1;
        let candidate = Some(score.unknown_edge());
        if candidate > best[j] {
            best[j] = candidate;
            from[j] = i;
            is_dict[j] = false;
        }
    }

    DpTable { from, is_dict }
}

/// Reconstruct the winning boundary-index path by following `from` pointers
/// from the last index back to 0, then reversing.
fn backtrack_path(from: &[usize]) -> Vec<usize> {
    let nb = from.len();
    let mut path = Vec::with_capacity(nb);
    let mut cur = nb - 1;
    loop {
        path.push(cur);
        if cur == 0 {
            break;
        }
        cur = from[cur];
    }
    path.reverse();
    path
}

/// Segment a single Thai span using the newmm DAG algorithm and append tokens
/// to `out`.
///
/// Steps: TCC boundaries → forward DP → backtrack → emit tokens.
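///
/// For example (illustrative numbers): with `bounds = [0, 3, 9, 12]` and a
/// winning path `[0, 2, 3]`, the emitted tokens cover byte ranges `0..9` and
/// `9..12` of `slice` (offset by `span.start` within `text`).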
fn segment_thai<'t>(
    dict: &Dict,
    freqs: &FreqMap,
    text: &'t str,
    span: core::ops::Range<usize>,
    out: &mut Vec<Token<'t>>,
) {
    let slice = &text[span.start..span.end];
    let bounds = tcc_boundaries(slice);

    if bounds.len() <= 1 {
        return;
    }

    let dp = forward_dp(dict, freqs, slice, &bounds);
    let path = backtrack_path(&dp.from);

    // Char offset of span.start — computed once, then incremented per token.
    let mut char_cursor = text[..span.start].chars().count();

    for w in path.windows(2) {
        let start_byte = span.start + bounds[w[0]];
        let end_byte = span.start + bounds[w[1]];
        let token_text = &text[start_byte..end_byte];
        let char_start = char_cursor;
        char_cursor += token_text.chars().count();
        let kind = if dp.is_dict[w[1]] {
            TokenKind::Thai
        } else {
            TokenKind::Unknown
        };
        out.push(Token::new(
            token_text,
            start_byte..end_byte,
            char_start..char_cursor,
            kind,
        ));
    }
}

// ---------------------------------------------------------------------------
// Tokenizer trait impls
// ---------------------------------------------------------------------------

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// TokenizerBuilder
// ---------------------------------------------------------------------------

/// Builder for [`Tokenizer`].
///
/// # Example
///
/// ```rust
/// use kham_core::Tokenizer;
///
/// let tok = Tokenizer::builder()
///     .keep_whitespace(true)
///     .build();
/// ```
#[derive(Debug, Default)]
pub struct TokenizerBuilder {
    dict_words: Option<alloc::string::String>,
    keep_whitespace: bool,
}

impl TokenizerBuilder {
    /// Load an additional word list from a string (newline-separated words).
    ///
    /// Words are merged with the built-in dictionary.
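    ///
    /// # Example
    ///
    /// A minimal sketch using a nonsense word (the same trick as the unit
    /// tests below), so the match can only come from the custom list:
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
    /// let tokens = tok.segment("กขคงจฉ");
    /// assert!(tokens
    ///     .iter()
    ///     .any(|t| t.kind == TokenKind::Thai && t.text == "กขคงจฉ"));
    /// ```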
    pub fn dict_words(mut self, words: &str) -> Self {
        self.dict_words = Some(alloc::string::String::from(words));
        self
    }

    /// Configure whether whitespace tokens are included in the output.
    ///
    /// Default: `false` (whitespace is discarded).
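    ///
    /// # Example
    ///
    /// A small sketch mirroring the whitespace tests below:
    ///
    /// ```rust
    /// use kham_core::{Tokenizer, TokenKind};
    ///
    /// let tok = Tokenizer::builder().keep_whitespace(true).build();
    /// let tokens = tok.segment("กิน ข้าว");
    /// assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    /// ```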
    pub fn keep_whitespace(mut self, keep: bool) -> Self {
        self.keep_whitespace = keep;
        self
    }

    /// Consume the builder and return a configured [`Tokenizer`].
    pub fn build(self) -> Tokenizer {
        let dict = if let Some(extra) = &self.dict_words {
            // Custom words: merge with built-in word list and rebuild.
            let mut combined = alloc::string::String::from(BUILTIN_WORDS);
            combined.push('\n');
            combined.push_str(extra);
            Dict::from_word_list(&combined)
        } else {
            // Default path: load from pre-compiled binary — O(S) copy.
            builtin_dict()
        };
        Tokenizer {
            dict,
            freq: FreqMap::builtin(),
            keep_whitespace: self.keep_whitespace,
        }
    }

    /// Try to load a custom word list from a file path.
    ///
    /// Only available when the `std` feature is enabled.
    ///
    /// # Errors
    ///
    /// Returns [`KhamError::DictLoadError`] if the file cannot be read.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// use kham_core::Tokenizer;
    ///
    /// let tok = Tokenizer::builder()
    ///     .dict_file("my_words.txt")
    ///     .expect("failed to load dict")
    ///     .build();
    /// ```
    #[cfg(feature = "std")]
    pub fn dict_file(self, path: &str) -> Result<Self, KhamError> {
        extern crate std;
        let content = std::fs::read_to_string(path)
            .map_err(|e| KhamError::DictLoadError(alloc::format!("{path}: {e}")))?;
        Ok(self.dict_words(&content))
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    fn tok() -> Tokenizer {
        Tokenizer::new()
    }

    // ── basic smoke tests ────────────────────────────────────────────────────

    #[test]
    fn empty_input() {
        assert!(tok().segment("").is_empty());
    }

    #[test]
    fn pure_latin_passthrough() {
        let tokens = tok().segment("hello");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].kind, TokenKind::Latin);
    }

    #[test]
    fn pure_number_passthrough() {
        let tokens = tok().segment("12345");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "12345");
        assert_eq!(tokens[0].kind, TokenKind::Number);
    }

    #[test]
    fn whitespace_dropped_by_default() {
        let tokens = tok().segment("กิน ข้าว");
        for t in &tokens {
            assert_ne!(t.kind, TokenKind::Whitespace);
        }
    }

    #[test]
    fn whitespace_kept_when_requested() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กิน ข้าว");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Whitespace));
    }

    // ── Thai segmentation ────────────────────────────────────────────────────

    #[test]
    fn gin_khao_gap_pla() {
        // "กินข้าวกับปลา" — all words must be in the built-in dict
        let tokens = tok().segment("กินข้าวกับปลา");
        let words: Vec<&str> = tokens.iter().map(|t| t.text).collect();
        // Must segment into at least 2 tokens (dict has กิน, ข้าว, กับ, ปลา)
        assert!(words.len() >= 2, "expected multiple words, got {words:?}");
        // Reconstructing must yield the original string
        assert_eq!(words.join(""), "กินข้าวกับปลา");
    }

    #[test]
    fn mixed_thai_number_thai() {
        // Classic CLAUDE.md example
        let tokens = tok().segment("ธนาคาร100แห่ง");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "ธนาคาร100แห่ง");
        // "100" must survive as a Number token
        let num = tokens.iter().find(|t| t.kind == TokenKind::Number);
        assert!(num.is_some());
        assert_eq!(num.unwrap().text, "100");
    }

    #[test]
    fn mixed_thai_latin() {
        let tokens = tok().segment("สวัสดี hello");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        // Whitespace dropped by default
        assert_eq!(rebuilt, "สวัสดีhello");
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::Latin && t.text == "hello"));
    }

    // ── span / byte-offset invariants ────────────────────────────────────────

    #[test]
    fn spans_are_valid_slices_of_input() {
        let text = "กินข้าว123hello";
        let tokens = tok().segment(text);
        // Every span must be a valid UTF-8 slice of `text`.
        for t in &tokens {
            assert_eq!(&text[t.span.clone()], t.text);
            assert!(text.is_char_boundary(t.span.start));
            assert!(text.is_char_boundary(t.span.end));
        }
    }

    #[test]
    fn adjacent_spans_are_contiguous() {
        let text = "กินข้าวกับปลา";
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment(text);
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].span.end, w[1].span.start,
                "gap between {:?} and {:?}",
                w[0], w[1]
            );
        }
    }

    #[test]
    fn no_empty_tokens() {
        let tokens = tok().segment("กินข้าวกับปลา 100 hello!");
        for t in &tokens {
            assert!(!t.text.is_empty());
        }
    }

    // ── custom dictionary ─────────────────────────────────────────────────────

    #[test]
    fn custom_dict_word_is_matched() {
        // Use a nonsense word that is not in the built-in dictionary and cannot
        // be decomposed into subwords — ensures the custom dict is actually used.
        let tok = Tokenizer::builder().dict_words("กขคงจฉ\n").build();
        let tokens = tok.segment("กขคงจฉ");
        let thai: Vec<&str> = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Thai)
            .map(|t| t.text)
            .collect();
        assert!(thai.contains(&"กขคงจฉ"), "got: {thai:?}");
    }

    // ── normalize then segment ────────────────────────────────────────────────

    #[test]
    fn normalize_deduplicates_tone_before_segment() {
        // กินข้าว with a doubled tone mark on ข้ — normalize fixes it, segment proceeds.
        let t = tok();
        // Insert a doubled tone on ข: ข + อ้ + อ้ (ข้้)
        let raw = "กิน\u{0E02}\u{0E49}\u{0E49}าว"; // กิน + ข้้ + าว
        let normalized = t.normalize(raw);
        let tokens = t.segment(&normalized);
        assert!(!tokens.is_empty());
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, normalized);
    }

    #[test]
    fn normalize_clean_input_is_identity() {
        // normalize() on already-clean text should not change it.
        let t = tok();
        let clean = "กินข้าวกับปลา";
        assert_eq!(t.normalize(clean), clean);
    }

    #[test]
    fn segment_without_normalize_on_clean_input() {
        // segment() alone is sufficient when input is already canonical.
        let tokens = tok().segment("กินข้าวกับปลา");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "กินข้าวกับปลา");
    }

    // ── DpScore ordering ──────────────────────────────────────────────────────
    //
    // The score is a 4-field lexicographic key:
    // 1. neg_unknowns — fewer unknowns is strictly better
    // 2. neg_tokens — fewer tokens (prefer longer compounds over split components)
    // 3. dict_words — more dictionary matches breaks token-count ties
    // 4. freq_score — higher cumulative TNC frequency as the final tiebreaker

    #[test]
    fn dp_score_fewer_unknowns_is_primary() {
        // A path with no unknowns beats one with unknowns regardless of other fields.
        let no_unknown = DpScore::ZERO;
        let one_unknown = DpScore::ZERO.unknown_edge();
        assert!(no_unknown > one_unknown);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_more_dict_words() {
        // Fewer tokens wins over more dict matches: เดินทาง (1 token, 1 match)
        // beats เดิน+ทาง (2 tokens, 2 matches).
        let compound = DpScore::ZERO.dict_edge(0); // 1 token, 1 dict
        let split = DpScore::ZERO.dict_edge(0).dict_edge(0); // 2 tokens, 2 dict
        assert!(compound > split);
    }

    #[test]
    fn dp_score_higher_freq_breaks_token_tie() {
        // Same unknowns and token count; higher TNC freq wins.
        let low_freq = DpScore::ZERO.dict_edge(10);
        let high_freq = DpScore::ZERO.dict_edge(100);
        assert!(high_freq > low_freq);
    }

    #[test]
    fn dp_score_fewer_tokens_beats_higher_freq() {
        // Fewer tokens wins even when the competing path has higher TNC frequency.
        let high_freq_more_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 200,
        };
        let low_freq_fewer_tokens = DpScore {
            neg_unknowns: 0,
            neg_tokens: -1,
            dict_words: 1,
            freq_score: 100,
        };
        assert!(low_freq_fewer_tokens > high_freq_more_tokens);
    }

    #[test]
    fn dp_score_more_dict_words_breaks_token_tie() {
        // Same unknowns and token count; more dict matches wins.
        let fewer_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 1,
            freq_score: 0,
        };
        let more_dict = DpScore {
            neg_unknowns: 0,
            neg_tokens: -2,
            dict_words: 2,
            freq_score: 0,
        };
        assert!(more_dict > fewer_dict);
    }

    #[test]
    fn dict_edge_accumulates_freq_score() {
        let after_one = DpScore::ZERO.dict_edge(50);
        let after_two = after_one.dict_edge(30);
        assert_eq!(after_one.freq_score, 50);
        assert_eq!(after_two.freq_score, 80);
    }

    #[test]
    fn dict_edge_increments_dict_words_and_neg_tokens() {
        let s = DpScore::ZERO.dict_edge(0);
        assert_eq!(s.dict_words, 1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.neg_unknowns, 0);
    }

    #[test]
    fn unknown_edge_increments_neg_unknowns_and_neg_tokens() {
        let s = DpScore::ZERO.unknown_edge();
        assert_eq!(s.neg_unknowns, -1);
        assert_eq!(s.neg_tokens, -1);
        assert_eq!(s.dict_words, 0);
        assert_eq!(s.freq_score, 0);
    }

    #[test]
    fn unknown_edge_does_not_contribute_freq() {
        let s = DpScore::ZERO.unknown_edge().unknown_edge();
        assert_eq!(s.freq_score, 0);
    }

    // ── char_span invariants ──────────────────────────────────────────────────

    #[test]
    fn char_span_len_equals_char_count() {
        let tokens = tok().segment("กินข้าวกับปลา");
        for t in &tokens {
            assert_eq!(
                t.char_span.end - t.char_span.start,
                t.text.chars().count(),
                "char_span length mismatch for {:?}",
                t.text
            );
        }
    }

    #[test]
    fn char_spans_are_contiguous() {
        let tokens = Tokenizer::builder()
            .keep_whitespace(true)
            .build()
            .segment("กินข้าว 100 hello");
        for w in tokens.windows(2) {
            assert_eq!(
                w[0].char_span.end, w[1].char_span.start,
                "char_span gap between {:?} and {:?}",
                w[0].text, w[1].text
            );
        }
    }

    #[test]
    fn char_span_for_mixed_script() {
        // "ธนาคาร100แห่ง": ธนาคาร=6 chars, 100=3 chars, แห่ง=4 chars
        let tokens = tok().segment("ธนาคาร100แห่ง");
        assert_eq!(tokens[0].char_span, 0..6);
        assert_eq!(tokens[1].char_span, 6..9);
        assert_eq!(tokens[2].char_span, 9..13);
    }

    #[test]
    fn char_span_accounts_for_multibyte_chars() {
        // Each Thai codepoint is 3 bytes but 1 char.
        // "กิน" = 3 chars (9 bytes); char_span should be 0..3, span 0..9.
        let tokens = tok().segment("กิน");
        assert_eq!(tokens[0].span, 0..9);
        assert_eq!(tokens[0].char_span, 0..3);
    }

    #[test]
    fn char_span_emoji_is_single_char() {
        // 😀 = 1 char, 4 bytes — verify char_span counts it as 1.
        let tokens = tok().segment("😀");
        assert_eq!(tokens[0].char_len(), 1);
        assert_eq!(tokens[0].byte_len(), 4);
    }

    // ── edge cases ────────────────────────────────────────────────────────────

    #[test]
    fn single_thai_char() {
        let tokens = tok().segment("ก");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "ก");
    }

    #[test]
    fn sawasdee_chao_lok() {
        let tokens = tok().segment("สวัสดีชาวโลก");
        let rebuilt: alloc::string::String = tokens.iter().map(|t| t.text).collect();
        assert_eq!(rebuilt, "สวัสดีชาวโลก");
    }
}