// kham_core/token.rs
//! Token types returned by the segmenter.

use core::ops::Range;

/// Classification of a [`Token`]'s script / category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Thai script syllable or word.
    Thai,
    /// Latin / ASCII alphabetic text.
    Latin,
    /// Numeric digits (ASCII or Thai ๐–๙).
    Number,
    /// Punctuation or symbol.
    Punctuation,
    /// Emoji character sequence.
    Emoji,
    /// Whitespace (space, tab, newline).
    Whitespace,
    /// Anything that does not fit the above categories.
    Unknown,
}

24/// A single token produced by [`crate::Tokenizer::segment`].
25///
26/// The `text` field is a **zero-copy** slice of the original input string.
27/// Two span types are provided: `span` for byte offsets (suitable for slicing
28/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
29/// Python/JavaScript string indexing and display).
30///
31/// # Example
32///
33/// ```rust
34/// use kham_core::Tokenizer;
35///
36/// let tok = Tokenizer::new();
37/// let input = "ธนาคาร100แห่ง";
38/// let tokens = tok.segment(input);
39/// for t in &tokens {
40///     // byte span slices the original string exactly
41///     assert_eq!(&input[t.span.clone()], t.text);
42///     // char span equals the Unicode scalar-value count
43///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
44/// }
45/// ```
46#[derive(Debug, Clone, PartialEq, Eq)]
47pub struct Token<'a> {
48    /// Zero-copy reference into the original input.
49    pub text: &'a str,
50    /// Byte offsets `start..end` in the original input string.
51    /// Both boundaries are valid UTF-8 code-point boundaries.
52    pub span: Range<usize>,
53    /// Unicode scalar-value (char) offsets `start..end` in the original input.
54    /// Use these for language-level string indexing in Python, JavaScript, etc.
55    pub char_span: Range<usize>,
56    /// Script / category of this token.
57    pub kind: TokenKind,
58}
59
60impl<'a> Token<'a> {
61    /// Construct a new [`Token`].
62    ///
63    /// # Panics (debug only)
64    ///
65    /// Panics in debug builds if `span` length does not match `text.len()`, or
66    /// if `char_span` length does not match `text.chars().count()`.
67    #[inline]
68    pub fn new(
69        text: &'a str,
70        span: Range<usize>,
71        char_span: Range<usize>,
72        kind: TokenKind,
73    ) -> Self {
74        debug_assert_eq!(text.len(), span.end - span.start);
75        debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
76        Self {
77            text,
78            span,
79            char_span,
80            kind,
81        }
82    }
83
84    /// Byte length of this token's text.
85    #[inline]
86    pub fn byte_len(&self) -> usize {
87        self.span.end - self.span.start
88    }
89
90    /// Number of Unicode scalar values (chars) in this token's text.
91    #[inline]
92    pub fn char_len(&self) -> usize {
93        self.char_span.end - self.char_span.start
94    }
95}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Token` whose end offsets are derived from `text` and the
    /// given start offsets, mirroring what the segmenter would produce.
    fn make(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        let byte_end = byte_start + text.len();
        let char_end = char_start + text.chars().count();
        Token::new(text, byte_start..byte_end, char_start..char_end, kind)
    }

    #[test]
    fn byte_len_matches_text_len() {
        let t = make("กิน", 0, 0, TokenKind::Thai);
        // 3 Thai codepoints x 3 UTF-8 bytes each = 9 bytes.
        assert_eq!(t.byte_len(), "กิน".len());
    }

    #[test]
    fn char_len_matches_char_count() {
        let t = make("กิน", 0, 0, TokenKind::Thai);
        assert_eq!(t.char_len(), 3); // ก + ิ + น
    }

    #[test]
    fn char_len_ascii() {
        // For pure ASCII, byte and char lengths coincide.
        let t = make("hello", 0, 0, TokenKind::Latin);
        assert_eq!(t.char_len(), 5);
        assert_eq!(t.byte_len(), 5);
    }

    #[test]
    fn char_span_start_offset() {
        // Token starting at char offset 4 (e.g. after "กิน ")
        let t = make("ข้าว", 10, 4, TokenKind::Thai);
        assert_eq!(t.char_span.start, 4);
        assert_eq!(t.char_span.end, 4 + "ข้าว".chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 is a single codepoint (U+1F600) but 4 bytes.
        let t = make("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(t.char_len(), 1);
        assert_eq!(t.byte_len(), 4);
    }
}
145}