// kham_core/token.rs
//! Token types returned by the segmenter.

use core::ops::Range;
4
/// Classification of a [`Token`]'s script / category.
///
/// One variant is assigned to every token the segmenter produces;
/// [`TokenKind::Unknown`] is the fallback when no other category matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Thai script syllable or word.
    Thai,
    /// Latin / ASCII alphabetic text.
    Latin,
    /// Numeric digits (ASCII `0-9` or Thai ๐–๙).
    Number,
    /// Punctuation or symbol.
    Punctuation,
    /// Emoji character sequence.
    Emoji,
    /// Whitespace (space, tab, newline).
    Whitespace,
    /// Anything that does not fit the above categories.
    Unknown,
}
23
24/// A single token produced by [`crate::Tokenizer::segment`].
25///
26/// The `text` field is a **zero-copy** slice of the original input string.
27/// Two span types are provided: `span` for byte offsets (suitable for slicing
28/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
29/// Python/JavaScript string indexing and display).
30///
31/// # Example
32///
33/// ```rust
34/// use kham_core::Tokenizer;
35///
36/// let tok = Tokenizer::new();
37/// let input = "ธนาคาร100แห่ง";
38/// let tokens = tok.segment(input);
39/// for t in &tokens {
40/// // byte span slices the original string exactly
41/// assert_eq!(&input[t.span.clone()], t.text);
42/// // char span equals the Unicode scalar-value count
43/// assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
44/// }
45/// ```
46#[derive(Debug, Clone, PartialEq, Eq)]
47pub struct Token<'a> {
48 /// Zero-copy reference into the original input.
49 pub text: &'a str,
50 /// Byte offsets `start..end` in the original input string.
51 /// Both boundaries are valid UTF-8 code-point boundaries.
52 pub span: Range<usize>,
53 /// Unicode scalar-value (char) offsets `start..end` in the original input.
54 /// Use these for language-level string indexing in Python, JavaScript, etc.
55 pub char_span: Range<usize>,
56 /// Script / category of this token.
57 pub kind: TokenKind,
58}
59
60impl<'a> Token<'a> {
61 /// Construct a new [`Token`].
62 ///
63 /// # Panics (debug only)
64 ///
65 /// Panics in debug builds if `span` length does not match `text.len()`, or
66 /// if `char_span` length does not match `text.chars().count()`.
67 #[inline]
68 pub fn new(
69 text: &'a str,
70 span: Range<usize>,
71 char_span: Range<usize>,
72 kind: TokenKind,
73 ) -> Self {
74 debug_assert_eq!(text.len(), span.end - span.start);
75 debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
76 Self {
77 text,
78 span,
79 char_span,
80 kind,
81 }
82 }
83
84 /// Byte length of this token's text.
85 #[inline]
86 pub fn byte_len(&self) -> usize {
87 self.span.end - self.span.start
88 }
89
90 /// Number of Unicode scalar values (chars) in this token's text.
91 #[inline]
92 pub fn char_len(&self) -> usize {
93 self.char_span.end - self.char_span.start
94 }
95}
96
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a token whose spans are derived from `text` and the given
    /// byte / char starting offsets.
    fn token_at(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        Token::new(
            text,
            byte_start..byte_start + text.len(),
            char_start..char_start + text.chars().count(),
            kind,
        )
    }

    #[test]
    fn byte_len_matches_text_len() {
        let tok = token_at("กิน", 0, 0, TokenKind::Thai);
        // Three 3-byte UTF-8 code points → 9 bytes.
        assert_eq!(tok.byte_len(), "กิน".len());
    }

    #[test]
    fn char_len_matches_char_count() {
        let tok = token_at("กิน", 0, 0, TokenKind::Thai);
        // ก, the vowel mark ิ, and น are each one scalar value.
        assert_eq!(tok.char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let tok = token_at("hello", 0, 0, TokenKind::Latin);
        // ASCII: one byte per char, so both lengths agree.
        assert_eq!(tok.char_len(), 5);
        assert_eq!(tok.byte_len(), 5);
    }

    #[test]
    fn char_span_start_offset() {
        // A token starting at char offset 4 (e.g. after "กิน ").
        let tok = token_at("ข้าว", 10, 4, TokenKind::Thai);
        assert_eq!(tok.char_span.start, 4);
        assert_eq!(tok.char_span.end, 4 + "ข้าว".chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 is a single codepoint (U+1F600) encoded as 4 UTF-8 bytes.
        let tok = token_at("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(tok.char_len(), 1);
        assert_eq!(tok.byte_len(), 4);
    }
}
145}