// kham_core/token.rs
1//! Token types returned by the segmenter.
2
3use core::ops::Range;
4
/// Category assigned to a token by the named-entity gazetteer.
///
/// This is the payload of [`TokenKind::Named`]. It is declared in this module
/// (rather than in `ne.rs`) so that [`TokenKind`] remains self-contained and
/// no circular module dependency is introduced.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamedEntityKind {
    /// A person: an individual, a title used as a name, or a prominent public figure.
    Person,
    /// A place: a country, province, city, or geographic region.
    Place,
    /// An organisation: a company, government body, institution, or brand.
    Org,
}
19
20impl NamedEntityKind {
21 /// Parse the TSV tag string (`"PERSON"`, `"PLACE"`, `"ORG"`).
22 ///
23 /// Returns `None` for unrecognised strings.
24 pub fn from_tag(s: &str) -> Option<Self> {
25 match s {
26 "PERSON" => Some(Self::Person),
27 "PLACE" => Some(Self::Place),
28 "ORG" => Some(Self::Org),
29 _ => None,
30 }
31 }
32
33 /// The canonical TSV tag string for this variant.
34 pub fn as_tag(self) -> &'static str {
35 match self {
36 Self::Person => "PERSON",
37 Self::Place => "PLACE",
38 Self::Org => "ORG",
39 }
40 }
41
42 /// A human-readable label for use in bindings (`"Person"`, `"Place"`, `"Org"`).
43 pub fn as_str(self) -> &'static str {
44 match self {
45 Self::Person => "Person",
46 Self::Place => "Place",
47 Self::Org => "Org",
48 }
49 }
50}
51
52/// Classification of a [`Token`]'s script / category.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
54pub enum TokenKind {
55 /// Thai script syllable or word.
56 Thai,
57 /// Latin / ASCII alphabetic text.
58 Latin,
59 /// Numeric digits (ASCII or Thai ๐–๙).
60 Number,
61 /// Punctuation or symbol.
62 Punctuation,
63 /// Emoji character sequence.
64 Emoji,
65 /// Whitespace (space, tab, newline).
66 Whitespace,
67 /// Anything that does not fit the above categories.
68 Unknown,
69 /// A named entity identified by the NE gazetteer.
70 ///
71 /// The segmenter emits [`Thai`](TokenKind::Thai) for all Thai tokens;
72 /// [`NeTagger::tag_tokens`](crate::ne::NeTagger::tag_tokens) relabels
73 /// gazetteer matches to `Named(kind)` in a post-processing pass.
74 Named(NamedEntityKind),
75}
76
77/// A single token produced by [`crate::Tokenizer::segment`].
78///
79/// The `text` field is a **zero-copy** slice of the original input string.
80/// Two span types are provided: `span` for byte offsets (suitable for slicing
81/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
82/// Python/JavaScript string indexing and display).
83///
84/// # Example
85///
86/// ```rust
87/// use kham_core::Tokenizer;
88///
89/// let tok = Tokenizer::new();
90/// let input = "ธนาคาร100แห่ง";
91/// let tokens = tok.segment(input);
92/// for t in &tokens {
93/// // byte span slices the original string exactly
94/// assert_eq!(&input[t.span.clone()], t.text);
95/// // char span equals the Unicode scalar-value count
96/// assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
97/// }
98/// ```
99#[derive(Debug, Clone, PartialEq, Eq)]
100pub struct Token<'a> {
101 /// Zero-copy reference into the original input.
102 pub text: &'a str,
103 /// Byte offsets `start..end` in the original input string.
104 /// Both boundaries are valid UTF-8 code-point boundaries.
105 pub span: Range<usize>,
106 /// Unicode scalar-value (char) offsets `start..end` in the original input.
107 /// Use these for language-level string indexing in Python, JavaScript, etc.
108 pub char_span: Range<usize>,
109 /// Script / category of this token.
110 pub kind: TokenKind,
111}
112
113impl<'a> Token<'a> {
114 /// Construct a new [`Token`].
115 ///
116 /// # Panics (debug only)
117 ///
118 /// Panics in debug builds if `span` length does not match `text.len()`, or
119 /// if `char_span` length does not match `text.chars().count()`.
120 #[inline]
121 pub fn new(
122 text: &'a str,
123 span: Range<usize>,
124 char_span: Range<usize>,
125 kind: TokenKind,
126 ) -> Self {
127 debug_assert_eq!(text.len(), span.end - span.start);
128 debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
129 Self {
130 text,
131 span,
132 char_span,
133 kind,
134 }
135 }
136
137 /// Byte length of this token's text.
138 #[inline]
139 pub fn byte_len(&self) -> usize {
140 self.span.end - self.span.start
141 }
142
143 /// Number of Unicode scalar values (chars) in this token's text.
144 #[inline]
145 pub fn char_len(&self) -> usize {
146 self.char_span.end - self.char_span.start
147 }
148}
149
150// ---------------------------------------------------------------------------
151// Tests
152// ---------------------------------------------------------------------------
153
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a token whose end offsets are derived from `text` itself.
    fn tok(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        Token::new(
            text,
            byte_start..byte_start + text.len(),
            char_start..char_start + text.chars().count(),
            kind,
        )
    }

    #[test]
    fn byte_len_matches_text_len() {
        let t = tok("กิน", 0, 0, TokenKind::Thai);
        // Three Thai codepoints at 3 bytes each = 9 bytes.
        assert_eq!(t.byte_len(), "กิน".len());
    }

    #[test]
    fn char_len_matches_char_count() {
        let t = tok("กิน", 0, 0, TokenKind::Thai);
        // ก + ิ + น
        assert_eq!(t.char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let t = tok("hello", 0, 0, TokenKind::Latin);
        // ASCII: byte length and char length coincide.
        assert_eq!((t.char_len(), t.byte_len()), (5, 5));
    }

    #[test]
    fn char_span_start_offset() {
        // Token starting at char offset 4 (e.g. after "กิน ").
        let t = tok("ข้าว", 10, 4, TokenKind::Thai);
        assert_eq!(t.char_span.start, 4);
        assert_eq!(t.char_span.end, 4 + "ข้าว".chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 is a single codepoint (U+1F600) encoded in 4 bytes.
        let t = tok("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(t.char_len(), 1);
        assert_eq!(t.byte_len(), 4);
    }
}
198}