kham_core/token.rs
1//! Token types returned by the segmenter.
2
3use core::ops::Range;
4
/// Category of a named entity, as assigned by the NE gazetteer.
///
/// This is the payload carried by [`TokenKind::Named`]. It is defined in this
/// module (rather than in `ne.rs`) so that [`TokenKind`] stays self-contained
/// and no circular module dependency is introduced.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamedEntityKind {
    /// Person: an individual, a title used as a name, or a prominent public figure.
    Person,
    /// Place: a country, province, city, or geographic region.
    Place,
    /// Organisation: a company, government body, institution, or brand.
    Org,
}
19
20impl NamedEntityKind {
21 /// Parse the TSV tag string (`"PERSON"`, `"PLACE"`, `"ORG"`).
22 ///
23 /// Returns `None` for unrecognised strings.
24 pub fn from_tag(s: &str) -> Option<Self> {
25 match s {
26 "PERSON" => Some(Self::Person),
27 "PLACE" => Some(Self::Place),
28 "ORG" => Some(Self::Org),
29 _ => None,
30 }
31 }
32
33 /// The canonical TSV tag string for this variant.
34 pub fn as_tag(self) -> &'static str {
35 match self {
36 Self::Person => "PERSON",
37 Self::Place => "PLACE",
38 Self::Org => "ORG",
39 }
40 }
41
42 /// A human-readable label for use in bindings (`"Person"`, `"Place"`, `"Org"`).
43 pub fn as_str(self) -> &'static str {
44 match self {
45 Self::Person => "Person",
46 Self::Place => "Place",
47 Self::Org => "Org",
48 }
49 }
50}
51
52/// Classification of a [`Token`]'s script / category.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
54pub enum TokenKind {
55 /// Thai script syllable or word.
56 Thai,
57 /// Latin / ASCII alphabetic text.
58 Latin,
59 /// Numeric digits (ASCII or Thai ๐–๙).
60 Number,
61 /// Punctuation or symbol.
62 Punctuation,
63 /// Emoji character sequence.
64 Emoji,
65 /// Whitespace (space, tab, newline).
66 Whitespace,
67 /// Anything that does not fit the above categories.
68 Unknown,
69 /// A named entity identified by the NE gazetteer.
70 ///
71 /// The segmenter emits [`Thai`](TokenKind::Thai) for all Thai tokens;
72 /// [`NeTagger::tag_tokens`](crate::ne::NeTagger::tag_tokens) relabels
73 /// gazetteer matches to `Named(kind)` in a post-processing pass.
74 Named(NamedEntityKind),
75}
76
77/// A single token produced by [`crate::Tokenizer::segment`].
78///
79/// The `text` field is a **zero-copy** slice of the original input string.
80/// Two span types are provided: `span` for byte offsets (suitable for slicing
81/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
82/// Python/JavaScript string indexing and display).
83///
84/// # Example
85///
86/// ```rust
87/// use kham_core::Tokenizer;
88///
89/// let tok = Tokenizer::new();
90/// let input = "ธนาคาร100แห่ง";
91/// let tokens = tok.segment(input);
92/// for t in &tokens {
93/// // byte span slices the original string exactly
94/// assert_eq!(&input[t.span.clone()], t.text);
95/// // char span equals the Unicode scalar-value count
96/// assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
97/// }
98/// ```
99#[derive(Debug, Clone, PartialEq)]
100pub struct Token<'a> {
101 /// Zero-copy reference into the original input.
102 pub text: &'a str,
103 /// Byte offsets `start..end` in the original input string.
104 /// Both boundaries are valid UTF-8 code-point boundaries.
105 pub span: Range<usize>,
106 /// Unicode scalar-value (char) offsets `start..end` in the original input.
107 /// Use these for language-level string indexing in Python, JavaScript, etc.
108 pub char_span: Range<usize>,
109 /// Script / category of this token.
110 pub kind: TokenKind,
111 /// Segmentation confidence in the range `[0.0, 1.0]`.
112 ///
113 /// - `0.0` — Unknown token (no dictionary evidence).
114 /// - `0.7` — Dictionary match, but zero TNC corpus frequency (word known, rare in corpus),
115 /// or dict match with 4+ competing boundary edges (highly ambiguous).
116 /// - `0.8` — Dict match with 3 competing edges.
117 /// - `0.9` — Dict match with 2 competing edges.
118 /// - `1.0` — Unambiguous high-frequency dictionary match, or any non-Thai token
119 /// (Latin, Number, Emoji, Punctuation, Whitespace, Named).
120 pub confidence: f32,
121}
122
123impl<'a> Token<'a> {
124 /// Construct a new [`Token`].
125 ///
126 /// # Panics (debug only)
127 ///
128 /// Panics in debug builds if `span` length does not match `text.len()`, or
129 /// if `char_span` length does not match `text.chars().count()`.
130 #[inline]
131 pub fn new(
132 text: &'a str,
133 span: Range<usize>,
134 char_span: Range<usize>,
135 kind: TokenKind,
136 confidence: f32,
137 ) -> Self {
138 debug_assert_eq!(text.len(), span.end - span.start);
139 debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
140 Self {
141 text,
142 span,
143 char_span,
144 kind,
145 confidence,
146 }
147 }
148
149 /// Byte length of this token's text.
150 #[inline]
151 pub fn byte_len(&self) -> usize {
152 self.span.end - self.span.start
153 }
154
155 /// Number of Unicode scalar values (chars) in this token's text.
156 #[inline]
157 pub fn char_len(&self) -> usize {
158 self.char_span.end - self.char_span.start
159 }
160}
161
162// ---------------------------------------------------------------------------
163// Tests
164// ---------------------------------------------------------------------------
165
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a confidence-1.0 token for `text`, deriving the end of both spans
    /// from the text itself so the two always agree with it.
    fn token_at(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        let span = byte_start..byte_start + text.len();
        let char_span = char_start..char_start + text.chars().count();
        Token::new(text, span, char_span, kind, 1.0)
    }

    // --- Token::byte_len / Token::char_len ---

    #[test]
    fn byte_len_matches_text_len() {
        let text = "กิน"; // 9 bytes
        let token = token_at(text, 0, 0, TokenKind::Thai);
        assert_eq!(token.byte_len(), text.len());
    }

    #[test]
    fn char_len_matches_char_count() {
        // ก + ิ + น
        assert_eq!(token_at("กิน", 0, 0, TokenKind::Thai).char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let token = token_at("hello", 0, 0, TokenKind::Latin);
        assert_eq!((token.char_len(), token.byte_len()), (5, 5));
    }

    #[test]
    fn char_span_start_offset() {
        // Token starting at char offset 4 (e.g. after "กิน ")
        let text = "ข้าว";
        let token = token_at(text, 10, 4, TokenKind::Thai);
        assert_eq!(token.char_span, 4..4 + text.chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 is a single codepoint (U+1F600) but 4 bytes.
        let token = token_at("😀", 0, 0, TokenKind::Emoji);
        assert_eq!((token.char_len(), token.byte_len()), (1, 4));
    }

    // --- NamedEntityKind::from_tag ---

    #[test]
    fn from_tag_person() {
        let parsed = NamedEntityKind::from_tag("PERSON");
        assert_eq!(parsed, Some(NamedEntityKind::Person));
    }

    #[test]
    fn from_tag_place() {
        let parsed = NamedEntityKind::from_tag("PLACE");
        assert_eq!(parsed, Some(NamedEntityKind::Place));
    }

    #[test]
    fn from_tag_org() {
        let parsed = NamedEntityKind::from_tag("ORG");
        assert_eq!(parsed, Some(NamedEntityKind::Org));
    }

    #[test]
    fn from_tag_unrecognised_is_none() {
        for rejected in ["Person", "", "UNKNOWN"] {
            assert_eq!(NamedEntityKind::from_tag(rejected), None);
        }
    }

    // --- NamedEntityKind::as_tag ---

    #[test]
    fn as_tag_roundtrips_from_tag() {
        let every_kind = [
            NamedEntityKind::Person,
            NamedEntityKind::Place,
            NamedEntityKind::Org,
        ];
        for kind in every_kind {
            let tag = kind.as_tag();
            assert_eq!(NamedEntityKind::from_tag(tag), Some(kind));
        }
    }

    // --- NamedEntityKind::as_str ---

    #[test]
    fn as_str_human_readable_labels() {
        let expected = [
            (NamedEntityKind::Person, "Person"),
            (NamedEntityKind::Place, "Place"),
            (NamedEntityKind::Org, "Org"),
        ];
        for (kind, label) in expected {
            assert_eq!(kind.as_str(), label);
        }
    }

    #[test]
    fn as_str_differs_from_as_tag() {
        // as_tag is UPPERCASE, as_str is Title-case
        let person = NamedEntityKind::Person;
        assert_ne!(person.as_str(), person.as_tag());
    }

    // --- TokenKind::Named ---

    #[test]
    fn token_kind_named_equality() {
        let person = TokenKind::Named(NamedEntityKind::Person);
        let place = TokenKind::Named(NamedEntityKind::Place);
        assert_eq!(person, TokenKind::Named(NamedEntityKind::Person));
        assert_ne!(person, place);
    }
}
285}