Skip to main content

kham_core/
token.rs

//! Token types returned by the segmenter.
2
3use core::ops::Range;
4
/// Named entity category assigned by the NE gazetteer.
///
/// Used as the payload of [`TokenKind::Named`]. Stored here (rather than in
/// `ne.rs`) to keep [`TokenKind`] self-contained and avoid circular module
/// dependencies.
///
/// The type is a plain field-less enum, so it is `Copy` and can be compared
/// and hashed cheaply.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamedEntityKind {
    /// A person — individual, title used as name, or prominent public figure.
    Person,
    /// A place — country, province, city, or geographic region.
    Place,
    /// An organisation — company, government body, institution, or brand.
    Org,
}
19
20impl NamedEntityKind {
21    /// Parse the TSV tag string (`"PERSON"`, `"PLACE"`, `"ORG"`).
22    ///
23    /// Returns `None` for unrecognised strings.
24    pub fn from_tag(s: &str) -> Option<Self> {
25        match s {
26            "PERSON" => Some(Self::Person),
27            "PLACE" => Some(Self::Place),
28            "ORG" => Some(Self::Org),
29            _ => None,
30        }
31    }
32
33    /// The canonical TSV tag string for this variant.
34    pub fn as_tag(self) -> &'static str {
35        match self {
36            Self::Person => "PERSON",
37            Self::Place => "PLACE",
38            Self::Org => "ORG",
39        }
40    }
41
42    /// A human-readable label for use in bindings (`"Person"`, `"Place"`, `"Org"`).
43    pub fn as_str(self) -> &'static str {
44        match self {
45            Self::Person => "Person",
46            Self::Place => "Place",
47            Self::Org => "Org",
48        }
49    }
50}
51
52/// Classification of a [`Token`]'s script / category.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
54pub enum TokenKind {
55    /// Thai script syllable or word.
56    Thai,
57    /// Latin / ASCII alphabetic text.
58    Latin,
59    /// Numeric digits (ASCII or Thai ๐–๙).
60    Number,
61    /// Punctuation or symbol.
62    Punctuation,
63    /// Emoji character sequence.
64    Emoji,
65    /// Whitespace (space, tab, newline).
66    Whitespace,
67    /// Anything that does not fit the above categories.
68    Unknown,
69    /// A named entity identified by the NE gazetteer.
70    ///
71    /// The segmenter emits [`Thai`](TokenKind::Thai) for all Thai tokens;
72    /// [`NeTagger::tag_tokens`](crate::ne::NeTagger::tag_tokens) relabels
73    /// gazetteer matches to `Named(kind)` in a post-processing pass.
74    Named(NamedEntityKind),
75}
76
77/// A single token produced by [`crate::Tokenizer::segment`].
78///
79/// The `text` field is a **zero-copy** slice of the original input string.
80/// Two span types are provided: `span` for byte offsets (suitable for slicing
81/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
82/// Python/JavaScript string indexing and display).
83///
84/// # Example
85///
86/// ```rust
87/// use kham_core::Tokenizer;
88///
89/// let tok = Tokenizer::new();
90/// let input = "ธนาคาร100แห่ง";
91/// let tokens = tok.segment(input);
92/// for t in &tokens {
93///     // byte span slices the original string exactly
94///     assert_eq!(&input[t.span.clone()], t.text);
95///     // char span equals the Unicode scalar-value count
96///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
97/// }
98/// ```
99#[derive(Debug, Clone, PartialEq)]
100pub struct Token<'a> {
101    /// Zero-copy reference into the original input.
102    pub text: &'a str,
103    /// Byte offsets `start..end` in the original input string.
104    /// Both boundaries are valid UTF-8 code-point boundaries.
105    pub span: Range<usize>,
106    /// Unicode scalar-value (char) offsets `start..end` in the original input.
107    /// Use these for language-level string indexing in Python, JavaScript, etc.
108    pub char_span: Range<usize>,
109    /// Script / category of this token.
110    pub kind: TokenKind,
111    /// Segmentation confidence in the range `[0.0, 1.0]`.
112    ///
113    /// - `0.0` — Unknown token (no dictionary evidence).
114    /// - `0.7` — Dictionary match, but zero TNC corpus frequency (word known, rare in corpus),
115    ///   or dict match with 4+ competing boundary edges (highly ambiguous).
116    /// - `0.8` — Dict match with 3 competing edges.
117    /// - `0.9` — Dict match with 2 competing edges.
118    /// - `1.0` — Unambiguous high-frequency dictionary match, or any non-Thai token
119    ///   (Latin, Number, Emoji, Punctuation, Whitespace, Named).
120    pub confidence: f32,
121}
122
123impl<'a> Token<'a> {
124    /// Construct a new [`Token`].
125    ///
126    /// # Panics (debug only)
127    ///
128    /// Panics in debug builds if `span` length does not match `text.len()`, or
129    /// if `char_span` length does not match `text.chars().count()`.
130    #[inline]
131    pub fn new(
132        text: &'a str,
133        span: Range<usize>,
134        char_span: Range<usize>,
135        kind: TokenKind,
136        confidence: f32,
137    ) -> Self {
138        debug_assert_eq!(text.len(), span.end - span.start);
139        debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
140        Self {
141            text,
142            span,
143            char_span,
144            kind,
145            confidence,
146        }
147    }
148
149    /// Byte length of this token's text.
150    #[inline]
151    pub fn byte_len(&self) -> usize {
152        self.span.end - self.span.start
153    }
154
155    /// Number of Unicode scalar values (chars) in this token's text.
156    #[inline]
157    pub fn char_len(&self) -> usize {
158        self.char_span.end - self.char_span.start
159    }
160}
161
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a token for `text` whose spans start at the given byte / char
    /// offsets and whose lengths are derived from `text` itself.
    fn make(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        let byte_end = byte_start + text.len();
        let char_end = char_start + text.chars().count();
        Token::new(text, byte_start..byte_end, char_start..char_end, kind, 1.0)
    }

    // --- Token lengths and spans ---

    #[test]
    fn byte_len_matches_text_len() {
        let t = make("กิน", 0, 0, TokenKind::Thai);
        assert_eq!(t.byte_len(), "กิน".len()); // 9 bytes
    }

    #[test]
    fn char_len_matches_char_count() {
        let t = make("กิน", 0, 0, TokenKind::Thai);
        assert_eq!(t.char_len(), 3); // ก + ิ + น
    }

    #[test]
    fn char_len_ascii() {
        let t = make("hello", 0, 0, TokenKind::Latin);
        assert_eq!(t.char_len(), 5);
        assert_eq!(t.byte_len(), 5);
    }

    #[test]
    fn char_span_start_offset() {
        // Token starting at char offset 4 (e.g. after "กิน ")
        let t = make("ข้าว", 10, 4, TokenKind::Thai);
        assert_eq!(t.char_span.start, 4);
        assert_eq!(t.char_span.end, 4 + "ข้าว".chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 is a single codepoint (U+1F600) but 4 bytes.
        let t = make("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(t.char_len(), 1);
        assert_eq!(t.byte_len(), 4);
    }

    // --- NamedEntityKind::from_tag ---

    #[test]
    fn from_tag_person() {
        assert_eq!(
            NamedEntityKind::from_tag("PERSON"),
            Some(NamedEntityKind::Person)
        );
    }

    #[test]
    fn from_tag_place() {
        assert_eq!(
            NamedEntityKind::from_tag("PLACE"),
            Some(NamedEntityKind::Place)
        );
    }

    #[test]
    fn from_tag_org() {
        assert_eq!(NamedEntityKind::from_tag("ORG"), Some(NamedEntityKind::Org));
    }

    #[test]
    fn from_tag_unrecognised_is_none() {
        // Tags are case-sensitive: the title-case label is not a valid tag.
        assert_eq!(NamedEntityKind::from_tag("Person"), None);
        assert_eq!(NamedEntityKind::from_tag(""), None);
        assert_eq!(NamedEntityKind::from_tag("UNKNOWN"), None);
    }

    // --- NamedEntityKind::as_tag ---

    #[test]
    fn as_tag_roundtrips_from_tag() {
        for kind in [
            NamedEntityKind::Person,
            NamedEntityKind::Place,
            NamedEntityKind::Org,
        ] {
            assert_eq!(NamedEntityKind::from_tag(kind.as_tag()), Some(kind));
        }
    }

    // --- NamedEntityKind::as_str ---

    #[test]
    fn as_str_human_readable_labels() {
        assert_eq!(NamedEntityKind::Person.as_str(), "Person");
        assert_eq!(NamedEntityKind::Place.as_str(), "Place");
        assert_eq!(NamedEntityKind::Org.as_str(), "Org");
    }

    #[test]
    fn as_str_differs_from_as_tag() {
        // as_tag is UPPERCASE, as_str is Title-case
        assert_ne!(
            NamedEntityKind::Person.as_str(),
            NamedEntityKind::Person.as_tag()
        );
    }

    // --- TokenKind::Named ---

    #[test]
    fn token_kind_named_equality() {
        assert_eq!(
            TokenKind::Named(NamedEntityKind::Person),
            TokenKind::Named(NamedEntityKind::Person)
        );
        assert_ne!(
            TokenKind::Named(NamedEntityKind::Person),
            TokenKind::Named(NamedEntityKind::Place)
        );
    }
}