kham_core/token.rs
1//! Token types returned by the segmenter.
2
3use core::ops::Range;
4
/// Category label attached to a gazetteer-matched named entity.
///
/// This is the payload carried by [`TokenKind::Named`]. It lives in this
/// module rather than in `ne.rs` so that [`TokenKind`] stays self-contained
/// and no circular module dependency is introduced.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamedEntityKind {
    /// An individual, a title used as a name, or a prominent public figure.
    Person,
    /// A geographic location: country, province, city, or region.
    Place,
    /// An organisation: company, government body, institution, or brand.
    Org,
}
19
20impl NamedEntityKind {
21    /// Parse the TSV tag string (`"PERSON"`, `"PLACE"`, `"ORG"`).
22    ///
23    /// Returns `None` for unrecognised strings.
24    pub fn from_tag(s: &str) -> Option<Self> {
25        match s {
26            "PERSON" => Some(Self::Person),
27            "PLACE" => Some(Self::Place),
28            "ORG" => Some(Self::Org),
29            _ => None,
30        }
31    }
32
33    /// The canonical TSV tag string for this variant.
34    pub fn as_tag(self) -> &'static str {
35        match self {
36            Self::Person => "PERSON",
37            Self::Place => "PLACE",
38            Self::Org => "ORG",
39        }
40    }
41
42    /// A human-readable label for use in bindings (`"Person"`, `"Place"`, `"Org"`).
43    pub fn as_str(self) -> &'static str {
44        match self {
45            Self::Person => "Person",
46            Self::Place => "Place",
47            Self::Org => "Org",
48        }
49    }
50}
51
52/// Classification of a [`Token`]'s script / category.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
54pub enum TokenKind {
55    /// Thai script syllable or word.
56    Thai,
57    /// Latin / ASCII alphabetic text.
58    Latin,
59    /// Numeric digits (ASCII or Thai ๐–๙).
60    Number,
61    /// Punctuation or symbol.
62    Punctuation,
63    /// Emoji character sequence.
64    Emoji,
65    /// Whitespace (space, tab, newline).
66    Whitespace,
67    /// Anything that does not fit the above categories.
68    Unknown,
69    /// A named entity identified by the NE gazetteer.
70    ///
71    /// The segmenter emits [`Thai`](TokenKind::Thai) for all Thai tokens;
72    /// [`NeTagger::tag_tokens`](crate::ne::NeTagger::tag_tokens) relabels
73    /// gazetteer matches to `Named(kind)` in a post-processing pass.
74    Named(NamedEntityKind),
75}
76
77/// A single token produced by [`crate::Tokenizer::segment`].
78///
79/// The `text` field is a **zero-copy** slice of the original input string.
80/// Two span types are provided: `span` for byte offsets (suitable for slicing
81/// `&str`) and `char_span` for Unicode scalar-value offsets (suitable for
82/// Python/JavaScript string indexing and display).
83///
84/// # Example
85///
86/// ```rust
87/// use kham_core::Tokenizer;
88///
89/// let tok = Tokenizer::new();
90/// let input = "ธนาคาร100แห่ง";
91/// let tokens = tok.segment(input);
92/// for t in &tokens {
93///     // byte span slices the original string exactly
94///     assert_eq!(&input[t.span.clone()], t.text);
95///     // char span equals the Unicode scalar-value count
96///     assert_eq!(t.char_span.end - t.char_span.start, t.text.chars().count());
97/// }
98/// ```
99#[derive(Debug, Clone, PartialEq, Eq)]
100pub struct Token<'a> {
101    /// Zero-copy reference into the original input.
102    pub text: &'a str,
103    /// Byte offsets `start..end` in the original input string.
104    /// Both boundaries are valid UTF-8 code-point boundaries.
105    pub span: Range<usize>,
106    /// Unicode scalar-value (char) offsets `start..end` in the original input.
107    /// Use these for language-level string indexing in Python, JavaScript, etc.
108    pub char_span: Range<usize>,
109    /// Script / category of this token.
110    pub kind: TokenKind,
111}
112
113impl<'a> Token<'a> {
114    /// Construct a new [`Token`].
115    ///
116    /// # Panics (debug only)
117    ///
118    /// Panics in debug builds if `span` length does not match `text.len()`, or
119    /// if `char_span` length does not match `text.chars().count()`.
120    #[inline]
121    pub fn new(
122        text: &'a str,
123        span: Range<usize>,
124        char_span: Range<usize>,
125        kind: TokenKind,
126    ) -> Self {
127        debug_assert_eq!(text.len(), span.end - span.start);
128        debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
129        Self {
130            text,
131            span,
132            char_span,
133            kind,
134        }
135    }
136
137    /// Byte length of this token's text.
138    #[inline]
139    pub fn byte_len(&self) -> usize {
140        self.span.end - self.span.start
141    }
142
143    /// Number of Unicode scalar values (chars) in this token's text.
144    #[inline]
145    pub fn char_len(&self) -> usize {
146        self.char_span.end - self.char_span.start
147    }
148}
149
150// ---------------------------------------------------------------------------
151// Tests
152// ---------------------------------------------------------------------------
153
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a token whose spans are derived from `text` and the two starts.
    fn tok(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        Token::new(
            text,
            byte_start..byte_start + text.len(),
            char_start..char_start + text.chars().count(),
            kind,
        )
    }

    #[test]
    fn byte_len_matches_text_len() {
        // "กิน" is 3 codepoints × 3 bytes = 9 bytes.
        assert_eq!(tok("กิน", 0, 0, TokenKind::Thai).byte_len(), "กิน".len());
    }

    #[test]
    fn char_len_matches_char_count() {
        // ก + ิ + น — three Unicode scalar values.
        assert_eq!(tok("กิน", 0, 0, TokenKind::Thai).char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let t = tok("hello", 0, 0, TokenKind::Latin);
        assert_eq!((t.char_len(), t.byte_len()), (5, 5));
    }

    #[test]
    fn char_span_start_offset() {
        // A token beginning at char offset 4 (e.g. following "กิน ").
        let t = tok("ข้าว", 10, 4, TokenKind::Thai);
        assert_eq!(t.char_span.start, 4);
        assert_eq!(t.char_span.end, 4 + "ข้าว".chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // 😀 (U+1F600) is one codepoint encoded as 4 UTF-8 bytes.
        let t = tok("😀", 0, 0, TokenKind::Emoji);
        assert_eq!((t.char_len(), t.byte_len()), (1, 4));
    }
}