use core::ops::Range;
/// Category tag stored in the `kind` field of a [`Token`].
///
/// Nothing in this file decides which variant a piece of text receives, so
/// the per-variant notes are read from the variant names (and, for `Thai`,
/// `Latin` and `Emoji`, from how the unit tests use them).
/// NOTE(review): confirm the exact rules against the classifier that
/// produces these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Thai text (the tests tag `"กิน"` with this kind).
    Thai,
    /// Latin-alphabet text (the tests tag `"hello"` with this kind).
    Latin,
    /// Presumably numeric text; verify against the classifier.
    Number,
    /// Presumably punctuation characters; verify against the classifier.
    Punctuation,
    /// Emoji (the tests tag `"😀"` with this kind).
    Emoji,
    /// Presumably whitespace; verify against the classifier.
    Whitespace,
    /// Presumably the fallback for text matching no other kind; verify
    /// against the classifier.
    Unknown,
}
/// A borrowed piece of text together with its position and category.
///
/// `Token::new` debug-asserts that `span` is exactly `text.len()` wide and
/// that `char_span` is exactly `text.chars().count()` wide. All fields are
/// public, so a `Token` written as a struct literal bypasses those checks.
///
/// NOTE(review): the offsets are presumably relative to the string handed to
/// the tokenizer; this file does not establish that, so confirm with the
/// code that produces tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    /// The token's text, borrowed for `'a`.
    pub text: &'a str,
    /// Position measured in bytes (its width is compared with `text.len()`).
    pub span: Range<usize>,
    /// Position measured in `char`s (its width is compared with
    /// `text.chars().count()`).
    pub char_span: Range<usize>,
    /// Category assigned to this token.
    pub kind: TokenKind,
}
impl<'a> Token<'a> {
#[inline]
pub fn new(
text: &'a str,
span: Range<usize>,
char_span: Range<usize>,
kind: TokenKind,
) -> Self {
debug_assert_eq!(text.len(), span.end - span.start);
debug_assert_eq!(text.chars().count(), char_span.end - char_span.start);
Self {
text,
span,
char_span,
kind,
}
}
#[inline]
pub fn byte_len(&self) -> usize {
self.span.end - self.span.start
}
#[inline]
pub fn char_len(&self) -> usize {
self.char_span.end - self.char_span.start
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a token for `text` whose spans start at the given byte and char
    /// offsets and are exactly as wide as `text` itself.
    fn token_at(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        let bytes = byte_start..byte_start + text.len();
        let chars = char_start..char_start + text.chars().count();
        Token::new(text, bytes, chars, kind)
    }

    #[test]
    fn byte_len_matches_text_len() {
        let word = "กิน";
        let token = token_at(word, 0, 0, TokenKind::Thai);
        assert_eq!(token.byte_len(), word.len());
    }

    #[test]
    fn char_len_matches_char_count() {
        let token = token_at("กิน", 0, 0, TokenKind::Thai);
        assert_eq!(token.char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let token = token_at("hello", 0, 0, TokenKind::Latin);
        assert_eq!(token.char_len(), 5);
        assert_eq!(token.byte_len(), 5);
    }

    #[test]
    fn char_span_start_offset() {
        let word = "ข้าว";
        let token = token_at(word, 10, 4, TokenKind::Thai);
        assert_eq!(token.char_span.start, 4);
        assert_eq!(token.char_span.end, 4 + word.chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        let token = token_at("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(token.char_len(), 1);
        assert_eq!(token.byte_len(), 4);
    }
}