use core::ops::Range;
/// Category of a named entity.
///
/// Every variant has two textual forms: an upper-case tag (see [`as_tag`](Self::as_tag) and
/// [`from_tag`](Self::from_tag)) and a capitalised name (see [`as_str`](Self::as_str)).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NamedEntityKind {
    /// Tag `"PERSON"`, name `"Person"`.
    Person,
    /// Tag `"PLACE"`, name `"Place"`.
    Place,
    /// Tag `"ORG"`, name `"Org"`.
    Org,
}

impl NamedEntityKind {
    /// Parses an upper-case tag into its variant.
    ///
    /// Matching is exact and case-sensitive: only `"PERSON"`, `"PLACE"` and `"ORG"` are
    /// recognised. Any other input yields `None`.
    pub fn from_tag(s: &str) -> Option<Self> {
        // Scan the variants and compare against `as_tag`, so the tag spellings are written
        // down in exactly one place.
        [Self::Person, Self::Place, Self::Org]
            .iter()
            .copied()
            .find(|kind| kind.as_tag() == s)
    }

    /// Returns the upper-case tag of this variant; the inverse of `from_tag`.
    pub fn as_tag(self) -> &'static str {
        match self {
            NamedEntityKind::Person => "PERSON",
            NamedEntityKind::Place => "PLACE",
            NamedEntityKind::Org => "ORG",
        }
    }

    /// Returns the capitalised name of this variant (identical to the variant identifier).
    pub fn as_str(self) -> &'static str {
        match self {
            NamedEntityKind::Person => "Person",
            NamedEntityKind::Place => "Place",
            NamedEntityKind::Org => "Org",
        }
    }
}
/// Classification stored in the `kind` field of a [`Token`].
///
/// NOTE(review): the per-variant descriptions below are inferred from the variant names and
/// from how this file's unit tests label sample text; confirm them against the code that
/// actually produces tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
/// Presumably Thai-script text (the unit tests label "กิน" and "ข้าว" with this kind).
Thai,
/// Presumably Latin-script text (the unit tests label "hello" with this kind).
Latin,
/// Presumably numeric text — TODO confirm.
Number,
/// Presumably punctuation — TODO confirm.
Punctuation,
/// Presumably emoji (the unit tests label "😀" with this kind).
Emoji,
/// Presumably whitespace — TODO confirm.
Whitespace,
/// Presumably text that fits no other kind — TODO confirm.
Unknown,
/// A named entity, carrying its [`NamedEntityKind`].
Named(NamedEntityKind),
}
/// A borrowed piece of text together with two ranges and a classification.
///
/// [`Token::new`] debug-asserts that the width of `span` equals `text.len()` (a byte count)
/// and that the width of `char_span` equals `text.chars().count()` (a count of Unicode scalar
/// values). All fields are public, so a `Token` can also be built with a struct literal that
/// bypasses those checks; the length accessors are written to stay well-defined in that case.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    /// The token's text, borrowed for `'a`.
    pub text: &'a str,
    /// Range measured in bytes; its width is expected to equal `text.len()`.
    /// NOTE(review): presumably offsets into the source string — confirm with the producer.
    pub span: Range<usize>,
    /// Range measured in `char`s; its width is expected to equal `text.chars().count()`.
    /// NOTE(review): presumably offsets into the source string — confirm with the producer.
    pub char_span: Range<usize>,
    /// Classification of the token.
    pub kind: TokenKind,
}

impl<'a> Token<'a> {
    /// Builds a token from its parts.
    ///
    /// # Panics
    ///
    /// Only when debug assertions are enabled: panics if either range is inverted
    /// (`start > end`), if the width of `span` differs from `text.len()`, or if the width of
    /// `char_span` differs from `text.chars().count()`. Builds without debug assertions
    /// perform no validation.
    #[inline]
    pub fn new(
        text: &'a str,
        span: Range<usize>,
        char_span: Range<usize>,
        kind: TokenKind,
    ) -> Self {
        // Check ordering first so a bad range fails with a descriptive assertion instead of
        // an arithmetic-overflow panic from the subtractions below.
        debug_assert!(span.start <= span.end, "inverted byte span: {:?}", span);
        debug_assert!(
            char_span.start <= char_span.end,
            "inverted char span: {:?}",
            char_span
        );
        debug_assert_eq!(
            text.len(),
            span.end - span.start,
            "byte span width must equal text.len()"
        );
        debug_assert_eq!(
            text.chars().count(),
            char_span.end - char_span.start,
            "char span width must equal text.chars().count()"
        );
        Self {
            text,
            span,
            char_span,
            kind,
        }
    }

    /// Width of `span`, in bytes.
    ///
    /// An inverted span (`start > end`) is treated as empty and yields `0`, in line with how
    /// `Range` itself treats such ranges, rather than overflowing.
    #[inline]
    pub fn byte_len(&self) -> usize {
        self.span.end.saturating_sub(self.span.start)
    }

    /// Width of `char_span`, in `char`s.
    ///
    /// An inverted span (`start > end`) is treated as empty and yields `0`, in line with how
    /// `Range` itself treats such ranges, rather than overflowing.
    #[inline]
    pub fn char_len(&self) -> usize {
        self.char_span.end.saturating_sub(self.char_span.start)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a token for `text` whose byte and char spans start at the given offsets and are
    /// exactly as wide as `text` itself.
    fn make(text: &str, byte_start: usize, char_start: usize, kind: TokenKind) -> Token<'_> {
        let span = byte_start..byte_start + text.len();
        let char_span = char_start..char_start + text.chars().count();
        Token::new(text, span, char_span, kind)
    }

    #[test]
    fn byte_len_matches_text_len() {
        let text = "กิน";
        let token = make(text, 0, 0, TokenKind::Thai);
        assert_eq!(token.byte_len(), text.len());
    }

    #[test]
    fn char_len_matches_char_count() {
        let token = make("กิน", 0, 0, TokenKind::Thai);
        assert_eq!(token.char_len(), 3);
    }

    #[test]
    fn char_len_ascii() {
        let token = make("hello", 0, 0, TokenKind::Latin);
        assert_eq!(token.char_len(), 5);
        assert_eq!(token.byte_len(), 5);
    }

    #[test]
    fn char_span_start_offset() {
        let text = "ข้าว";
        let token = make(text, 10, 4, TokenKind::Thai);
        assert_eq!(token.char_span.start, 4);
        assert_eq!(token.char_span.end, 4 + text.chars().count());
    }

    #[test]
    fn emoji_char_len_is_one_per_codepoint() {
        // A single emoji scalar value is one `char` but four UTF-8 bytes.
        let token = make("😀", 0, 0, TokenKind::Emoji);
        assert_eq!(token.char_len(), 1);
        assert_eq!(token.byte_len(), 4);
    }
}