use std::borrow::Cow;
#[cfg(test)]
use quickcheck::{Arbitrary, Gen};
use crate::detection::{Language, Script};
/// Flavour of separator carried by [`TokenKind::Separator`].
///
/// NOTE(review): the rule that decides whether a separator is hard or soft lives in
/// the classification code, which is not visible in this file; confirm it there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SeparatorKind {
/// The "hard" separator flavour.
Hard,
/// The "soft" separator flavour.
Soft,
}
/// Classification attached to a [`Token`].
///
/// The derived `Default` value is [`TokenKind::Unknown`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
/// The token is a word.
Word,
/// The token is a stop word.
/// NOTE(review): presumably decided by a stop-word list elsewhere; confirm.
StopWord,
/// The token is a separator, tagged with its [`SeparatorKind`].
Separator(SeparatorKind),
/// No classification has been assigned; this is the `Default` variant.
#[default]
Unknown,
}
#[cfg(test)]
impl Arbitrary for TokenKind {
    /// Picks one of the classified kinds for property tests.
    ///
    /// `TokenKind::Unknown` is not among the candidates, so it is never generated.
    fn arbitrary(g: &mut Gen) -> Self {
        let candidates = [
            Self::Word,
            Self::StopWord,
            Self::Separator(SeparatorKind::Hard),
            Self::Separator(SeparatorKind::Soft),
        ];
        // `choose` only yields `None` for an empty slice, and this one never is.
        g.choose(&candidates).copied().unwrap()
    }
}
/// A single token: its text, classification, and offsets.
///
/// NOTE(review): `lemma` appears to hold the (possibly normalized) token text while
/// the `*_start`/`*_end` offsets index the original text. This is inferred from the
/// `original_*` accessors; confirm against the code that builds tokens.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Token<'o> {
/// Classification of this token.
pub kind: TokenKind,
/// Text of the token, either borrowed for `'o` or owned.
pub lemma: Cow<'o, str>,
/// Start offset in characters; `original_char_count` returns `char_end - char_start`.
pub char_start: usize,
/// End offset in characters; expected to be `>= char_start`.
pub char_end: usize,
/// Start offset in bytes; `original_byte_len` returns `byte_end - byte_start`.
pub byte_start: usize,
/// End offset in bytes; expected to be `>= byte_start`.
pub byte_end: usize,
/// Optional per-character byte widths. `original_lengths` reads each pair as
/// `(original_bytes_in_char, normalized_bytes_in_char)`.
pub char_map: Option<Vec<(u8, u8)>>,
/// Script associated with the token.
pub script: Script,
/// Language associated with the token, if any.
pub language: Option<Language>,
}
impl Token<'_> {
    /// Returns the token text as a string slice.
    pub fn lemma(&self) -> &str {
        &self.lemma
    }

    /// Returns the length of the lemma in bytes.
    pub fn byte_len(&self) -> usize {
        self.lemma.len()
    }

    /// Returns the number of bytes spanned by the byte offsets
    /// (`byte_end - byte_start`).
    ///
    /// Expects `byte_start <= byte_end`; the subtraction is unchecked.
    pub fn original_byte_len(&self) -> usize {
        self.byte_end - self.byte_start
    }

    /// Returns the number of characters in the lemma.
    pub fn char_count(&self) -> usize {
        self.lemma.chars().count()
    }

    /// Returns the number of characters spanned by the character offsets
    /// (`char_end - char_start`).
    ///
    /// Expects `char_start <= char_end`; the subtraction is unchecked.
    pub fn original_char_count(&self) -> usize {
        self.char_end - self.char_start
    }

    /// Returns the classification of this token.
    pub fn kind(&self) -> TokenKind {
        self.kind
    }

    /// Returns `true` if the token is classified as [`TokenKind::Word`].
    pub fn is_word(&self) -> bool {
        self.kind == TokenKind::Word
    }

    /// Returns `true` if the token is classified as [`TokenKind::StopWord`].
    pub fn is_stopword(&self) -> bool {
        self.kind == TokenKind::StopWord
    }

    /// Returns `true` if the token is a separator of any [`SeparatorKind`].
    pub fn is_separator(&self) -> bool {
        matches!(self.kind, TokenKind::Separator(_))
    }

    /// Returns the [`SeparatorKind`] if the token is a separator, `None` otherwise.
    pub fn separator_kind(&self) -> Option<SeparatorKind> {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self.kind {
            TokenKind::Separator(kind) => Some(kind),
            TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => None,
        }
    }

    /// Maps a byte length measured in the lemma to a `(char_count, byte_len)` pair.
    ///
    /// Characters are consumed from the start of the token for as long as fewer than
    /// `num_bytes` lemma bytes have been consumed. A character that starts before
    /// `num_bytes` is always counted whole, even if it ends past it.
    ///
    /// * Without a `char_map`, both values are measured on the lemma itself.
    /// * With a `char_map`, each entry is read as
    ///   `(original_bytes_in_char, normalized_bytes_in_char)`: the normalized widths
    ///   drive the cut-off and the returned byte length sums the original widths.
    ///
    /// Returns `(0, 0)` when `num_bytes` is `0` or there is nothing to consume.
    pub fn original_lengths(&self, num_bytes: usize) -> (usize, usize) {
        match &self.char_map {
            None => {
                let mut char_count = 0;
                let mut byte_len = 0;
                for (byte_index, c) in self.lemma.char_indices() {
                    if byte_index >= num_bytes {
                        break;
                    }
                    char_count += 1;
                    // End of the current character, i.e. bytes consumed so far.
                    byte_len = byte_index + c.len_utf8();
                }
                (char_count, byte_len)
            }
            Some(char_map) => {
                let mut char_count = 0;
                let mut original_byte_len = 0;
                let mut normalized_byte_len = 0;
                for &(original_bytes_in_char, normalized_bytes_in_char) in char_map {
                    if normalized_byte_len >= num_bytes {
                        break;
                    }
                    original_byte_len += usize::from(original_bytes_in_char);
                    normalized_byte_len += usize::from(normalized_bytes_in_char);
                    char_count += 1;
                }
                (char_count, original_byte_len)
            }
        }
    }
}
/// Test-only alias for a [`Token`] with the `'static` lifetime.
#[cfg(test)]
pub type StaticToken = Token<'static>;
#[cfg(test)]
impl Arbitrary for Token<'static> {
    /// Generates a token whose offsets are consistent with its lemma.
    ///
    /// The byte span (`byte_end - byte_start`) equals the lemma's byte length and the
    /// character span (`char_end - char_start`) equals its character count.
    /// `char_map` is always `None`.
    fn arbitrary(g: &mut Gen) -> Self {
        let lemma = String::arbitrary(g);

        // `saturating_sub` caps each start so that `start + len` cannot overflow.
        let byte_len = lemma.len();
        let byte_start = usize::arbitrary(g).saturating_sub(byte_len);
        let char_len = lemma.chars().count();
        let char_start = usize::arbitrary(g).saturating_sub(char_len);

        Token {
            kind: TokenKind::arbitrary(g),
            // Store the string the offsets were derived from, not a fresh one.
            lemma: Cow::Owned(lemma),
            char_start,
            char_end: char_start + char_len,
            byte_start,
            byte_end: byte_start + byte_len,
            char_map: None,
            script: Script::arbitrary(g),
            language: Option::arbitrary(g),
        }
    }
}