use crate::analysis::token::Token;
/// Turns input text into a sequence of [`Token`]s.
///
/// Implementors must be `Send + Sync`.
pub trait Tokenizer: Send + Sync {
/// Tokenizes `text`, writing the produced tokens into `output`.
///
/// The implementations in this file push onto `output` without clearing it
/// and start numbering positions one past the `position` of its last token
/// (or at 0 when `output` is empty).
fn tokenize(&self, text: &str, output: &mut Vec<Token>);
}
/// Tokenizer that emits one token per word reported by
/// `unicode_segmentation`'s `unicode_word_indices`.
pub struct StandardTokenizer;

impl Tokenizer for StandardTokenizer {
    /// Appends one token per Unicode word in `text`.
    ///
    /// Offsets are byte offsets into `text`; positions continue after the last
    /// token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        use unicode_segmentation::UnicodeSegmentation;

        let mut next_position = match output.last() {
            Some(previous) => previous.position + 1,
            None => 0,
        };

        output.extend(text.unicode_word_indices().map(|(start, word)| {
            let token = Token::new(word, start, start + word.len(), next_position);
            next_position += 1;
            token
        }));
    }
}
/// Tokenizer that splits on runs of Unicode whitespace and keeps everything
/// else (including punctuation) inside the tokens.
pub struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    /// Appends one token per maximal run of non-whitespace characters.
    ///
    /// Offsets are byte offsets into `text`; positions continue after the last
    /// token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        let mut next_position = match output.last() {
            Some(previous) => previous.position + 1,
            None => 0,
        };

        // Byte offset where the current non-whitespace run began, if any.
        let mut run_start: Option<usize> = None;
        for (idx, ch) in text.char_indices() {
            match (ch.is_whitespace(), run_start) {
                (false, None) => run_start = Some(idx),
                (true, Some(begin)) => {
                    output.push(Token::new(&text[begin..idx], begin, idx, next_position));
                    next_position += 1;
                    run_start = None;
                }
                _ => {}
            }
        }

        // A run that reaches the end of the input has no closing whitespace.
        if let Some(begin) = run_start {
            output.push(Token::new(
                &text[begin..],
                begin,
                text.len(),
                next_position,
            ));
        }
    }
}
/// Tokenizer that emits maximal runs of alphabetic characters
/// (`char::is_alphabetic`) and drops everything else.
pub struct LetterTokenizer;

impl Tokenizer for LetterTokenizer {
    /// Appends one token per maximal run of alphabetic characters in `text`.
    ///
    /// Offsets are byte offsets into `text`; positions continue after the last
    /// token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        let mut next_position = match output.last() {
            Some(previous) => previous.position + 1,
            None => 0,
        };

        // A non-alphabetic sentinel placed at `text.len()` closes a run that
        // extends to the end of the input, so no separate flush is needed.
        let end_sentinel = std::iter::once((text.len(), ' '));

        let mut run_start: Option<usize> = None;
        for (idx, ch) in text.char_indices().chain(end_sentinel) {
            if ch.is_alphabetic() {
                run_start = run_start.or(Some(idx));
                continue;
            }
            if let Some(begin) = run_start.take() {
                output.push(Token::new(&text[begin..idx], begin, idx, next_position));
                next_position += 1;
            }
        }
    }
}
/// Tokenizer that emits the whole input as a single token.
pub struct KeywordTokenizer;

impl Tokenizer for KeywordTokenizer {
    /// Appends `text` as one token spanning bytes `0..text.len()`.
    ///
    /// Empty input produces no token. The position continues after the last
    /// token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        if !text.is_empty() {
            let position = match output.last() {
                Some(previous) => previous.position + 1,
                None => 0,
            };
            output.push(Token::new(text, 0, text.len(), position));
        }
    }
}
/// Character classes that may be kept inside a token by the n-gram tokenizers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TokenChar {
    /// Alphabetic characters (`char::is_alphabetic`).
    Letter,
    /// Numeric characters (`char::is_numeric`), including non-ASCII digits.
    Digit,
    /// Whitespace characters (`char::is_whitespace`).
    Whitespace,
    /// ASCII punctuation (`char::is_ascii_punctuation`).
    Punctuation,
    /// Anything that is not alphanumeric, whitespace, or ASCII punctuation.
    Symbol,
}

impl TokenChar {
    /// Returns `true` when `ch` belongs to this character class.
    fn matches(&self, ch: char) -> bool {
        match self {
            TokenChar::Letter => ch.is_alphabetic(),
            // Unicode-aware on purpose: `Letter` and `Symbol` are Unicode-aware,
            // so an ASCII-only digit test would leave characters such as
            // U+0663 (ARABIC-INDIC DIGIT THREE) matching no class at all.
            TokenChar::Digit => ch.is_numeric(),
            TokenChar::Whitespace => ch.is_whitespace(),
            TokenChar::Punctuation => ch.is_ascii_punctuation(),
            TokenChar::Symbol => {
                !(ch.is_alphanumeric() || ch.is_whitespace() || ch.is_ascii_punctuation())
            }
        }
    }

    /// Parses a lowercase class name (`"letter"`, `"digit"`, `"whitespace"`,
    /// `"punctuation"`, `"symbol"`).
    ///
    /// Returns `None` for any other string; matching is case-sensitive.
    pub fn from_str(s: &str) -> Option<Self> {
        let class = match s {
            "letter" => TokenChar::Letter,
            "digit" => TokenChar::Digit,
            "whitespace" => TokenChar::Whitespace,
            "punctuation" => TokenChar::Punctuation,
            "symbol" => TokenChar::Symbol,
            _ => return None,
        };
        Some(class)
    }
}
/// Returns `true` when `ch` may be part of a token.
///
/// An empty `token_chars` list accepts every character; otherwise `ch` must
/// match at least one of the listed classes.
fn is_token_char(ch: char, token_chars: &[TokenChar]) -> bool {
    token_chars.is_empty() || token_chars.iter().any(|class| class.matches(ch))
}
/// Tokenizer that emits character n-grams of each word in the input.
pub struct NGramTokenizer {
    /// Smallest gram length, counted in characters.
    pub min_gram: usize,
    /// Largest gram length, counted in characters.
    pub max_gram: usize,
    /// Character classes kept inside words; an empty list keeps the whole
    /// input as a single word.
    pub token_chars: Vec<TokenChar>,
}

impl NGramTokenizer {
    /// Builds a tokenizer from the gram length bounds and the character
    /// classes that make up words.
    pub fn new(min_gram: usize, max_gram: usize, token_chars: Vec<TokenChar>) -> Self {
        NGramTokenizer {
            min_gram,
            max_gram,
            token_chars,
        }
    }
}
impl Tokenizer for NGramTokenizer {
    /// Appends every character n-gram of length `min_gram..=max_gram` for each
    /// word in `text`.
    ///
    /// Words are obtained from `split_by_token_chars`. For each word, grams are
    /// emitted by increasing length and, within one length, left to right.
    /// Offsets are byte offsets into `text`; every gram gets its own position,
    /// continuing after the last token already present in `output`.
    ///
    /// A `min_gram` of 0 is treated as 1: a zero-length gram carries no text,
    /// and the previous implementation indexed past the end of the character
    /// table (and panicked) in that case.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        let mut position = output.last().map_or(0, |t| t.position + 1);
        let min_gram = self.min_gram.max(1);

        for (word, word_offset) in split_by_token_chars(text, &self.token_chars) {
            // Byte offset of every character start, plus `word.len()` as the
            // closing boundary, so gram `i..i + n` spans
            // `bounds[i]..bounds[i + n]`.
            let bounds: Vec<usize> = word
                .char_indices()
                .map(|(start, _)| start)
                .chain(std::iter::once(word.len()))
                .collect();
            let char_count = bounds.len() - 1;

            // Clamping to the word length also keeps `n + 1` from overflowing
            // when `max_gram` is very large.
            for n in min_gram..=self.max_gram.min(char_count) {
                for window in bounds.windows(n + 1) {
                    let (start, end) = (window[0], window[n]);
                    output.push(Token::new(
                        &word[start..end],
                        word_offset + start,
                        word_offset + end,
                        position,
                    ));
                    position += 1;
                }
            }
        }
    }
}
/// Tokenizer that emits prefixes (edge n-grams) of each word in the input.
pub struct EdgeNGramTokenizer {
    /// Shortest prefix length, counted in characters.
    pub min_gram: usize,
    /// Longest prefix length, counted in characters.
    pub max_gram: usize,
    /// Character classes kept inside words; an empty list keeps the whole
    /// input as a single word.
    pub token_chars: Vec<TokenChar>,
}

impl EdgeNGramTokenizer {
    /// Builds a tokenizer from the prefix length bounds and the character
    /// classes that make up words.
    pub fn new(min_gram: usize, max_gram: usize, token_chars: Vec<TokenChar>) -> Self {
        EdgeNGramTokenizer {
            min_gram,
            max_gram,
            token_chars,
        }
    }
}
impl Tokenizer for EdgeNGramTokenizer {
    /// Appends the prefixes of length `min_gram..=max_gram` (in characters) of
    /// each word in `text`, shortest first.
    ///
    /// Words are obtained from `split_by_token_chars`. Offsets are byte offsets
    /// into `text`; every prefix gets its own position, continuing after the
    /// last token already present in `output`. Words shorter than `min_gram`
    /// produce nothing.
    ///
    /// A `min_gram` of 0 is treated as 1 so that no zero-length token is
    /// emitted.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        let mut position = output.last().map_or(0, |t| t.position + 1);
        let min_gram = self.min_gram.max(1);

        for (word, word_offset) in split_by_token_chars(text, &self.token_chars) {
            // The k-th item (0-based) is the byte offset just past the first
            // k + 1 characters, i.e. the end of the prefix of length k + 1.
            let prefix_ends = word
                .char_indices()
                .map(|(start, ch)| start + ch.len_utf8());

            // `take` caps the prefix length at `max_gram`; `skip` drops the
            // prefixes shorter than `min_gram`.
            for end in prefix_ends.take(self.max_gram).skip(min_gram - 1) {
                output.push(Token::new(
                    &word[..end],
                    word_offset,
                    word_offset + end,
                    position,
                ));
                position += 1;
            }
        }
    }
}
/// Splits `text` into maximal runs of characters accepted by `token_chars`.
///
/// Returns each run together with its starting byte offset in `text`. With an
/// empty `token_chars` list the whole text is returned as one run at offset 0
/// (or no run at all when `text` is empty).
fn split_by_token_chars<'a>(text: &'a str, token_chars: &[TokenChar]) -> Vec<(&'a str, usize)> {
    if token_chars.is_empty() {
        return if text.is_empty() {
            Vec::new()
        } else {
            vec![(text, 0)]
        };
    }

    let mut runs = Vec::new();
    // Byte offset where the current accepted run began, if any.
    let mut run_start: Option<usize> = None;

    for (idx, ch) in text.char_indices() {
        match (is_token_char(ch, token_chars), run_start) {
            (true, None) => run_start = Some(idx),
            (false, Some(begin)) => {
                runs.push((&text[begin..idx], begin));
                run_start = None;
            }
            _ => {}
        }
    }

    // A run that reaches the end of the input is closed here.
    if let Some(begin) = run_start {
        runs.push((&text[begin..], begin));
    }

    runs
}
/// Tokenizer that treats every match of a regular expression as a separator.
pub struct PatternTokenizer {
    /// Compiled separator pattern.
    pattern: regex::Regex,
}

impl PatternTokenizer {
    /// Compiles `pattern` as the separator expression.
    ///
    /// # Errors
    ///
    /// Returns the `regex::Error` from `regex::Regex::new` when `pattern` does
    /// not compile.
    pub fn new(pattern: &str) -> Result<Self, regex::Error> {
        regex::Regex::new(pattern).map(|pattern| Self { pattern })
    }
}
impl Tokenizer for PatternTokenizer {
    /// Appends the non-empty stretches of `text` that lie between matches of
    /// the separator pattern.
    ///
    /// Offsets are byte offsets into `text`; positions continue after the last
    /// token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        let mut next_position = match output.last() {
            Some(previous) => previous.position + 1,
            None => 0,
        };

        // End of the most recent separator match, i.e. where the next token
        // would begin.
        let mut cursor = 0;
        for separator in self.pattern.find_iter(text) {
            let (sep_start, sep_end) = (separator.start(), separator.end());
            // `cursor < sep_start` guarantees the slice is non-empty.
            if cursor < sep_start {
                output.push(Token::new(
                    &text[cursor..sep_start],
                    cursor,
                    sep_start,
                    next_position,
                ));
                next_position += 1;
            }
            cursor = sep_end;
        }

        // Whatever follows the last separator forms the final token.
        if cursor < text.len() {
            output.push(Token::new(
                &text[cursor..],
                cursor,
                text.len(),
                next_position,
            ));
        }
    }
}
/// Tokenizer that emits every ancestor prefix of a separator-delimited path.
pub struct PathHierarchyTokenizer {
    /// Character that separates path components in the input.
    pub separator: char,
    /// Character written in place of `separator` in emitted token text;
    /// `None` keeps the separator as is.
    pub replacement: Option<char>,
}

impl PathHierarchyTokenizer {
    /// Builds a tokenizer for the given separator and optional replacement.
    pub fn new(separator: char, replacement: Option<char>) -> Self {
        PathHierarchyTokenizer {
            separator,
            replacement,
        }
    }
}

impl Default for PathHierarchyTokenizer {
    /// Uses `'/'` as the separator and performs no replacement.
    fn default() -> Self {
        Self::new('/', None)
    }
}
impl Tokenizer for PathHierarchyTokenizer {
    /// Appends one token per ancestor prefix of `text`, followed by a token for
    /// the full text.
    ///
    /// A prefix ends just before each occurrence of the separator; a separator
    /// at byte 0 yields no (empty) prefix. When a replacement different from
    /// the separator is configured, separators inside the emitted token text
    /// are replaced by it, while the offsets (`0..end`) still refer to the
    /// original `text`. Empty input produces no token. Positions continue
    /// after the last token already present in `output`.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
        if text.is_empty() {
            return;
        }

        let mut position = output.last().map_or(0, |t| t.position + 1);
        let separator = self.separator;

        // Only substitute when it would actually change the text; the
        // replacement string is built once rather than once per segment.
        let replacement: Option<String> = self
            .replacement
            .filter(|&r| r != separator)
            .map(|r| r.to_string());
        let render = |segment: &str| -> String {
            match &replacement {
                Some(r) => segment.replace(separator, r.as_str()),
                None => segment.to_string(),
            }
        };

        // Each separator past the first byte closes one ancestor prefix.
        let prefix_ends = text
            .match_indices(separator)
            .map(|(idx, _)| idx)
            .filter(|&idx| idx > 0);
        for end in prefix_ends {
            output.push(Token::new(render(&text[..end]), 0, end, position));
            position += 1;
        }

        // The full path is always emitted last. With no separator in `text`
        // this is the only token and `render` leaves it unchanged.
        output.push(Token::new(render(text), 0, text.len(), position));
    }
}
// Unit tests for the tokenizers defined above. Each test tokenizes into a
// fresh, empty `Vec<Token>`, so positions start at 0.
#[cfg(test)]
mod tests {
use super::*;
// ---- StandardTokenizer ----
#[test]
fn standard_basic() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("Hello, world!", &mut tokens);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "Hello");
assert_eq!(tokens[1].text, "world");
}
#[test]
fn standard_positions() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("the quick brown fox", &mut tokens);
assert_eq!(tokens.len(), 4);
for (i, token) in tokens.iter().enumerate() {
assert_eq!(token.position, i as u32);
}
}
#[test]
fn standard_offsets() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("Hello world", &mut tokens);
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 5);
assert_eq!(tokens[1].offset_from, 6);
assert_eq!(tokens[1].offset_to, 11);
}
#[test]
fn standard_empty() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("", &mut tokens);
assert!(tokens.is_empty());
}
#[test]
fn standard_punctuation_only() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("!!! ... ???", &mut tokens);
assert!(tokens.is_empty());
}
// Only a lower bound is asserted here; the exact split of mixed
// letter/digit words is left to the segmentation library.
#[test]
fn standard_numbers() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("test123 456abc", &mut tokens);
assert!(tokens.len() >= 2);
}
#[test]
fn standard_apostrophe() {
let mut tokens = Vec::new();
StandardTokenizer.tokenize("it's a test", &mut tokens);
assert!(tokens.iter().any(|t| t.text.contains("it")));
}
// ---- WhitespaceTokenizer ----
#[test]
fn whitespace_basic() {
let mut tokens = Vec::new();
WhitespaceTokenizer.tokenize("Hello, world!", &mut tokens);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "Hello,");
assert_eq!(tokens[1].text, "world!");
}
#[test]
fn whitespace_preserves_punctuation() {
let mut tokens = Vec::new();
WhitespaceTokenizer.tokenize("price=$100.00", &mut tokens);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "price=$100.00");
}
#[test]
fn whitespace_multiple_spaces() {
let mut tokens = Vec::new();
WhitespaceTokenizer.tokenize("a b\t\nc", &mut tokens);
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].text, "a");
assert_eq!(tokens[1].text, "b");
assert_eq!(tokens[2].text, "c");
}
#[test]
fn whitespace_offsets() {
let mut tokens = Vec::new();
WhitespaceTokenizer.tokenize("Hello world", &mut tokens);
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 5);
assert_eq!(tokens[1].offset_from, 6);
assert_eq!(tokens[1].offset_to, 11);
}
#[test]
fn whitespace_empty() {
let mut tokens = Vec::new();
WhitespaceTokenizer.tokenize("", &mut tokens);
assert!(tokens.is_empty());
}
// ---- LetterTokenizer ----
#[test]
fn letter_basic() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("Hello, world!", &mut tokens);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "Hello");
assert_eq!(tokens[1].text, "world");
}
#[test]
fn letter_strips_numbers() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("test123data", &mut tokens);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "test");
assert_eq!(tokens[1].text, "data");
}
#[test]
fn letter_unicode() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("café résumé", &mut tokens);
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "café");
assert_eq!(tokens[1].text, "résumé");
}
#[test]
fn letter_offsets() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("abc 123 def", &mut tokens);
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 3);
assert_eq!(tokens[1].offset_from, 8);
assert_eq!(tokens[1].offset_to, 11);
}
#[test]
fn letter_empty() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("", &mut tokens);
assert!(tokens.is_empty());
}
#[test]
fn letter_no_letters() {
let mut tokens = Vec::new();
LetterTokenizer.tokenize("12345 !@#$%", &mut tokens);
assert!(tokens.is_empty());
}
// ---- KeywordTokenizer ----
#[test]
fn keyword_basic() {
let mut tokens = Vec::new();
KeywordTokenizer.tokenize("Hello, world!", &mut tokens);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "Hello, world!");
}
#[test]
fn keyword_offsets() {
let mut tokens = Vec::new();
KeywordTokenizer.tokenize("test value", &mut tokens);
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 10);
assert_eq!(tokens[0].position, 0);
}
#[test]
fn keyword_empty() {
let mut tokens = Vec::new();
KeywordTokenizer.tokenize("", &mut tokens);
assert!(tokens.is_empty());
}
// ---- NGramTokenizer ----
// Grams are expected grouped by length (all 2-grams, then all 3-grams).
#[test]
fn ngram_basic() {
let tok = NGramTokenizer::new(2, 3, vec![]);
let mut tokens = Vec::new();
tok.tokenize("Quick", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Qu", "ui", "ic", "ck", "Qui", "uic", "ick"]);
}
// The word "2" is shorter than min_gram and therefore yields no gram.
#[test]
fn ngram_with_token_chars() {
let tok = NGramTokenizer::new(3, 3, vec![TokenChar::Letter, TokenChar::Digit]);
let mut tokens = Vec::new();
tok.tokenize("2 Quick Foxes", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Qui", "uic", "ick", "Fox", "oxe", "xes"]);
}
#[test]
fn ngram_empty() {
let tok = NGramTokenizer::new(1, 2, vec![]);
let mut tokens = Vec::new();
tok.tokenize("", &mut tokens);
assert!(tokens.is_empty());
}
#[test]
fn ngram_offsets() {
let tok = NGramTokenizer::new(2, 2, vec![]);
let mut tokens = Vec::new();
tok.tokenize("abc", &mut tokens);
assert_eq!(tokens[0].text, "ab");
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 2);
assert_eq!(tokens[1].text, "bc");
assert_eq!(tokens[1].offset_from, 1);
assert_eq!(tokens[1].offset_to, 3);
}
// ---- EdgeNGramTokenizer ----
#[test]
fn edge_ngram_basic() {
let tok = EdgeNGramTokenizer::new(2, 5, vec![TokenChar::Letter]);
let mut tokens = Vec::new();
tok.tokenize("Quick", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Qu", "Qui", "Quic", "Quick"]);
}
#[test]
fn edge_ngram_multiple_words() {
let tok = EdgeNGramTokenizer::new(2, 4, vec![TokenChar::Letter]);
let mut tokens = Vec::new();
tok.tokenize("Quick Fox", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Qu", "Qui", "Quic", "Fo", "Fox"]);
}
#[test]
fn edge_ngram_min_larger_than_word() {
let tok = EdgeNGramTokenizer::new(5, 10, vec![TokenChar::Letter]);
let mut tokens = Vec::new();
tok.tokenize("Hi", &mut tokens);
assert!(tokens.is_empty()); }
#[test]
fn edge_ngram_offsets() {
let tok = EdgeNGramTokenizer::new(2, 3, vec![TokenChar::Letter]);
let mut tokens = Vec::new();
tok.tokenize("Hello", &mut tokens);
assert_eq!(tokens[0].text, "He");
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 2);
assert_eq!(tokens[1].text, "Hel");
assert_eq!(tokens[1].offset_from, 0);
assert_eq!(tokens[1].offset_to, 3);
}
// ---- PatternTokenizer ----
// The pattern describes the separators, not the tokens.
#[test]
fn pattern_basic() {
let tok = PatternTokenizer::new(r"[ .,!?]").unwrap();
let mut tokens = Vec::new();
tok.tokenize("Hello, World! Test.", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["Hello", "World", "Test"]);
}
#[test]
fn pattern_no_match() {
let tok = PatternTokenizer::new(r"\d+").unwrap();
let mut tokens = Vec::new();
tok.tokenize("hello world", &mut tokens);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "hello world");
}
#[test]
fn pattern_offsets() {
let tok = PatternTokenizer::new(r"\s+").unwrap();
let mut tokens = Vec::new();
tok.tokenize("hello world", &mut tokens);
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 5);
assert_eq!(tokens[1].offset_from, 6);
assert_eq!(tokens[1].offset_to, 11);
}
// ---- PathHierarchyTokenizer ----
// A leading separator does not produce an empty first token.
#[test]
fn path_hierarchy_basic() {
let tok = PathHierarchyTokenizer::default();
let mut tokens = Vec::new();
tok.tokenize("/a/b/c", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["/a", "/a/b", "/a/b/c"]);
}
#[test]
fn path_hierarchy_no_leading_sep() {
let tok = PathHierarchyTokenizer::default();
let mut tokens = Vec::new();
tok.tokenize("a/b/c", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["a", "a/b", "a/b/c"]);
}
#[test]
fn path_hierarchy_no_sep() {
let tok = PathHierarchyTokenizer::default();
let mut tokens = Vec::new();
tok.tokenize("filename", &mut tokens);
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].text, "filename");
}
#[test]
fn path_hierarchy_custom_sep() {
let tok = PathHierarchyTokenizer::new('.', None);
let mut tokens = Vec::new();
tok.tokenize("com.example.app", &mut tokens);
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert_eq!(texts, vec!["com", "com.example", "com.example.app"]);
}
}