use std::io::{self, Read};
use crate::analysis::chunk_reader::Utf8ChunkReader;
use crate::analysis::{Analyzer, AnalyzerFactory, Token};
use crate::document::TermOffset;
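/// Soft cap on the number of characters in a single token; longer words are
/// truncated to this length and the remainder of the word is skipped.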
const MAX_TOKEN_LENGTH: usize = 255;
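/// Character classification rules used by the standard analyzer.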
pub struct StandardTokenizer;
impl StandardTokenizer {
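/// Returns true for Unicode alphanumeric characters and the underscore.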
fn is_word_char(c: char) -> bool {
c.is_alphanumeric() || c == '_'
}
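/// Separators (the ASCII apostrophe and the right single quotation mark
/// U+2019) that are kept inside a token when followed by an alphanumeric,
/// e.g. the apostrophe in "don't".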
fn is_internal_separator(c: char) -> bool {
c == '\'' || c == '\u{2019}'
}
}
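/// Streaming analyzer that reads its input in UTF-8 chunks, lowercases ASCII
/// characters, and splits the text on non-word characters; apostrophes inside
/// words are preserved. Tokens that cross a chunk boundary are reassembled in
/// `boundary_buf`.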
#[derive(Default)]
pub struct StandardAnalyzer {
/// Source of UTF-8 chunks; `None` until `set_reader` is called.
chunk_reader: Option<Utf8ChunkReader>,
/// The chunk currently being scanned.
current: String,
/// Byte offset of the scan position within `current`.
pos: usize,
/// Scratch buffer used to assemble a token that spans chunk boundaries.
boundary_buf: String,
/// Byte count of chunks already consumed; added to `pos` for absolute offsets.
bytes_consumed: usize,
/// True once the underlying reader has been fully drained.
eof: bool,
}
impl StandardAnalyzer {
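/// Creates an analyzer with no reader attached; call `set_reader` before
/// requesting tokens.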
pub fn new() -> Self {
Self::default()
}
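/// Accounts for the bytes of the chunk just finished, then replaces `current`
/// with the next chunk from the reader; sets `eof` when the reader is
/// exhausted or absent.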
fn load_next_chunk(&mut self) -> io::Result<()> {
self.bytes_consumed += self.current.len();
if let Some(reader) = &mut self.chunk_reader {
match reader.next_chunk()? {
Some(mut chunk) => {
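// Lowercasing is ASCII-only; non-ASCII characters pass through unchanged.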
chunk.make_ascii_lowercase();
self.current = chunk;
self.pos = 0;
}
None => {
self.current.clear();
self.pos = 0;
self.eof = true;
}
}
} else {
self.eof = true;
}
Ok(())
}
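/// Test-only constructor with an explicit chunk capacity, used to force
/// tokens across chunk boundaries.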
#[cfg(test)]
fn with_capacity(capacity: usize, reader: Box<dyn Read + Send>) -> Self {
Self {
chunk_reader: Some(Utf8ChunkReader::with_capacity(capacity, reader)),
..Self::default()
}
}
}
impl Analyzer for StandardAnalyzer {
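/// Attaches a new input and resets all scanning state so the analyzer can be
/// reused.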
fn set_reader(&mut self, reader: Box<dyn Read + Send>) {
self.chunk_reader = Some(Utf8ChunkReader::new(reader));
self.current.clear();
self.pos = 0;
self.boundary_buf.clear();
self.bytes_consumed = 0;
self.eof = false;
}
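/// Returns the next token, or `None` at end of input.
///
/// The scan has three phases: skip non-word characters (loading chunks as
/// needed), accumulate word characters and internal separators into a token,
/// and, if the token hits `MAX_TOKEN_LENGTH`, skip the rest of the over-long
/// word. A token that crosses a chunk boundary is copied into `boundary_buf`
/// and borrowed from there; otherwise it borrows directly from `current`.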
fn next_token(&mut self) -> io::Result<Option<Token<'_>>> {
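// Phase 1: skip non-word characters, pulling in new chunks as needed.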
'skip: loop {
let bytes = self.current.as_bytes();
while self.pos < bytes.len() {
let b = bytes[self.pos];
if b < 0x80 {
if StandardTokenizer::is_word_char(b as char) {
break 'skip;
}
self.pos += 1;
} else {
let ch = self.current[self.pos..].chars().next().unwrap();
if StandardTokenizer::is_word_char(ch) {
break 'skip;
}
self.pos += ch.len_utf8();
}
}
if self.eof {
return Ok(None);
}
self.load_next_chunk()?;
}
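// Phase 2: scan the token. `spanning` flips to true once the token crosses a
// chunk boundary and must be assembled in `boundary_buf` instead of being
// borrowed from `current`.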
let token_start_byte = self.bytes_consumed + self.pos;
let scan_start = self.pos;
let mut char_count: usize = 0;
let mut spanning = false;
let mut pending_sep: Option<char> = None;
'token: loop {
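// A separator ended the previous chunk; now that the next chunk is loaded,
// keep the separator only if the following character is alphanumeric.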
if let Some(sep) = pending_sep.take() {
let sep_len = sep.len_utf8();
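// `pending_sep` is only set after `spanning` has been enabled below, so the
// `!spanning` branches in this block are defensive and not expected to run.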
if self.pos >= self.current.len() {
if !spanning {
_ = sep_len;
}
break 'token;
}
let next_ch = self.current[self.pos..].chars().next().unwrap();
if next_ch.is_alphanumeric() {
if spanning {
self.boundary_buf.push(sep);
self.boundary_buf.push(next_ch);
}
self.pos += next_ch.len_utf8();
char_count += 2;
} else {
if !spanning {
self.pos -= sep_len;
}
break 'token;
}
}
let bytes = self.current.as_bytes();
while self.pos < bytes.len() && char_count < MAX_TOKEN_LENGTH {
let b = bytes[self.pos];
let ch = if b < 0x80 {
b as char
} else {
self.current[self.pos..].chars().next().unwrap()
};
if StandardTokenizer::is_word_char(ch) {
if spanning {
self.boundary_buf.push(ch);
}
self.pos += ch.len_utf8();
char_count += 1;
} else if StandardTokenizer::is_internal_separator(ch) {
let sep_len = ch.len_utf8();
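// The separator is the last character of this chunk and more input may
// follow: stash the token so far, remember the separator, and decide its
// fate once the next chunk is loaded.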
if self.pos + sep_len >= bytes.len() && !self.eof {
if !spanning {
self.boundary_buf.clear();
self.boundary_buf
.push_str(&self.current[scan_start..self.pos]);
spanning = true;
}
self.pos += sep_len;
pending_sep = Some(ch);
break;
}
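// The character after the separator is visible in this chunk: keep the
// separator only if that character is alphanumeric, otherwise end the token
// before the separator.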
if self.pos + sep_len < bytes.len() {
let next_ch = self.current[self.pos + sep_len..].chars().next().unwrap();
if next_ch.is_alphanumeric() {
if spanning {
self.boundary_buf.push(ch);
self.boundary_buf.push(next_ch);
}
self.pos += sep_len + next_ch.len_utf8();
char_count += 2;
} else {
break 'token;
}
} else {
break 'token;
}
} else {
break 'token;
}
}
if char_count >= MAX_TOKEN_LENGTH {
break 'token;
}
if pending_sep.is_some() {
self.load_next_chunk()?;
continue 'token;
}
if self.eof {
break 'token;
}
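// The chunk ended mid-token: stash the partial token in `boundary_buf` and
// continue it in the next chunk.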
if !spanning {
self.boundary_buf.clear();
self.boundary_buf
.push_str(&self.current[scan_start..self.pos]);
spanning = true;
}
self.load_next_chunk()?;
}
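// Phase 3: the length cap was hit. Emit the truncated token but skip the rest
// of the over-long word so the next call starts after it.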
if char_count >= MAX_TOKEN_LENGTH {
if !spanning {
self.boundary_buf.clear();
self.boundary_buf
.push_str(&self.current[scan_start..self.pos]);
spanning = true;
}
'skip_overflow: loop {
let bytes = self.current.as_bytes();
while self.pos < bytes.len() {
let ch = if bytes[self.pos] < 0x80 {
bytes[self.pos] as char
} else {
self.current[self.pos..].chars().next().unwrap()
};
if StandardTokenizer::is_word_char(ch)
|| StandardTokenizer::is_internal_separator(ch)
{
self.pos += ch.len_utf8();
} else {
break 'skip_overflow;
}
}
if self.eof {
break;
}
self.load_next_chunk()?;
}
}
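// Emit the token, borrowing from `boundary_buf` if it crossed a chunk
// boundary, otherwise directly from the current chunk.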
if spanning {
Ok(Some(Token {
text: &self.boundary_buf,
offset: TermOffset {
start: token_start_byte as u32,
length: self.boundary_buf.len() as u16,
},
position_increment: 1,
}))
} else {
let token_len = (self.bytes_consumed + self.pos) - token_start_byte;
Ok(Some(Token {
text: &self.current[scan_start..self.pos],
offset: TermOffset {
start: token_start_byte as u32,
length: token_len as u16,
},
position_increment: 1,
}))
}
}
}
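/// Factory that produces boxed `StandardAnalyzer` instances.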
#[derive(Debug, Clone, Copy)]
pub struct StandardAnalyzerFactory;
impl AnalyzerFactory for StandardAnalyzerFactory {
fn create(&self) -> Box<dyn Analyzer> {
Box::new(StandardAnalyzer::new())
}
}
#[cfg(test)]
mod tests {
use super::*;
use assertables::*;
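// Helper: tokenize `text` with the default analyzer, keeping only text and offset.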
fn tokenize(text: &str) -> Vec<(String, TermOffset)> {
collect_tokens(text)
.into_iter()
.map(|(t, offset, _)| (t, offset))
.collect()
}
#[test]
fn test_standard_tokenizer_simple() {
let tokens = tokenize("hello world");
assert_len_eq_x!(&tokens, 2);
assert_eq!(
tokens[0],
(
"hello".to_string(),
TermOffset {
start: 0,
length: 5
}
)
);
assert_eq!(
tokens[1],
(
"world".to_string(),
TermOffset {
start: 6,
length: 5
}
)
);
}
#[test]
fn test_standard_tokenizer_contraction() {
let tokens = tokenize("don't stop");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "don't");
assert_eq!(tokens[1].0, "stop");
}
#[test]
fn test_standard_tokenizer_numbers() {
let tokens = tokenize("test123 456");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "test123");
assert_eq!(tokens[1].0, "456");
}
#[test]
fn test_standard_tokenizer_punctuation() {
let tokens = tokenize("hello, world! foo.");
assert_len_eq_x!(&tokens, 3);
assert_eq!(tokens[0].0, "hello");
assert_eq!(tokens[1].0, "world");
assert_eq!(tokens[2].0, "foo");
}
#[test]
fn test_standard_tokenizer_empty() {
let tokens = tokenize("");
assert_is_empty!(tokens);
}
#[test]
fn test_apostrophe_at_end_of_input() {
let tokens = tokenize("don't");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "don't");
let tokens = tokenize("hello'");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "hello");
}
#[test]
fn test_apostrophe_followed_by_non_alpha() {
let tokens = tokenize("it' s");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "it");
assert_eq!(tokens[1].0, "s");
}
#[test]
fn test_token_exceeding_max_length() {
let long_word: String = "a".repeat(255);
let tokens = tokenize(&long_word);
assert_len_eq_x!(&tokens, 1);
assert_len_eq_x!(&tokens[0].0, 255);
let too_long: String = "b".repeat(300);
let tokens = tokenize(&too_long);
assert_len_eq_x!(&tokens, 1);
assert_len_eq_x!(&tokens[0].0, 255);
let input = format!("{} short", "c".repeat(300));
let tokens = tokenize(&input);
assert_len_eq_x!(&tokens, 2);
assert_len_eq_x!(&tokens[0].0, 255);
assert_eq!(tokens[1].0, "short");
}
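// Helper: run the analyzer over `text` and collect (text, offset, position_increment).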
fn collect_tokens(text: &str) -> Vec<(String, TermOffset, i32)> {
let mut analyzer = StandardAnalyzer::default();
analyzer.set_reader(Box::new(io::Cursor::new(text.as_bytes().to_vec())));
let mut result = Vec::new();
while let Some(token) = analyzer.next_token().unwrap() {
result.push((
token.text.to_string(),
token.offset,
token.position_increment,
));
}
result
}
#[test]
fn test_standard_analyzer() {
let tokens = collect_tokens("The quick brown fox");
let texts: Vec<&str> = tokens.iter().map(|t| t.0.as_str()).collect();
assert_eq!(texts, vec!["the", "quick", "brown", "fox"]);
}
#[test]
fn test_standard_analyzer_no_stop_words() {
let tokens = collect_tokens("the quick and brown fox");
assert_len_eq_x!(&tokens, 5);
let texts: Vec<&str> = tokens.iter().map(|t| t.0.as_str()).collect();
assert_eq!(texts, vec!["the", "quick", "and", "brown", "fox"]);
for t in &tokens {
assert_eq!(t.2, 1);
}
}
#[test]
fn test_lowercases_tokens() {
let tokens = collect_tokens("Hello WORLD");
assert_eq!(tokens[0].0, "hello");
assert_eq!(tokens[1].0, "world");
}
#[test]
fn test_empty_string_produces_no_tokens() {
let tokens = collect_tokens("");
assert_is_empty!(&tokens);
}
#[test]
fn test_preserves_contractions() {
let tokens = collect_tokens("don't");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "don't");
}
#[test]
fn test_offsets_are_correct() {
let tokens = collect_tokens("hello world");
assert_eq!(
tokens[0].1,
TermOffset {
start: 0,
length: 5
}
);
assert_eq!(
tokens[1].1,
TermOffset {
start: 6,
length: 5
}
);
}
#[test]
fn test_position_increments_are_one() {
let tokens = collect_tokens("one two three");
for t in &tokens {
assert_eq!(t.2, 1);
}
}
#[test]
fn test_set_reader_allows_reuse() {
let mut analyzer = StandardAnalyzer::default();
analyzer.set_reader(Box::new(io::Cursor::new(b"hello".to_vec())));
let token = analyzer.next_token().unwrap();
assert_some!(&token);
let none = analyzer.next_token().unwrap();
assert_none!(&none);
analyzer.set_reader(Box::new(io::Cursor::new(b"world".to_vec())));
let token = analyzer.next_token().unwrap();
assert_some!(&token);
assert_eq!(token.unwrap().text, "world");
}
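// Helper: same as `collect_tokens`, but with a small chunk capacity to force
// tokens across chunk boundaries.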
fn collect_tokens_chunked(text: &str, capacity: usize) -> Vec<(String, TermOffset, i32)> {
let reader: Box<dyn Read + Send> = Box::new(io::Cursor::new(text.as_bytes().to_vec()));
let mut analyzer = StandardAnalyzer::with_capacity(capacity, reader);
let mut result = Vec::new();
while let Some(token) = analyzer.next_token().unwrap() {
result.push((
token.text.to_string(),
token.offset,
token.position_increment,
));
}
result
}
#[test]
fn test_token_spanning_chunk_boundary() {
let tokens = collect_tokens_chunked("hello world", 4);
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "hello");
assert_eq!(tokens[1].0, "world");
}
#[test]
fn test_contraction_spanning_boundary() {
let tokens = collect_tokens_chunked("don't stop", 4);
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "don't");
assert_eq!(tokens[1].0, "stop");
}
#[test]
fn test_separator_at_exact_boundary() {
let tokens = collect_tokens_chunked("ab'cd", 3);
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "ab'cd");
}
#[test]
fn test_separator_at_boundary_followed_by_non_alpha() {
let tokens = collect_tokens_chunked("ab' x", 3);
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "ab");
assert_eq!(tokens[1].0, "x");
}
#[test]
fn test_comprehensive_tiny_chunks_match_default() {
let input = "The quick brown fox don't jump over the lazy dog's bed";
let default_tokens = collect_tokens(input);
let default_texts: Vec<&str> = default_tokens.iter().map(|t| t.0.as_str()).collect();
let chunked_tokens = collect_tokens_chunked(input, 4);
let chunked_texts: Vec<&str> = chunked_tokens.iter().map(|t| t.0.as_str()).collect();
assert_eq!(chunked_texts, default_texts);
}
#[test]
fn test_token_at_eof_no_trailing_whitespace() {
let tokens = collect_tokens_chunked("hello", 3);
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "hello");
}
#[test]
fn test_empty_input_chunked() {
let tokens = collect_tokens_chunked("", 4);
assert_is_empty!(&tokens);
}
#[test]
fn test_set_reader_reuse_with_streaming() {
let reader1: Box<dyn Read + Send> = Box::new(io::Cursor::new(b"hello".to_vec()));
let mut analyzer = StandardAnalyzer::with_capacity(3, reader1);
let token = analyzer.next_token().unwrap();
assert_eq!(token.unwrap().text, "hello");
let none = analyzer.next_token().unwrap();
assert_none!(&none);
analyzer.set_reader(Box::new(io::Cursor::new(b"world".to_vec())));
let token = analyzer.next_token().unwrap();
assert_eq!(token.unwrap().text, "world");
}
#[test]
fn test_offsets_correct_across_chunks() {
let tokens = collect_tokens_chunked("hello world", 4);
assert_eq!(
tokens[0].1,
TermOffset {
start: 0,
length: 5
}
);
assert_eq!(
tokens[1].1,
TermOffset {
start: 6,
length: 5
}
);
}
#[test]
fn test_many_tokens_tiny_chunks() {
let input = "a b c d e f g h i j";
let tokens = collect_tokens_chunked(input, 3);
let texts: Vec<&str> = tokens.iter().map(|t| t.0.as_str()).collect();
assert_eq!(
texts,
vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
);
}
#[test]
fn test_max_token_length_exact() {
let input: String = "a".repeat(255);
let tokens = collect_tokens(&input);
assert_len_eq_x!(&tokens, 1);
assert_len_eq_x!(&tokens[0].0, 255);
}
#[test]
fn test_max_token_length_exceeded() {
let input: String = "b".repeat(300);
let tokens = collect_tokens(&input);
assert_len_eq_x!(&tokens, 1);
assert_len_eq_x!(&tokens[0].0, 255);
}
#[test]
fn test_max_token_length_with_following_token() {
let input = format!("{} short", "c".repeat(300));
let tokens = collect_tokens(&input);
assert_len_eq_x!(&tokens, 2);
assert_len_eq_x!(&tokens[0].0, 255);
assert_eq!(tokens[1].0, "short");
}
#[test]
fn test_smart_quote_contraction_at_boundary() {
let input = "don\u{2019}t";
let tokens = collect_tokens_chunked(input, 4);
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "don\u{2019}t");
}
}