#[cfg(test)]
mod tests {
use super::super::*;
/// Builds the `Tokenizer` used by the tests in this module.
///
/// Panics with a fixed message if `Tokenizer::new` does not succeed.
fn create_tokenizer() -> Tokenizer {
    let built = Tokenizer::new();
    built.expect("Failed to create tokenizer")
}
/// Tokenizes `"Hello world!"` and checks the index, type, character span and
/// surface text of each of the three resulting tokens.
#[test]
fn test_basic_word_tokenization() {
    let tokenizer = create_tokenizer();
    let source = "Hello world!";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");
    assert_eq!(output.tokens.len(), 3);

    // One row per token: (index, type, span start, span end, surface text).
    let expected = [
        (0, TokenType::Word, 0, 5, "Hello"),
        (1, TokenType::Word, 6, 11, "world"),
        (2, TokenType::Punctuation, 11, 12, "!"),
    ];
    for (slot, (index, kind, start, end, surface)) in expected.iter().enumerate() {
        let token = &output.tokens[slot];
        assert_eq!(token.index, *index);
        assert_eq!(token.token_type, *kind);
        assert_eq!(token.char_interval.start_pos, *start);
        assert_eq!(token.char_interval.end_pos, *end);
        // The recorded span must slice back to the token's original text.
        assert_eq!(
            &source[token.char_interval.start_pos..token.char_interval.end_pos],
            *surface
        );
    }
}
/// Checks that a price such as `$29.99` yields separate punctuation and
/// number tokens alongside ordinary word tokens.
#[test]
fn test_number_recognition() {
    let tokenizer = create_tokenizer();
    let source = "Price is $29.99 per item";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // Pair every token's surface text with its type.
    let mut typed_tokens = Vec::new();
    for token in output.tokens.iter() {
        let span = &token.char_interval;
        typed_tokens.push((&source[span.start_pos..span.end_pos], token.token_type));
    }

    // True when some token has exactly this surface text and type.
    let has = |surface: &str, kind: TokenType| {
        typed_tokens
            .iter()
            .any(|(found, found_kind)| *found == surface && *found_kind == kind)
    };

    assert!(has("Price", TokenType::Word));
    assert!(has("$", TokenType::Punctuation));
    assert!(has("29", TokenType::Number));
    assert!(has(".", TokenType::Punctuation));
    assert!(has("99", TokenType::Number));
}
/// Checks that `HTML/CSS/JS` comes out as a single token typed
/// `TokenType::Acronym`.
#[test]
fn test_acronym_detection() {
    let tokenizer = create_tokenizer();
    let source = "The HTML/CSS/JS stack";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let acronym = output
        .tokens
        .iter()
        .find(|token| {
            let span = &token.char_interval;
            &source[span.start_pos..span.end_pos] == "HTML/CSS/JS"
        })
        .expect("HTML/CSS/JS should be found as a token");

    assert_eq!(acronym.token_type, TokenType::Acronym);
}
/// Checks the `first_token_after_newline` flag: set on the token that follows
/// the `\n`, clear on the token that opens the text.
#[test]
fn test_newline_boundary_tracking() {
    let tokenizer = create_tokenizer();
    let source = "First line\nSecond line";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // Finds the first token whose span slices back to `surface`.
    let locate = |surface: &str| {
        output.tokens.iter().find(|token| {
            let span = &token.char_interval;
            &source[span.start_pos..span.end_pos] == surface
        })
    };

    let after_break = locate("Second").expect("Second token should be found");
    assert!(
        after_break.first_token_after_newline,
        "Second token should be marked as first after newline"
    );

    let opening = locate("First").expect("First token should be found");
    assert!(
        !opening.first_token_after_newline,
        "First token should not be marked as first after newline"
    );
}
/// Checks that the token interval `[0, 4)` reconstructs to the leading
/// `"Hello, world!"` portion of the input.
#[test]
fn test_text_reconstruction() {
    let tokenizer = create_tokenizer();
    let source = "Hello, world! How are you?";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let leading = TokenInterval::new(0, 4).expect("Token interval creation failed");
    let rebuilt = tokenizer
        .tokens_text(&output, &leading)
        .expect("Text reconstruction failed");

    assert_eq!(rebuilt, "Hello, world!");
}
/// Checks that `TokenInterval::new` rejects a start index greater than the
/// end index and that the error text names both indices.
#[test]
fn test_token_interval_validation() {
    match TokenInterval::new(5, 3) {
        Ok(_) => panic!("Invalid token interval should return error"),
        Err(err) => {
            let message = err.to_string();
            assert!(message.contains("Start index 5 must be < end index 3"));
        }
    }
}
/// Checks that the single-token interval `[0, 1)` reconstructs to the first
/// word of the input.
#[test]
fn test_character_interval_mapping() {
    let tokenizer = create_tokenizer();
    let source = "Test string";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let first_only = TokenInterval::new(0, 1).expect("Token interval creation failed");
    let rebuilt = tokenizer
        .tokens_text(&output, &first_only)
        .expect("Text reconstruction failed");

    assert_eq!(rebuilt, "Test");
}
/// Checks that the periods in `Dr.` and `St.` do not end the sentence: the
/// range found from token 0 must span every token and reconstruct the input.
#[test]
fn test_abbreviation_recognition() {
    let tokenizer = create_tokenizer();
    let source = "Dr. Smith went to St. Mary's hospital.";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let whole = tokenizer
        .find_sentence_range(source, &output.tokens, 0)
        .expect("Sentence range detection failed");
    assert_eq!(whole.start_index, 0);
    assert_eq!(whole.end_index, output.tokens.len());

    let rebuilt = tokenizer
        .tokens_text(&output, &whole)
        .expect("Sentence text reconstruction failed");
    assert_eq!(rebuilt, source);
}
/// Checks that two sentences which each open with an abbreviation are split
/// at the real sentence boundary and not after `Mr.` or `Prof.`.
#[test]
fn test_mixed_abbreviations_and_sentences() {
    let tokenizer = create_tokenizer();
    let source = "Mr. Bond asked why. Prof. X answered.";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // One row per sentence: (expected text, range-failure message, text-failure message).
    let stages = [
        (
            "Mr. Bond asked why.",
            "First sentence range detection failed",
            "First sentence text reconstruction failed",
        ),
        (
            "Prof. X answered.",
            "Second sentence range detection failed",
            "Second sentence text reconstruction failed",
        ),
    ];

    // Each sentence search resumes at the token where the previous one ended.
    let mut cursor = 0;
    for (expected, range_failure, text_failure) in stages.iter() {
        let range = tokenizer
            .find_sentence_range(source, &output.tokens, cursor)
            .expect(*range_failure);
        let rendered = tokenizer
            .tokens_text(&output, &range)
            .expect(*text_failure);
        assert_eq!(rendered, *expected);
        cursor = range.end_index;
    }
}
/// Checks that a newline followed by a capitalized word ends the first
/// sentence before the end of the token stream.
#[test]
fn test_newline_with_capitalization() {
    let tokenizer = create_tokenizer();
    let source = "First sentence\nSecond sentence starts here";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let opening = tokenizer
        .find_sentence_range(source, &output.tokens, 0)
        .expect("First sentence range detection failed");
    // The range must stop short of the last token.
    assert!(opening.end_index < output.tokens.len());

    let rendered = tokenizer
        .tokens_text(&output, &opening)
        .expect("First sentence text reconstruction failed");
    assert_eq!(rendered, "First sentence");
}
/// Checks that a newline followed by a lowercase word does not split the
/// sentence: the range found from token 0 must span every token.
#[test]
fn test_newline_without_capitalization() {
    let tokenizer = create_tokenizer();
    let source = "First line\nsecond line continues";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let whole = tokenizer
        .find_sentence_range(source, &output.tokens, 0)
        .expect("Sentence range detection failed");

    assert_eq!(whole.start_index, 0);
    assert_eq!(whole.end_index, output.tokens.len());
}
/// Checks that tokenizing an empty string succeeds, yields no tokens, and
/// that the result reports itself as empty.
#[test]
fn test_empty_text_handling() {
    let tokenizer = create_tokenizer();
    let output = tokenizer
        .tokenize("")
        .expect("Tokenization of empty string failed");

    assert_eq!(output.tokens.len(), 0);
    assert!(output.is_empty());
}
/// Checks that the punctuation runs `?!` and `...` are kept as single tokens
/// and that an apostrophe is a token of its own.
#[test]
fn test_complex_punctuation() {
    let tokenizer = create_tokenizer();
    let source = "What?! That's amazing... isn't it?";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // Surface text of every token, in order.
    let mut surfaces = Vec::new();
    for token in output.tokens.iter() {
        let span = &token.char_interval;
        surfaces.push(&source[span.start_pos..span.end_pos]);
    }

    for piece in ["What", "?!", "...", "'"].iter() {
        assert!(surfaces.contains(piece));
    }
}
/// Checks that spaces and tabs produce no tokens and that letter and digit
/// runs are split, so the input yields `Word`, `1`, `Word`, `2`.
#[test]
fn test_multiple_spaces_and_tabs() {
    let tokenizer = create_tokenizer();
    let source = "Word1 \t Word2";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");
    assert_eq!(output.tokens.len(), 4);

    // Surface text of every token, in order.
    let mut surfaces = Vec::new();
    for token in output.tokens.iter() {
        let span = &token.char_interval;
        surfaces.push(&source[span.start_pos..span.end_pos]);
    }

    assert_eq!(surfaces, ["Word", "1", "Word", "2"]);
}
/// Checks that an interval ending past the last token can be constructed but
/// is rejected by `tokens_text`.
#[test]
fn test_token_interval_bounds_checking() {
    let tokenizer = create_tokenizer();
    let source = "Short text";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // One past the number of available tokens.
    let past_the_end = output.tokens.len() + 1;
    let oversized = TokenInterval::new(0, past_the_end);
    assert!(oversized.is_ok());

    let outcome = tokenizer.tokens_text(&output, &oversized.unwrap());
    assert!(
        outcome.is_err(),
        "Should fail when token interval exceeds available tokens"
    );
}
/// Checks that iterating from token 0 yields exactly three sentences, each
/// ending at its own terminator.
#[test]
fn test_basic_sentence_iteration() {
    let tokenizer = create_tokenizer();
    let source = "First sentence. Second sentence! Third sentence?";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let intervals = SentenceIterator::new(&output, &tokenizer, 0)
        .expect("Failed to create sentence iterator")
        .collect::<Result<Vec<_>, _>>()
        .expect("Sentence iteration failed");
    assert_eq!(intervals.len(), 3, "Should find exactly 3 sentences");

    // Render every interval back to text before comparing.
    let mut rendered = Vec::with_capacity(intervals.len());
    for interval in intervals.iter() {
        let sentence = tokenizer
            .tokens_text(&output, interval)
            .expect("Failed to reconstruct sentence text");
        rendered.push(sentence);
    }

    let expected = ["First sentence.", "Second sentence!", "Third sentence?"];
    for (slot, want) in expected.iter().enumerate() {
        assert_eq!(rendered[slot], *want);
    }
}
/// Checks that an iterator started at a later token index begins its first
/// interval at exactly that index.
#[test]
fn test_starting_mid_document() {
    let tokenizer = create_tokenizer();
    let source = "First sentence. Second sentence! Third sentence?";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    // Begin iteration at a token index past the start of the document.
    let start_token = 5;
    let mut sentences = SentenceIterator::new(&output, &tokenizer, start_token)
        .expect("Failed to create sentence iterator");

    let first_interval = sentences
        .next()
        .expect("Should have a sentence")
        .expect("Sentence iteration should succeed");
    assert_eq!(first_interval.start_index, start_token);

    let partial = tokenizer
        .tokens_text(&output, &first_interval)
        .expect("Failed to reconstruct text");
    // The partial text must not begin at either earlier sentence start.
    for earlier_opening in ["First", "Second"].iter() {
        assert!(!partial.starts_with(*earlier_opening));
    }
}
/// Checks that a sentence iterator over an empty tokenization can be created
/// and yields nothing.
#[test]
fn test_empty_text_sentence_iteration() {
    let tokenizer = create_tokenizer();
    let output = tokenizer
        .tokenize("")
        .expect("Tokenization of empty string failed");

    let mut sentences = SentenceIterator::new(&output, &tokenizer, 0)
        .expect("Failed to create sentence iterator for empty text");

    let first = sentences.next();
    assert!(first.is_none(), "Empty text should immediately return None");
}
/// Checks that creating a sentence iterator at a start index beyond the
/// token count is an error.
#[test]
fn test_sentence_iterator_out_of_bounds() {
    let tokenizer = create_tokenizer();
    let source = "Short text.";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let beyond_end = output.tokens.len() + 5;
    let attempt = SentenceIterator::new(&output, &tokenizer, beyond_end);

    assert!(
        attempt.is_err(),
        "Should fail when starting position is beyond token count"
    );
}
/// Checks that a start index equal to the token count is accepted and the
/// iterator is immediately exhausted.
#[test]
fn test_sentence_iterator_at_end() {
    let tokenizer = create_tokenizer();
    let source = "Test sentence.";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");

    let end_position = output.tokens.len();
    let mut sentences = SentenceIterator::new(&output, &tokenizer, end_position)
        .expect("Failed to create sentence iterator at end position");

    let first = sentences.next();
    assert!(first.is_none(), "Starting at end position should return None");
}
/// Steps the sentence iterator one call at a time, checking each sentence's
/// text and that the iterator is exhausted after the third.
#[test]
fn test_sentence_iterator_progressive() {
    let tokenizer = create_tokenizer();
    let source = "One. Two! Three?";
    let output = tokenizer.tokenize(source).expect("Tokenization failed");
    let mut sentences = SentenceIterator::new(&output, &tokenizer, 0)
        .expect("Failed to create sentence iterator");

    // One row per step: (expected text, message if missing, message if the step errored).
    let stages = [
        (
            "One.",
            "Should have first sentence",
            "First sentence iteration should succeed",
        ),
        (
            "Two!",
            "Should have second sentence",
            "Second sentence iteration should succeed",
        ),
        (
            "Three?",
            "Should have third sentence",
            "Third sentence iteration should succeed",
        ),
    ];
    for (expected, missing, failed) in stages.iter() {
        let interval = sentences.next().expect(*missing).expect(*failed);
        let rendered = tokenizer
            .tokens_text(&output, &interval)
            .expect("Failed to get text");
        assert_eq!(rendered, *expected);
    }

    let leftover = sentences.next();
    assert!(leftover.is_none(), "Should be no more sentences");
}
}