/// Splits raw text into the terms used for BM25 indexing and scoring.
///
/// Implementations decide the full analysis pipeline (case folding,
/// punctuation handling, stemming, stop-word removal, …); the trait only
/// fixes the contract: text in, ordered list of tokens out.
pub trait Bm25Tokenizer {
/// Tokenizes `input_text`, returning the tokens in document order.
/// An empty input yields an empty vector.
fn tokenize(&self, input_text: &str) -> Vec<String>;
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::mocking::{
        MockCasePreservingTokenizer, MockPunctuationTokenizer, MockWhitespaceTokenizer,
    };
    use rust_stemmers::{Algorithm as StemmingAlgorithm, Stemmer};
    use stop_words::{get, LANGUAGE as StopWordLanguage};
    use unicode_segmentation::UnicodeSegmentation;

    // ---- MockWhitespaceTokenizer ------------------------------------------

    #[test]
    fn test_whitespace_tokenizer_basic() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello world rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_case_normalization() {
        // The whitespace mock lowercases its output (verified here).
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_empty_string() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("");
        assert_eq!(tokens, Vec::<String>::new());
    }

    #[test]
    fn test_whitespace_tokenizer_single_token() {
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello");
        assert_eq!(tokens, vec!["hello"]);
    }

    #[test]
    fn test_whitespace_tokenizer_multiple_spaces() {
        // FIX: the input previously used single spaces, so this test was a
        // duplicate of `test_whitespace_tokenizer_basic` and never exercised
        // whitespace-run collapsing. Use genuine runs of spaces; the
        // leading/trailing-space test below shows the mock collapses them.
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("hello   world     rust");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_whitespace_tokenizer_leading_trailing_spaces() {
        // Surrounding whitespace must not produce empty tokens.
        let tokenizer = MockWhitespaceTokenizer;
        let tokens = tokenizer.tokenize("  hello world  ");
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    // ---- Other mock tokenizers --------------------------------------------

    #[test]
    fn test_case_preserving_tokenizer() {
        // This mock must NOT lowercase, unlike the whitespace mock.
        let tokenizer = MockCasePreservingTokenizer;
        let tokens = tokenizer.tokenize("Hello WORLD RusT");
        assert_eq!(tokens, vec!["Hello", "WORLD", "RusT"]);
    }

    #[test]
    fn test_punctuation_tokenizer() {
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("hello, world! rust?");
        assert_eq!(tokens, vec!["hello", "world", "rust"]);
    }

    #[test]
    fn test_punctuation_tokenizer_numbers() {
        // Punctuation is stripped before splitting, so "2.0" fuses to "20".
        let tokenizer = MockPunctuationTokenizer;
        let tokens = tokenizer.tokenize("version 2.0 is great!");
        assert_eq!(tokens, vec!["version", "20", "is", "great"]);
    }

    #[test]
    fn test_tokenizer_properties() {
        // Invariants any lowercasing whitespace tokenizer must satisfy.
        let tokenizer = MockWhitespaceTokenizer;

        // Empty input -> no tokens.
        assert!(tokenizer.tokenize("").is_empty());

        // A single word passes through unchanged.
        let result = tokenizer.tokenize("word");
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], "word");

        // Every emitted token is already lowercase.
        let result = tokenizer.tokenize("HELLO World");
        for token in &result {
            assert_eq!(
                token.to_lowercase(),
                *token,
                "All tokens should be lowercase"
            );
        }
    }

    // ---- Full NLP pipeline ------------------------------------------------

    /// Reference tokenizer exercising a realistic analysis pipeline:
    /// ASCII transliteration -> lowercasing -> Unicode word segmentation ->
    /// English stop-word removal -> Snowball (English) stemming.
    struct SampleNlpTokenizer;

    impl SampleNlpTokenizer {
        fn new() -> Self {
            Self
        }
    }

    impl Bm25Tokenizer for SampleNlpTokenizer {
        fn tokenize(&self, input_text: &str) -> Vec<String> {
            // Transliterate to ASCII; characters with no mapping become the
            // replacement character, which unicode_words later drops.
            let text = deunicode::deunicode_with_tofu_cow(input_text, "�");
            let text = text.to_lowercase();
            let tokens: Vec<&str> = text
                .unicode_words()
                .filter(|word| !word.is_empty())
                .collect();
            let stop_words = get(StopWordLanguage::English);
            let stemmer = Stemmer::create(StemmingAlgorithm::English);
            tokens
                .into_iter()
                // FIX: `Vec<String>::contains` requires a `&String`; the
                // previous `contains(&*token)` handed it a `&&str`, which does
                // not coerce. Compare via iterator equality instead
                // (`String: PartialEq<&str>` makes `sw == token` well-typed).
                .filter(|token| !stop_words.iter().any(|sw| sw == token))
                .map(|token| stemmer.stem(token).to_string())
                .collect()
        }
    }

    #[test]
    fn test_nlp_tokenizer_basic() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("The quick brown fox jumps over the lazy dog");
        // Stop words are gone; content words survive in stemmed form.
        assert!(!tokens.contains(&"the".to_string()));
        assert!(!tokens.contains(&"over".to_string()));
        assert!(tokens.iter().any(|t| t.starts_with("quick")));
        assert!(tokens.iter().any(|t| t.starts_with("jump")));
    }

    #[test]
    fn test_nlp_tokenizer_pipeline() {
        let tokenizer = SampleNlpTokenizer::new();
        let input_text = "Modern computing owes much to the theoretical foundations laid by pioneers in mathematics and logic.";
        let tokens = tokenizer.tokenize(input_text);

        assert!(!tokens.is_empty(), "Token list should not be empty");

        // Stop-word removal.
        assert!(
            !tokens.contains(&"to".to_string()),
            "Stop word 'to' should be removed"
        );
        assert!(
            !tokens.contains(&"the".to_string()),
            "Stop word 'the' should be removed"
        );
        assert!(
            !tokens.contains(&"in".to_string()),
            "Stop word 'in' should be removed"
        );

        // Stemming: check prefixes rather than exact stems so the test is
        // robust to minor Snowball revisions.
        assert!(
            tokens.iter().any(|t| t.starts_with("comput")),
            "Should contain stemmed form of 'computing'"
        );
        assert!(
            tokens.iter().any(|t| t.starts_with("theoret")),
            "Should contain stemmed form of 'theoretical'"
        );
    }

    #[test]
    fn test_nlp_tokenizer_empty_input() {
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_nlp_tokenizer_unicode() {
        // Accented input is transliterated to ASCII before stemming.
        let tokenizer = SampleNlpTokenizer::new();
        let tokens = tokenizer.tokenize("café résumé naïve");
        // FIX: use assert_eq! so a failure reports the actual length.
        assert_eq!(tokens.len(), 3);
        assert!(tokens.contains(&"cafe".to_string()));
        assert!(tokens.contains(&"resum".to_string()));
        assert!(tokens.contains(&"naiv".to_string()));
    }
}