/// Maps a textual token to an index value used by a BM25 scoring pipeline.
///
/// Implementations choose their own index representation via the associated
/// type: the test module below exercises hash-based (`u64`), dictionary-based
/// (sequential `usize`), and string-based (`String`) mock implementations.
pub trait Bm25TokenIndexer {
/// The index type produced for each token (e.g. `u64`, `usize`, `String`).
type Bm25TokenIndex;
/// Returns the index for `token`.
///
/// NOTE(review): the tests below expect implementations to be
/// deterministic (same token -> same index); that contract is implied
/// by the tests rather than enforced here — worth stating in the docs
/// of each implementation.
fn index(&self, token: &str) -> Self::Bm25TokenIndex;
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::mocking::{
        MockDictionaryTokenIndexer, MockHashTokenIndexer, MockStringTokenIndexer,
        MockWhitespaceTokenizer,
    };
    use crate::Bm25Tokenizer;

    /// A hash-based indexer must be deterministic: identical input, identical index.
    #[test]
    fn test_hash_token_indexer_deterministic() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("hello");
        assert_eq!(index1, index2, "Same token should produce same index");
    }

    /// Distinct tokens should hash to distinct indices (no collision expected
    /// for these fixed, short inputs).
    #[test]
    fn test_hash_token_indexer_different_tokens() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        assert_ne!(
            index1, index2,
            "Different tokens should produce different indices"
        );
    }

    /// Indexing is case-sensitive: "hello" and "Hello" are different tokens.
    #[test]
    fn test_hash_token_indexer_case_sensitivity() {
        let indexer = MockHashTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("Hello");
        assert_ne!(
            index1, index2,
            "Case-different tokens should produce different indices"
        );
    }

    /// A dictionary indexer assigns indices in first-seen order, starting at 0.
    #[test]
    fn test_dictionary_token_indexer_sequential() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        let index3 = indexer.index("rust");
        assert_eq!(index1, 0);
        assert_eq!(index2, 1);
        assert_eq!(index3, 2);
    }

    /// Re-indexing a previously seen token returns its original index.
    #[test]
    fn test_dictionary_token_indexer_deterministic() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        let index3 = indexer.index("hello");
        assert_eq!(index1, index3, "Same token should produce same index");
        assert_ne!(
            index1, index2,
            "Different tokens should produce different indices"
        );
    }

    /// The empty string is a valid token and must be handled like any other:
    /// consistently mapped to a single index.
    #[test]
    fn test_dictionary_token_indexer_empty_string() {
        let indexer = MockDictionaryTokenIndexer::new();
        let index1 = indexer.index("");
        let index2 = indexer.index("");
        assert_eq!(
            index1, index2,
            "Empty string should be handled consistently"
        );
    }

    /// The string indexer produces a readable "idx_<token>" representation.
    #[test]
    fn test_string_token_indexer() {
        let indexer = MockStringTokenIndexer;
        let index1 = indexer.index("hello");
        let index2 = indexer.index("world");
        assert_eq!(index1, "idx_hello");
        assert_eq!(index2, "idx_world");
    }

    /// String-based indexing is deterministic as well.
    #[test]
    fn test_string_token_indexer_deterministic() {
        let indexer = MockStringTokenIndexer;
        let index1 = indexer.index("test");
        let index2 = indexer.index("test");
        assert_eq!(index1, index2, "Same token should produce same index");
    }

    /// End-to-end: tokenize a sentence, index every token, and verify that
    /// repeated tokens share an index while distinct tokens do not.
    #[test]
    fn test_tokenizer_indexer_integration() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockHashTokenIndexer;
        let text = "hello world hello rust";
        let tokens = tokenizer.tokenize(text);
        let indices: Vec<u64> = tokens.iter().map(|token| indexer.index(token)).collect();
        assert_eq!(indices.len(), 4);
        assert_eq!(
            indices[0], indices[2],
            "Repeated token 'hello' should have same index"
        );
        assert_ne!(
            indices[0], indices[1],
            "'hello' and 'world' should have different indices"
        );
        assert_ne!(
            indices[1], indices[3],
            "'world' and 'rust' should have different indices"
        );
        assert_ne!(
            indices[0], indices[3],
            "'hello' and 'rust' should have different indices"
        );
    }

    /// End-to-end with the dictionary indexer: the repeated token "the"
    /// (positions 0 and 6) must resolve to the same dictionary slot.
    #[test]
    fn test_dictionary_indexer_with_tokenizer() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockDictionaryTokenIndexer::new();
        let text = "the quick brown fox jumps over the lazy dog";
        let tokens = tokenizer.tokenize(text);
        let indices: Vec<usize> = tokens.iter().map(|token| indexer.index(token)).collect();
        assert_eq!(indices.len(), 9);
        // "the" was already registered during the map above, so this lookup
        // must return its existing slot rather than allocating a new one.
        let the_index = indexer.index("the");
        assert_eq!(indices[0], the_index);
        assert_eq!(indices[6], the_index);
        assert_eq!(
            indices[0], indices[6],
            "Repeated token 'the' should have same index"
        );
    }

    /// Edge cases: whitespace-only input, a single-character token, and a
    /// very long token.
    #[test]
    fn test_edge_cases() {
        let tokenizer = MockWhitespaceTokenizer;
        let indexer = MockHashTokenIndexer;
        let tokens = tokenizer.tokenize(" \t \n ");
        assert!(
            tokens.is_empty(),
            "Whitespace-only string should produce no tokens"
        );
        let tokens = tokenizer.tokenize("a");
        assert_eq!(tokens, vec!["a"]);
        // FIX: the previous `assert!(index > 0)` was invalid — 0 is a
        // legitimate u64 hash value, so the indexer makes no non-zero
        // guarantee. Assert determinism instead, which is the actual contract.
        let index = indexer.index(&tokens[0]);
        assert_eq!(
            index,
            indexer.index(&tokens[0]),
            "Single character should produce a stable index"
        );
        let long_token = "a".repeat(1000);
        let index1 = indexer.index(&long_token);
        let index2 = indexer.index(&long_token);
        assert_eq!(index1, index2, "Long token should be handled consistently");
    }

    /// General indexer properties: determinism and (practical) injectivity
    /// on distinct short tokens.
    #[test]
    fn test_indexer_properties() {
        let indexer = MockHashTokenIndexer;
        let token = "consistent";
        let index1 = indexer.index(token);
        let index2 = indexer.index(token);
        assert_eq!(index1, index2, "Indexer should be deterministic");
        let index_a = indexer.index("a");
        let index_b = indexer.index("b");
        assert_ne!(
            index_a, index_b,
            "Different tokens should generally have different indices"
        );
    }
}