bm25_vectorizer/
mocking.rs

1//! This module contains simple tokenizers and indexers that are used for testing and examples and not designed for
2//! real use cases.  Real use cases could involve performance optimisations and additional pre-processing steps such as:
3//! stop word removal, stemming/lemmatisation, punctuation removal, n-grams, handling language specific features, etc.
4//!  
5
6use crate::bm25_token_indexer::Bm25TokenIndexer;
7use crate::Bm25Tokenizer;
8use std::cell::RefCell;
9use std::collections::HashMap;
10use std::hash::{DefaultHasher, Hash, Hasher};
11
12/// Simple whitespace tokenizer
13pub struct MockWhitespaceTokenizer;
14
15impl Bm25Tokenizer for MockWhitespaceTokenizer {
16    fn tokenize(&self, input_text: &str) -> Vec<String> {
17        input_text
18            .split_whitespace()
19            .map(|s| s.to_lowercase())
20            .collect()
21    }
22}
23
24/// Case-preserving tokenizer
25pub struct MockCasePreservingTokenizer;
26
27impl Bm25Tokenizer for MockCasePreservingTokenizer {
28    fn tokenize(&self, input_text: &str) -> Vec<String> {
29        input_text.split_whitespace().map(String::from).collect()
30    }
31}
32
33/// Punctuation-aware tokenizer
34pub struct MockPunctuationTokenizer;
35
36impl Bm25Tokenizer for MockPunctuationTokenizer {
37    fn tokenize(&self, input_text: &str) -> Vec<String> {
38        input_text
39            .chars()
40            .filter(|c| c.is_alphanumeric() || c.is_whitespace())
41            .collect::<String>()
42            .split_whitespace()
43            .map(|s| s.to_lowercase())
44            .collect()
45    }
46}
47
48/// Hash-based token indexer
49pub struct MockHashTokenIndexer;
50
51impl Bm25TokenIndexer for MockHashTokenIndexer {
52    type Bm25TokenIndex = u64;
53
54    fn index(&self, token: &str) -> Self::Bm25TokenIndex {
55        let mut hasher = DefaultHasher::new();
56        token.hash(&mut hasher);
57        hasher.finish()
58    }
59}
60
61/// Dictionary-based token indexer with interior mutability
62pub struct MockDictionaryTokenIndexer {
63    token_to_id: RefCell<HashMap<String, usize>>,
64    next_id: RefCell<usize>,
65}
66
67impl MockDictionaryTokenIndexer {
68    pub fn new() -> Self {
69        Self {
70            token_to_id: RefCell::new(HashMap::new()),
71            next_id: RefCell::new(0),
72        }
73    }
74}
75
76impl Bm25TokenIndexer for MockDictionaryTokenIndexer {
77    type Bm25TokenIndex = usize;
78
79    fn index(&self, token: &str) -> Self::Bm25TokenIndex {
80        let mut token_map = self.token_to_id.borrow_mut();
81        let mut next_id = self.next_id.borrow_mut();
82
83        if let Some(&id) = token_map.get(token) {
84            id
85        } else {
86            let id = *next_id;
87            token_map.insert(token.to_string(), id);
88            *next_id += 1;
89            id
90        }
91    }
92}
93
94/// String-based token indexer
95pub struct MockStringTokenIndexer;
96
97impl Bm25TokenIndexer for MockStringTokenIndexer {
98    type Bm25TokenIndex = String;
99
100    fn index(&self, token: &str) -> Self::Bm25TokenIndex {
101        format!("idx_{}", token)
102    }
103}