bm25_vectorizer/
mocking.rs1use crate::bm25_token_indexer::Bm25TokenIndexer;
7use crate::Bm25Tokenizer;
8use std::cell::RefCell;
9use std::collections::HashMap;
10use std::hash::{DefaultHasher, Hash, Hasher};
11
12pub struct MockWhitespaceTokenizer;
14
15impl Bm25Tokenizer for MockWhitespaceTokenizer {
16 fn tokenize(&self, input_text: &str) -> Vec<String> {
17 input_text
18 .split_whitespace()
19 .map(|s| s.to_lowercase())
20 .collect()
21 }
22}
23
24pub struct MockCasePreservingTokenizer;
26
27impl Bm25Tokenizer for MockCasePreservingTokenizer {
28 fn tokenize(&self, input_text: &str) -> Vec<String> {
29 input_text.split_whitespace().map(String::from).collect()
30 }
31}
32
33pub struct MockPunctuationTokenizer;
35
36impl Bm25Tokenizer for MockPunctuationTokenizer {
37 fn tokenize(&self, input_text: &str) -> Vec<String> {
38 input_text
39 .chars()
40 .filter(|c| c.is_alphanumeric() || c.is_whitespace())
41 .collect::<String>()
42 .split_whitespace()
43 .map(|s| s.to_lowercase())
44 .collect()
45 }
46}
47
48pub struct MockHashTokenIndexer;
50
51impl Bm25TokenIndexer for MockHashTokenIndexer {
52 type Bm25TokenIndex = u64;
53
54 fn index(&self, token: &str) -> Self::Bm25TokenIndex {
55 let mut hasher = DefaultHasher::new();
56 token.hash(&mut hasher);
57 hasher.finish()
58 }
59}
60
61pub struct MockDictionaryTokenIndexer {
63 token_to_id: RefCell<HashMap<String, usize>>,
64 next_id: RefCell<usize>,
65}
66
67impl MockDictionaryTokenIndexer {
68 pub fn new() -> Self {
69 Self {
70 token_to_id: RefCell::new(HashMap::new()),
71 next_id: RefCell::new(0),
72 }
73 }
74}
75
76impl Bm25TokenIndexer for MockDictionaryTokenIndexer {
77 type Bm25TokenIndex = usize;
78
79 fn index(&self, token: &str) -> Self::Bm25TokenIndex {
80 let mut token_map = self.token_to_id.borrow_mut();
81 let mut next_id = self.next_id.borrow_mut();
82
83 if let Some(&id) = token_map.get(token) {
84 id
85 } else {
86 let id = *next_id;
87 token_map.insert(token.to_string(), id);
88 *next_id += 1;
89 id
90 }
91 }
92}
93
94pub struct MockStringTokenIndexer;
96
97impl Bm25TokenIndexer for MockStringTokenIndexer {
98 type Bm25TokenIndex = String;
99
100 fn index(&self, token: &str) -> Self::Bm25TokenIndex {
101 format!("idx_{}", token)
102 }
103}