1use crate::types::{Document, DocumentId, Value};
9use std::collections::{HashMap, HashSet};
10
11#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub enum IndexType {
18 Hash,
20 BTree,
22 FullText,
24 Unique,
26}
27
28pub struct DocumentIndex {
34 field: String,
35 index_type: IndexType,
36 hash_index: HashMap<IndexKey, HashSet<DocumentId>>,
37 text_index: Option<InvertedIndex>,
38}
39
40impl DocumentIndex {
41 pub fn new(field: impl Into<String>, index_type: IndexType) -> Self {
43 let text_index = if index_type == IndexType::FullText {
44 Some(InvertedIndex::new())
45 } else {
46 None
47 };
48
49 Self {
50 field: field.into(),
51 index_type,
52 hash_index: HashMap::new(),
53 text_index,
54 }
55 }
56
57 pub fn field(&self) -> &str {
59 &self.field
60 }
61
62 pub fn index_type(&self) -> IndexType {
64 self.index_type
65 }
66
67 pub fn index_document(&mut self, doc: &Document) {
69 let value = doc.get(&self.field);
70
71 if let Some(value) = value {
72 match self.index_type {
73 IndexType::FullText => {
74 if let Some(text) = value.as_str() {
75 if let Some(ref mut idx) = self.text_index {
76 idx.index_document(&doc.id, text);
77 }
78 }
79 }
80 _ => {
81 let key = IndexKey::from_value(value);
82 self.hash_index
83 .entry(key)
84 .or_default()
85 .insert(doc.id.clone());
86 }
87 }
88 }
89 }
90
91 pub fn unindex_document(&mut self, doc: &Document) {
93 let value = doc.get(&self.field);
94
95 if let Some(value) = value {
96 match self.index_type {
97 IndexType::FullText => {
98 if let Some(text) = value.as_str() {
99 if let Some(ref mut idx) = self.text_index {
100 idx.unindex_document(&doc.id, text);
101 }
102 }
103 }
104 _ => {
105 let key = IndexKey::from_value(value);
106 if let Some(ids) = self.hash_index.get_mut(&key) {
107 ids.remove(&doc.id);
108 if ids.is_empty() {
109 self.hash_index.remove(&key);
110 }
111 }
112 }
113 }
114 }
115 }
116
117 pub fn find_eq(&self, value: &Value) -> Vec<DocumentId> {
119 let key = IndexKey::from_value(value);
120 self.hash_index
121 .get(&key)
122 .map(|ids| ids.iter().cloned().collect())
123 .unwrap_or_default()
124 }
125
126 pub fn search(&self, query: &str) -> Vec<DocumentId> {
128 self.text_index
129 .as_ref()
130 .map(|idx| idx.search(query))
131 .unwrap_or_default()
132 }
133
134 pub fn clear(&mut self) {
136 self.hash_index.clear();
137 if let Some(ref mut idx) = self.text_index {
138 idx.clear();
139 }
140 }
141
142 pub fn key_count(&self) -> usize {
144 self.hash_index.len()
145 }
146}
147
148#[derive(Debug, Clone, PartialEq, Eq, Hash)]
154enum IndexKey {
155 Null,
156 Bool(bool),
157 Int(i64),
158 String(String),
159 Float(OrderedFloat),
160}
161
162impl IndexKey {
163 fn from_value(value: &Value) -> Self {
164 match value {
165 Value::Null => Self::Null,
166 Value::Bool(b) => Self::Bool(*b),
167 Value::Int(n) => Self::Int(*n),
168 Value::Float(f) => Self::Float(OrderedFloat(*f)),
169 Value::String(s) => Self::String(s.clone()),
170 Value::Array(_) | Value::Object(_) => Self::Null,
171 }
172 }
173}
174
175#[derive(Debug, Clone, Copy)]
177struct OrderedFloat(f64);
178
179impl PartialEq for OrderedFloat {
180 fn eq(&self, other: &Self) -> bool {
181 self.0.to_bits() == other.0.to_bits()
182 }
183}
184
185impl Eq for OrderedFloat {}
186
187impl std::hash::Hash for OrderedFloat {
188 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
189 self.0.to_bits().hash(state);
190 }
191}
192
193pub struct InvertedIndex {
199 terms: HashMap<String, HashSet<DocumentId>>,
200 doc_terms: HashMap<DocumentId, HashSet<String>>,
201}
202
203impl InvertedIndex {
204 pub fn new() -> Self {
205 Self {
206 terms: HashMap::new(),
207 doc_terms: HashMap::new(),
208 }
209 }
210
211 pub fn index_document(&mut self, doc_id: &DocumentId, text: &str) {
213 let tokens = self.tokenize(text);
214 let mut doc_term_set = HashSet::new();
215
216 for token in tokens {
217 self.terms
218 .entry(token.clone())
219 .or_default()
220 .insert(doc_id.clone());
221 doc_term_set.insert(token);
222 }
223
224 self.doc_terms.insert(doc_id.clone(), doc_term_set);
225 }
226
227 pub fn unindex_document(&mut self, doc_id: &DocumentId, text: &str) {
229 let tokens = self.tokenize(text);
230
231 for token in &tokens {
232 if let Some(docs) = self.terms.get_mut(token) {
233 docs.remove(doc_id);
234 if docs.is_empty() {
235 self.terms.remove(token);
236 }
237 }
238 }
239
240 self.doc_terms.remove(doc_id);
241 }
242
243 pub fn search(&self, query: &str) -> Vec<DocumentId> {
245 let tokens = self.tokenize(query);
246
247 if tokens.is_empty() {
248 return Vec::new();
249 }
250
251 let mut result: Option<HashSet<DocumentId>> = None;
252
253 for token in tokens {
254 let matching = self.terms.get(&token).cloned().unwrap_or_default();
255
256 result = Some(match result {
257 Some(current) => current.intersection(&matching).cloned().collect(),
258 None => matching,
259 });
260 }
261
262 result
263 .map(|set| set.into_iter().collect())
264 .unwrap_or_default()
265 }
266
267 pub fn search_any(&self, query: &str) -> Vec<DocumentId> {
269 let tokens = self.tokenize(query);
270 let mut result = HashSet::new();
271
272 for token in tokens {
273 if let Some(docs) = self.terms.get(&token) {
274 result.extend(docs.iter().cloned());
275 }
276 }
277
278 result.into_iter().collect()
279 }
280
281 pub fn clear(&mut self) {
283 self.terms.clear();
284 self.doc_terms.clear();
285 }
286
287 pub fn term_count(&self) -> usize {
289 self.terms.len()
290 }
291
292 fn tokenize(&self, text: &str) -> Vec<String> {
293 text.to_lowercase()
294 .split(|c: char| !c.is_alphanumeric())
295 .filter(|s| !s.is_empty() && s.len() >= 2)
296 .map(String::from)
297 .collect()
298 }
299}
300
301impl Default for InvertedIndex {
302 fn default() -> Self {
303 Self::new()
304 }
305}
306
307#[cfg(test)]
312mod tests {
313 use super::*;
314
315 #[test]
316 fn test_hash_index() {
317 let mut index = DocumentIndex::new("status", IndexType::Hash);
318
319 let mut doc1 = Document::with_id("doc1");
320 doc1.set("status", "active");
321
322 let mut doc2 = Document::with_id("doc2");
323 doc2.set("status", "active");
324
325 let mut doc3 = Document::with_id("doc3");
326 doc3.set("status", "inactive");
327
328 index.index_document(&doc1);
329 index.index_document(&doc2);
330 index.index_document(&doc3);
331
332 let result = index.find_eq(&Value::String("active".to_string()));
333 assert_eq!(result.len(), 2);
334
335 let result = index.find_eq(&Value::String("inactive".to_string()));
336 assert_eq!(result.len(), 1);
337 }
338
339 #[test]
340 fn test_inverted_index() {
341 let mut index = InvertedIndex::new();
342
343 let doc1 = DocumentId::new("doc1");
344 let doc2 = DocumentId::new("doc2");
345
346 index.index_document(&doc1, "The quick brown fox");
347 index.index_document(&doc2, "The lazy brown dog");
348
349 let result = index.search("brown");
350 assert_eq!(result.len(), 2);
351
352 let result = index.search("quick");
353 assert_eq!(result.len(), 1);
354
355 let result = index.search("quick brown");
356 assert_eq!(result.len(), 1);
357 }
358
359 #[test]
360 fn test_full_text_index() {
361 let mut index = DocumentIndex::new("content", IndexType::FullText);
362
363 let mut doc1 = Document::with_id("doc1");
364 doc1.set("content", "Hello world");
365
366 let mut doc2 = Document::with_id("doc2");
367 doc2.set("content", "Goodbye world");
368
369 index.index_document(&doc1);
370 index.index_document(&doc2);
371
372 let result = index.search("world");
373 assert_eq!(result.len(), 2);
374
375 let result = index.search("hello");
376 assert_eq!(result.len(), 1);
377 }
378
379 #[test]
380 fn test_unindex() {
381 let mut index = DocumentIndex::new("name", IndexType::Hash);
382
383 let mut doc = Document::with_id("doc1");
384 doc.set("name", "Test");
385
386 index.index_document(&doc);
387 assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 1);
388
389 index.unindex_document(&doc);
390 assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 0);
391 }
392}