Skip to main content

aegis_document/
index.rs

1//! Aegis Document Index
2//!
3//! Indexing structures for efficient document queries.
4//!
5//! @version 0.1.0
6//! @author AutomataNexus Development Team
7
8use crate::types::{Document, DocumentId, Value};
9use std::collections::{HashMap, HashSet};
10
11// =============================================================================
12// Index Type
13// =============================================================================
14
15/// Type of document index.
16#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub enum IndexType {
18    /// Hash index for equality lookups.
19    Hash,
20    /// B-tree index for range queries.
21    BTree,
22    /// Full-text search index.
23    FullText,
24    /// Unique index (no duplicates allowed).
25    Unique,
26}
27
28// =============================================================================
29// Document Index
30// =============================================================================
31
32/// Index for efficient document queries.
33pub struct DocumentIndex {
34    field: String,
35    index_type: IndexType,
36    hash_index: HashMap<IndexKey, HashSet<DocumentId>>,
37    text_index: Option<InvertedIndex>,
38}
39
40impl DocumentIndex {
41    /// Create a new index.
42    pub fn new(field: impl Into<String>, index_type: IndexType) -> Self {
43        let text_index = if index_type == IndexType::FullText {
44            Some(InvertedIndex::new())
45        } else {
46            None
47        };
48
49        Self {
50            field: field.into(),
51            index_type,
52            hash_index: HashMap::new(),
53            text_index,
54        }
55    }
56
57    /// Get the indexed field name.
58    pub fn field(&self) -> &str {
59        &self.field
60    }
61
62    /// Get the index type.
63    pub fn index_type(&self) -> IndexType {
64        self.index_type
65    }
66
67    /// Index a document.
68    pub fn index_document(&mut self, doc: &Document) {
69        let value = doc.get(&self.field);
70
71        if let Some(value) = value {
72            match self.index_type {
73                IndexType::FullText => {
74                    if let Some(text) = value.as_str() {
75                        if let Some(ref mut idx) = self.text_index {
76                            idx.index_document(&doc.id, text);
77                        }
78                    }
79                }
80                _ => {
81                    let key = IndexKey::from_value(value);
82                    self.hash_index
83                        .entry(key)
84                        .or_default()
85                        .insert(doc.id.clone());
86                }
87            }
88        }
89    }
90
91    /// Remove a document from the index.
92    pub fn unindex_document(&mut self, doc: &Document) {
93        let value = doc.get(&self.field);
94
95        if let Some(value) = value {
96            match self.index_type {
97                IndexType::FullText => {
98                    if let Some(text) = value.as_str() {
99                        if let Some(ref mut idx) = self.text_index {
100                            idx.unindex_document(&doc.id, text);
101                        }
102                    }
103                }
104                _ => {
105                    let key = IndexKey::from_value(value);
106                    if let Some(ids) = self.hash_index.get_mut(&key) {
107                        ids.remove(&doc.id);
108                        if ids.is_empty() {
109                            self.hash_index.remove(&key);
110                        }
111                    }
112                }
113            }
114        }
115    }
116
117    /// Find documents by exact value.
118    pub fn find_eq(&self, value: &Value) -> Vec<DocumentId> {
119        let key = IndexKey::from_value(value);
120        self.hash_index
121            .get(&key)
122            .map(|ids| ids.iter().cloned().collect())
123            .unwrap_or_default()
124    }
125
126    /// Search full-text index.
127    pub fn search(&self, query: &str) -> Vec<DocumentId> {
128        self.text_index
129            .as_ref()
130            .map(|idx| idx.search(query))
131            .unwrap_or_default()
132    }
133
134    /// Clear the index.
135    pub fn clear(&mut self) {
136        self.hash_index.clear();
137        if let Some(ref mut idx) = self.text_index {
138            idx.clear();
139        }
140    }
141
142    /// Get the number of unique keys.
143    pub fn key_count(&self) -> usize {
144        self.hash_index.len()
145    }
146}
147
148// =============================================================================
149// Index Key
150// =============================================================================
151
152/// Key for hash-based indexing.
153#[derive(Debug, Clone, PartialEq, Eq, Hash)]
154enum IndexKey {
155    Null,
156    Bool(bool),
157    Int(i64),
158    String(String),
159    Float(OrderedFloat),
160}
161
162impl IndexKey {
163    fn from_value(value: &Value) -> Self {
164        match value {
165            Value::Null => Self::Null,
166            Value::Bool(b) => Self::Bool(*b),
167            Value::Int(n) => Self::Int(*n),
168            Value::Float(f) => Self::Float(OrderedFloat(*f)),
169            Value::String(s) => Self::String(s.clone()),
170            Value::Array(_) | Value::Object(_) => Self::Null,
171        }
172    }
173}
174
175/// Wrapper for f64 that implements Eq and Hash.
176#[derive(Debug, Clone, Copy)]
177struct OrderedFloat(f64);
178
179impl PartialEq for OrderedFloat {
180    fn eq(&self, other: &Self) -> bool {
181        self.0.to_bits() == other.0.to_bits()
182    }
183}
184
185impl Eq for OrderedFloat {}
186
187impl std::hash::Hash for OrderedFloat {
188    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
189        self.0.to_bits().hash(state);
190    }
191}
192
193// =============================================================================
194// Inverted Index
195// =============================================================================
196
197/// Inverted index for full-text search.
198pub struct InvertedIndex {
199    terms: HashMap<String, HashSet<DocumentId>>,
200    doc_terms: HashMap<DocumentId, HashSet<String>>,
201}
202
203impl InvertedIndex {
204    pub fn new() -> Self {
205        Self {
206            terms: HashMap::new(),
207            doc_terms: HashMap::new(),
208        }
209    }
210
211    /// Index a document's text.
212    pub fn index_document(&mut self, doc_id: &DocumentId, text: &str) {
213        let tokens = self.tokenize(text);
214        let mut doc_term_set = HashSet::new();
215
216        for token in tokens {
217            self.terms
218                .entry(token.clone())
219                .or_default()
220                .insert(doc_id.clone());
221            doc_term_set.insert(token);
222        }
223
224        self.doc_terms.insert(doc_id.clone(), doc_term_set);
225    }
226
227    /// Remove a document from the index.
228    pub fn unindex_document(&mut self, doc_id: &DocumentId, text: &str) {
229        let tokens = self.tokenize(text);
230
231        for token in &tokens {
232            if let Some(docs) = self.terms.get_mut(token) {
233                docs.remove(doc_id);
234                if docs.is_empty() {
235                    self.terms.remove(token);
236                }
237            }
238        }
239
240        self.doc_terms.remove(doc_id);
241    }
242
243    /// Search for documents containing the query terms.
244    pub fn search(&self, query: &str) -> Vec<DocumentId> {
245        let tokens = self.tokenize(query);
246
247        if tokens.is_empty() {
248            return Vec::new();
249        }
250
251        let mut result: Option<HashSet<DocumentId>> = None;
252
253        for token in tokens {
254            let matching = self.terms.get(&token).cloned().unwrap_or_default();
255
256            result = Some(match result {
257                Some(current) => current.intersection(&matching).cloned().collect(),
258                None => matching,
259            });
260        }
261
262        result
263            .map(|set| set.into_iter().collect())
264            .unwrap_or_default()
265    }
266
267    /// Search with OR semantics.
268    pub fn search_any(&self, query: &str) -> Vec<DocumentId> {
269        let tokens = self.tokenize(query);
270        let mut result = HashSet::new();
271
272        for token in tokens {
273            if let Some(docs) = self.terms.get(&token) {
274                result.extend(docs.iter().cloned());
275            }
276        }
277
278        result.into_iter().collect()
279    }
280
281    /// Clear the index.
282    pub fn clear(&mut self) {
283        self.terms.clear();
284        self.doc_terms.clear();
285    }
286
287    /// Get the number of unique terms.
288    pub fn term_count(&self) -> usize {
289        self.terms.len()
290    }
291
292    fn tokenize(&self, text: &str) -> Vec<String> {
293        text.to_lowercase()
294            .split(|c: char| !c.is_alphanumeric())
295            .filter(|s| !s.is_empty() && s.len() >= 2)
296            .map(String::from)
297            .collect()
298    }
299}
300
301impl Default for InvertedIndex {
302    fn default() -> Self {
303        Self::new()
304    }
305}
306
307// =============================================================================
308// Tests
309// =============================================================================
310
311#[cfg(test)]
312mod tests {
313    use super::*;
314
315    #[test]
316    fn test_hash_index() {
317        let mut index = DocumentIndex::new("status", IndexType::Hash);
318
319        let mut doc1 = Document::with_id("doc1");
320        doc1.set("status", "active");
321
322        let mut doc2 = Document::with_id("doc2");
323        doc2.set("status", "active");
324
325        let mut doc3 = Document::with_id("doc3");
326        doc3.set("status", "inactive");
327
328        index.index_document(&doc1);
329        index.index_document(&doc2);
330        index.index_document(&doc3);
331
332        let result = index.find_eq(&Value::String("active".to_string()));
333        assert_eq!(result.len(), 2);
334
335        let result = index.find_eq(&Value::String("inactive".to_string()));
336        assert_eq!(result.len(), 1);
337    }
338
339    #[test]
340    fn test_inverted_index() {
341        let mut index = InvertedIndex::new();
342
343        let doc1 = DocumentId::new("doc1");
344        let doc2 = DocumentId::new("doc2");
345
346        index.index_document(&doc1, "The quick brown fox");
347        index.index_document(&doc2, "The lazy brown dog");
348
349        let result = index.search("brown");
350        assert_eq!(result.len(), 2);
351
352        let result = index.search("quick");
353        assert_eq!(result.len(), 1);
354
355        let result = index.search("quick brown");
356        assert_eq!(result.len(), 1);
357    }
358
359    #[test]
360    fn test_full_text_index() {
361        let mut index = DocumentIndex::new("content", IndexType::FullText);
362
363        let mut doc1 = Document::with_id("doc1");
364        doc1.set("content", "Hello world");
365
366        let mut doc2 = Document::with_id("doc2");
367        doc2.set("content", "Goodbye world");
368
369        index.index_document(&doc1);
370        index.index_document(&doc2);
371
372        let result = index.search("world");
373        assert_eq!(result.len(), 2);
374
375        let result = index.search("hello");
376        assert_eq!(result.len(), 1);
377    }
378
379    #[test]
380    fn test_unindex() {
381        let mut index = DocumentIndex::new("name", IndexType::Hash);
382
383        let mut doc = Document::with_id("doc1");
384        doc.set("name", "Test");
385
386        index.index_document(&doc);
387        assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 1);
388
389        index.unindex_document(&doc);
390        assert_eq!(index.find_eq(&Value::String("Test".to_string())).len(), 0);
391    }
392}