omendb_core/omen/
metadata.rs

1//! Metadata indexing with Roaring bitmaps for O(1) filter evaluation
2//!
3//! Qdrant-style payload indexes: each indexed field has an inverted index
4//! mapping values to document IDs via Roaring bitmaps.
5
6use roaring::RoaringBitmap;
7use std::collections::HashMap;
8use std::io::{self, Read, Write};
9
10/// Field types for metadata indexing
11#[allow(dead_code)]
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13#[repr(u8)]
14pub enum FieldType {
15    /// String equality (inverted index)
16    Keyword = 1,
17    /// Integer equality/range
18    Integer = 2,
19    /// Float range
20    Float = 3,
21    /// Boolean (two bitmaps: true/false)
22    Boolean = 4,
23}
24
25impl From<u8> for FieldType {
26    fn from(v: u8) -> Self {
27        match v {
28            2 => Self::Integer,
29            3 => Self::Float,
30            4 => Self::Boolean,
31            _ => Self::Keyword, // 1 or unknown -> Keyword
32        }
33    }
34}
35
36/// Keyword index - inverted index for string equality
37#[derive(Debug, Clone, Default)]
38pub struct KeywordIndex {
39    /// term -> bitmap of doc IDs
40    terms: HashMap<String, RoaringBitmap>,
41}
42
43impl KeywordIndex {
44    pub fn new() -> Self {
45        Self::default()
46    }
47
48    /// Add a document with the given term
49    pub fn insert(&mut self, doc_id: u32, term: &str) {
50        self.terms
51            .entry(term.to_string())
52            .or_default()
53            .insert(doc_id);
54    }
55
56    /// Remove a document from all terms
57    pub fn remove(&mut self, doc_id: u32) {
58        for bitmap in self.terms.values_mut() {
59            bitmap.remove(doc_id);
60        }
61    }
62
63    /// Get documents matching a term
64    pub fn get(&self, term: &str) -> Option<&RoaringBitmap> {
65        self.terms.get(term)
66    }
67
68    /// Check if a document has a specific term
69    #[inline]
70    pub fn contains(&self, doc_id: u32, term: &str) -> bool {
71        self.terms
72            .get(term)
73            .is_some_and(|bitmap| bitmap.contains(doc_id))
74    }
75
76    /// Get all terms
77    pub fn terms(&self) -> impl Iterator<Item = &str> {
78        self.terms.keys().map(std::string::String::as_str)
79    }
80
81    /// Serialize to bytes
82    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
83        // Write term count
84        writer.write_all(&(self.terms.len() as u32).to_le_bytes())?;
85
86        for (term, bitmap) in &self.terms {
87            // Write term (length-prefixed)
88            writer.write_all(&(term.len() as u32).to_le_bytes())?;
89            writer.write_all(term.as_bytes())?;
90
91            // Write bitmap (use native serialization)
92            let mut bitmap_bytes = Vec::new();
93            bitmap.serialize_into(&mut bitmap_bytes)?;
94            writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
95            writer.write_all(&bitmap_bytes)?;
96        }
97
98        Ok(())
99    }
100
101    /// Deserialize from bytes
102    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
103        let mut len_buf = [0u8; 4];
104        reader.read_exact(&mut len_buf)?;
105        let term_count = u32::from_le_bytes(len_buf) as usize;
106
107        let mut terms = HashMap::with_capacity(term_count);
108
109        for _ in 0..term_count {
110            // Read term
111            reader.read_exact(&mut len_buf)?;
112            let term_len = u32::from_le_bytes(len_buf) as usize;
113            let mut term_buf = vec![0u8; term_len];
114            reader.read_exact(&mut term_buf)?;
115            let term = String::from_utf8_lossy(&term_buf).to_string();
116
117            // Read bitmap
118            reader.read_exact(&mut len_buf)?;
119            let bitmap_len = u32::from_le_bytes(len_buf) as usize;
120            let mut bitmap_buf = vec![0u8; bitmap_len];
121            reader.read_exact(&mut bitmap_buf)?;
122            let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
123                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
124
125            terms.insert(term, bitmap);
126        }
127
128        Ok(Self { terms })
129    }
130}
131
132/// Boolean index - two bitmaps for true/false
133#[derive(Debug, Clone, Default)]
134pub struct BooleanIndex {
135    true_docs: RoaringBitmap,
136    false_docs: RoaringBitmap,
137}
138
139impl BooleanIndex {
140    pub fn new() -> Self {
141        Self::default()
142    }
143
144    pub fn insert(&mut self, doc_id: u32, value: bool) {
145        if value {
146            self.true_docs.insert(doc_id);
147            self.false_docs.remove(doc_id);
148        } else {
149            self.false_docs.insert(doc_id);
150            self.true_docs.remove(doc_id);
151        }
152    }
153
154    pub fn remove(&mut self, doc_id: u32) {
155        self.true_docs.remove(doc_id);
156        self.false_docs.remove(doc_id);
157    }
158
159    #[inline]
160    pub fn matches(&self, doc_id: u32, value: bool) -> bool {
161        if value {
162            self.true_docs.contains(doc_id)
163        } else {
164            self.false_docs.contains(doc_id)
165        }
166    }
167
168    pub fn get_true(&self) -> &RoaringBitmap {
169        &self.true_docs
170    }
171
172    pub fn get_false(&self) -> &RoaringBitmap {
173        &self.false_docs
174    }
175}
176
177/// Numeric index for integer/float range queries
178#[derive(Debug, Clone, Default)]
179pub struct NumericIndex {
180    /// Sorted (value, `doc_id`) pairs for range queries
181    entries: Vec<(f64, u32)>,
182    /// Optional: bitmap for common values (equality fast path)
183    common_values: HashMap<i64, RoaringBitmap>,
184}
185
186impl NumericIndex {
187    pub fn new() -> Self {
188        Self::default()
189    }
190
191    pub fn insert(&mut self, doc_id: u32, value: f64) {
192        // Skip NaN values (can't be compared or indexed meaningfully)
193        if value.is_nan() {
194            return;
195        }
196
197        // Add to sorted entries (total_cmp handles -0.0 vs 0.0, NaN already filtered)
198        let pos = self
199            .entries
200            .binary_search_by(|(v, _)| v.total_cmp(&value))
201            .unwrap_or_else(|p| p);
202        self.entries.insert(pos, (value, doc_id));
203
204        // Track common integer values for fast equality
205        if value.fract() == 0.0 && value >= i64::MIN as f64 && value <= i64::MAX as f64 {
206            let int_val = value as i64;
207            self.common_values
208                .entry(int_val)
209                .or_default()
210                .insert(doc_id);
211        }
212    }
213
214    pub fn remove(&mut self, doc_id: u32) {
215        self.entries.retain(|(_, id)| *id != doc_id);
216        for bitmap in self.common_values.values_mut() {
217            bitmap.remove(doc_id);
218        }
219    }
220
221    /// Get documents where value == target (fast path for integers)
222    pub fn get_eq(&self, value: f64) -> Option<&RoaringBitmap> {
223        if value.fract() == 0.0 {
224            self.common_values.get(&(value as i64))
225        } else {
226            None
227        }
228    }
229
230    /// Get documents where value is in range [min, max]
231    pub fn get_range(&self, min: f64, max: f64) -> RoaringBitmap {
232        // Use partition_point to find FIRST position where value >= min
233        // This handles duplicates correctly (binary_search may find any duplicate)
234        let start = self.entries.partition_point(|(v, _)| *v < min);
235
236        let mut result = RoaringBitmap::new();
237        for (val, doc_id) in &self.entries[start..] {
238            if *val > max {
239                break;
240            }
241            result.insert(*doc_id);
242        }
243        result
244    }
245
246    /// Check if document matches value
247    #[inline]
248    pub fn matches_eq(&self, doc_id: u32, value: f64) -> bool {
249        if let Some(bitmap) = self.get_eq(value) {
250            bitmap.contains(doc_id)
251        } else {
252            // Slow path: linear scan
253            self.entries
254                .iter()
255                .any(|(v, id)| *id == doc_id && (*v - value).abs() < f64::EPSILON)
256        }
257    }
258
259    /// Check if document is in range
260    #[inline]
261    pub fn matches_range(&self, doc_id: u32, min: f64, max: f64) -> bool {
262        self.entries
263            .iter()
264            .any(|(v, id)| *id == doc_id && *v >= min && *v <= max)
265    }
266}
267
268/// Field index - wraps different index types
269#[derive(Debug, Clone)]
270pub enum FieldIndex {
271    Keyword(KeywordIndex),
272    Boolean(BooleanIndex),
273    Numeric(NumericIndex),
274}
275
276impl FieldIndex {
277    #[must_use]
278    pub fn keyword() -> Self {
279        Self::Keyword(KeywordIndex::new())
280    }
281
282    #[must_use]
283    pub fn boolean() -> Self {
284        Self::Boolean(BooleanIndex::new())
285    }
286
287    #[must_use]
288    pub fn numeric() -> Self {
289        Self::Numeric(NumericIndex::new())
290    }
291}
292
293/// Metadata index - collection of field indexes
294#[derive(Debug, Clone, Default)]
295pub struct MetadataIndex {
296    fields: HashMap<String, FieldIndex>,
297}
298
299impl MetadataIndex {
300    #[must_use]
301    pub fn new() -> Self {
302        Self::default()
303    }
304
305    /// Create or get a keyword index for a field.
306    /// Returns None if the field exists with a different type.
307    pub fn keyword_index(&mut self, field: &str) -> Option<&mut KeywordIndex> {
308        let index = self
309            .fields
310            .entry(field.to_string())
311            .or_insert_with(FieldIndex::keyword);
312        match index {
313            FieldIndex::Keyword(idx) => Some(idx),
314            _ => None, // Field exists with different type
315        }
316    }
317
318    /// Create or get a boolean index for a field.
319    /// Returns None if the field exists with a different type.
320    pub fn boolean_index(&mut self, field: &str) -> Option<&mut BooleanIndex> {
321        let index = self
322            .fields
323            .entry(field.to_string())
324            .or_insert_with(FieldIndex::boolean);
325        match index {
326            FieldIndex::Boolean(idx) => Some(idx),
327            _ => None, // Field exists with different type
328        }
329    }
330
331    /// Create or get a numeric index for a field.
332    /// Returns None if the field exists with a different type.
333    pub fn numeric_index(&mut self, field: &str) -> Option<&mut NumericIndex> {
334        let index = self
335            .fields
336            .entry(field.to_string())
337            .or_insert_with(FieldIndex::numeric);
338        match index {
339            FieldIndex::Numeric(idx) => Some(idx),
340            _ => None, // Field exists with different type
341        }
342    }
343
344    /// Get a field index
345    #[must_use]
346    pub fn get(&self, field: &str) -> Option<&FieldIndex> {
347        self.fields.get(field)
348    }
349
350    /// Index a JSON metadata object.
351    /// Silently skips fields with inconsistent types across documents.
352    pub fn index_json(&mut self, doc_id: u32, metadata: &serde_json::Value) {
353        if let serde_json::Value::Object(map) = metadata {
354            for (key, value) in map {
355                match value {
356                    serde_json::Value::String(s) => {
357                        if let Some(idx) = self.keyword_index(key) {
358                            idx.insert(doc_id, s);
359                        }
360                    }
361                    serde_json::Value::Bool(b) => {
362                        if let Some(idx) = self.boolean_index(key) {
363                            idx.insert(doc_id, *b);
364                        }
365                    }
366                    serde_json::Value::Number(n) => {
367                        if let Some(f) = n.as_f64() {
368                            if let Some(idx) = self.numeric_index(key) {
369                                idx.insert(doc_id, f);
370                            }
371                        }
372                    }
373                    _ => {} // Skip arrays and nested objects
374                }
375            }
376        }
377    }
378
379    /// Remove a document from all indexes
380    pub fn remove(&mut self, doc_id: u32) {
381        for index in self.fields.values_mut() {
382            match index {
383                FieldIndex::Keyword(idx) => idx.remove(doc_id),
384                FieldIndex::Boolean(idx) => idx.remove(doc_id),
385                FieldIndex::Numeric(idx) => idx.remove(doc_id),
386            }
387        }
388    }
389
390    /// Evaluate a filter expression (returns true if matches)
391    #[inline]
392    #[must_use]
393    pub fn matches(&self, doc_id: u32, filter: &Filter) -> bool {
394        match filter {
395            Filter::Eq(field, value) => self.matches_eq(doc_id, field, value),
396            Filter::Ne(field, value) => !self.matches_eq(doc_id, field, value),
397            Filter::Gt(field, value) => self.matches_gt(doc_id, field, *value),
398            Filter::Gte(field, value) => self.matches_gte(doc_id, field, *value),
399            Filter::Lt(field, value) => self.matches_lt(doc_id, field, *value),
400            Filter::Lte(field, value) => self.matches_lte(doc_id, field, *value),
401            Filter::In(field, values) => values.iter().any(|v| self.matches_eq(doc_id, field, v)),
402            Filter::And(filters) => filters.iter().all(|f| self.matches(doc_id, f)),
403            Filter::Or(filters) => filters.iter().any(|f| self.matches(doc_id, f)),
404            Filter::Not(inner) => !self.matches(doc_id, inner),
405        }
406    }
407
408    fn matches_eq(&self, doc_id: u32, field: &str, value: &FilterValue) -> bool {
409        match (self.get(field), value) {
410            (Some(FieldIndex::Keyword(idx)), FilterValue::String(s)) => idx.contains(doc_id, s),
411            (Some(FieldIndex::Boolean(idx)), FilterValue::Bool(b)) => idx.matches(doc_id, *b),
412            (Some(FieldIndex::Numeric(idx)), FilterValue::Number(n)) => idx.matches_eq(doc_id, *n),
413            _ => false,
414        }
415    }
416
417    fn matches_gt(&self, doc_id: u32, field: &str, value: f64) -> bool {
418        match self.get(field) {
419            Some(FieldIndex::Numeric(idx)) => {
420                idx.matches_range(doc_id, value + f64::EPSILON, f64::MAX)
421            }
422            _ => false,
423        }
424    }
425
426    fn matches_gte(&self, doc_id: u32, field: &str, value: f64) -> bool {
427        match self.get(field) {
428            Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, value, f64::MAX),
429            _ => false,
430        }
431    }
432
433    fn matches_lt(&self, doc_id: u32, field: &str, value: f64) -> bool {
434        match self.get(field) {
435            Some(FieldIndex::Numeric(idx)) => {
436                idx.matches_range(doc_id, f64::MIN, value - f64::EPSILON)
437            }
438            _ => false,
439        }
440    }
441
442    fn matches_lte(&self, doc_id: u32, field: &str, value: f64) -> bool {
443        match self.get(field) {
444            Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, f64::MIN, value),
445            _ => false,
446        }
447    }
448}
449
450/// Filter value types
451#[derive(Debug, Clone)]
452pub enum FilterValue {
453    String(String),
454    Number(f64),
455    Bool(bool),
456}
457
458/// Filter expressions
459#[derive(Debug, Clone)]
460pub enum Filter {
461    Eq(String, FilterValue),
462    Ne(String, FilterValue),
463    Gt(String, f64),
464    Gte(String, f64),
465    Lt(String, f64),
466    Lte(String, f64),
467    In(String, Vec<FilterValue>),
468    And(Vec<Filter>),
469    Or(Vec<Filter>),
470    Not(Box<Filter>),
471}
472
473#[cfg(test)]
474mod tests {
475    use super::*;
476    use serde_json::json;
477
478    #[test]
479    fn test_keyword_index() {
480        let mut idx = KeywordIndex::new();
481        idx.insert(1, "apple");
482        idx.insert(2, "apple");
483        idx.insert(3, "banana");
484
485        assert!(idx.contains(1, "apple"));
486        assert!(idx.contains(2, "apple"));
487        assert!(!idx.contains(3, "apple"));
488        assert!(idx.contains(3, "banana"));
489
490        let bitmap = idx.get("apple").unwrap();
491        assert_eq!(bitmap.len(), 2);
492    }
493
494    #[test]
495    fn test_boolean_index() {
496        let mut idx = BooleanIndex::new();
497        idx.insert(1, true);
498        idx.insert(2, false);
499        idx.insert(3, true);
500
501        assert!(idx.matches(1, true));
502        assert!(!idx.matches(1, false));
503        assert!(idx.matches(2, false));
504        assert!(idx.matches(3, true));
505    }
506
507    #[test]
508    fn test_numeric_index() {
509        let mut idx = NumericIndex::new();
510        idx.insert(1, 10.0);
511        idx.insert(2, 20.0);
512        idx.insert(3, 30.0);
513
514        // Equality
515        assert!(idx.matches_eq(1, 10.0));
516        assert!(!idx.matches_eq(1, 20.0));
517
518        // Range
519        let range = idx.get_range(15.0, 25.0);
520        assert!(range.contains(2));
521        assert!(!range.contains(1));
522        assert!(!range.contains(3));
523    }
524
525    #[test]
526    fn test_metadata_index_json() {
527        let mut idx = MetadataIndex::new();
528
529        idx.index_json(1, &json!({"category": "tech", "score": 85, "active": true}));
530        idx.index_json(
531            2,
532            &json!({"category": "tech", "score": 92, "active": false}),
533        );
534        idx.index_json(
535            3,
536            &json!({"category": "science", "score": 78, "active": true}),
537        );
538
539        // Keyword filter
540        let filter = Filter::Eq("category".into(), FilterValue::String("tech".into()));
541        assert!(idx.matches(1, &filter));
542        assert!(idx.matches(2, &filter));
543        assert!(!idx.matches(3, &filter));
544
545        // Numeric filter
546        let filter = Filter::Gte("score".into(), 80.0);
547        assert!(idx.matches(1, &filter));
548        assert!(idx.matches(2, &filter));
549        assert!(!idx.matches(3, &filter));
550
551        // Boolean filter
552        let filter = Filter::Eq("active".into(), FilterValue::Bool(true));
553        assert!(idx.matches(1, &filter));
554        assert!(!idx.matches(2, &filter));
555        assert!(idx.matches(3, &filter));
556
557        // Combined filter
558        let filter = Filter::And(vec![
559            Filter::Eq("category".into(), FilterValue::String("tech".into())),
560            Filter::Eq("active".into(), FilterValue::Bool(true)),
561        ]);
562        assert!(idx.matches(1, &filter));
563        assert!(!idx.matches(2, &filter));
564        assert!(!idx.matches(3, &filter));
565    }
566
567    #[test]
568    fn test_keyword_serialize_roundtrip() {
569        let mut idx = KeywordIndex::new();
570        idx.insert(1, "apple");
571        idx.insert(2, "apple");
572        idx.insert(3, "banana");
573
574        let mut buf = Vec::new();
575        idx.serialize(&mut buf).unwrap();
576
577        let mut cursor = std::io::Cursor::new(&buf);
578        let idx2 = KeywordIndex::deserialize(&mut cursor).unwrap();
579
580        assert!(idx2.contains(1, "apple"));
581        assert!(idx2.contains(2, "apple"));
582        assert!(idx2.contains(3, "banana"));
583    }
584}