omendb_core/omen/
metadata.rs

1//! Metadata indexing with Roaring bitmaps for O(1) filter evaluation
2//!
3//! Qdrant-style payload indexes: each indexed field has an inverted index
4//! mapping values to document IDs via Roaring bitmaps.
5
6use roaring::RoaringBitmap;
7use std::collections::HashMap;
8use std::io::{self, Read, Write};
9
10/// Field types for metadata indexing
11#[allow(dead_code)]
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13#[repr(u8)]
14pub enum FieldType {
15    /// String equality (inverted index)
16    Keyword = 1,
17    /// Integer equality/range
18    Integer = 2,
19    /// Float range
20    Float = 3,
21    /// Boolean (two bitmaps: true/false)
22    Boolean = 4,
23}
24
25impl From<u8> for FieldType {
26    fn from(v: u8) -> Self {
27        match v {
28            2 => Self::Integer,
29            3 => Self::Float,
30            4 => Self::Boolean,
31            _ => Self::Keyword, // 1 or unknown -> Keyword
32        }
33    }
34}
35
36/// Keyword index - inverted index for string equality
37#[derive(Debug, Clone, Default)]
38pub struct KeywordIndex {
39    /// term -> bitmap of doc IDs
40    terms: HashMap<String, RoaringBitmap>,
41}
42
43impl KeywordIndex {
44    pub fn new() -> Self {
45        Self::default()
46    }
47
48    /// Add a document with the given term
49    pub fn insert(&mut self, doc_id: u32, term: &str) {
50        self.terms
51            .entry(term.to_string())
52            .or_default()
53            .insert(doc_id);
54    }
55
56    /// Remove a document from all terms
57    pub fn remove(&mut self, doc_id: u32) {
58        for bitmap in self.terms.values_mut() {
59            bitmap.remove(doc_id);
60        }
61        // Clean up empty bitmaps to prevent memory leak
62        self.terms.retain(|_, bitmap| !bitmap.is_empty());
63    }
64
65    /// Get documents matching a term
66    pub fn get(&self, term: &str) -> Option<&RoaringBitmap> {
67        self.terms.get(term)
68    }
69
70    /// Check if a document has a specific term
71    #[inline]
72    pub fn contains(&self, doc_id: u32, term: &str) -> bool {
73        self.terms
74            .get(term)
75            .is_some_and(|bitmap| bitmap.contains(doc_id))
76    }
77
78    /// Get all terms
79    pub fn terms(&self) -> impl Iterator<Item = &str> {
80        self.terms.keys().map(std::string::String::as_str)
81    }
82
83    /// Serialize to bytes
84    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
85        // Write term count
86        writer.write_all(&(self.terms.len() as u32).to_le_bytes())?;
87
88        for (term, bitmap) in &self.terms {
89            // Write term (length-prefixed)
90            writer.write_all(&(term.len() as u32).to_le_bytes())?;
91            writer.write_all(term.as_bytes())?;
92
93            // Write bitmap (use native serialization)
94            let mut bitmap_bytes = Vec::new();
95            bitmap.serialize_into(&mut bitmap_bytes)?;
96            writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
97            writer.write_all(&bitmap_bytes)?;
98        }
99
100        Ok(())
101    }
102
103    /// Deserialize from bytes
104    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
105        let mut len_buf = [0u8; 4];
106        reader.read_exact(&mut len_buf)?;
107        let term_count = u32::from_le_bytes(len_buf) as usize;
108
109        let mut terms = HashMap::with_capacity(term_count);
110
111        for _ in 0..term_count {
112            // Read term
113            reader.read_exact(&mut len_buf)?;
114            let term_len = u32::from_le_bytes(len_buf) as usize;
115            let mut term_buf = vec![0u8; term_len];
116            reader.read_exact(&mut term_buf)?;
117            let term = String::from_utf8(term_buf)
118                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
119
120            // Read bitmap
121            reader.read_exact(&mut len_buf)?;
122            let bitmap_len = u32::from_le_bytes(len_buf) as usize;
123            let mut bitmap_buf = vec![0u8; bitmap_len];
124            reader.read_exact(&mut bitmap_buf)?;
125            let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
126                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
127
128            terms.insert(term, bitmap);
129        }
130
131        Ok(Self { terms })
132    }
133}
134
135/// Boolean index - two bitmaps for true/false
136#[derive(Debug, Clone, Default)]
137pub struct BooleanIndex {
138    true_docs: RoaringBitmap,
139    false_docs: RoaringBitmap,
140}
141
142impl BooleanIndex {
143    pub fn new() -> Self {
144        Self::default()
145    }
146
147    pub fn insert(&mut self, doc_id: u32, value: bool) {
148        if value {
149            self.true_docs.insert(doc_id);
150            self.false_docs.remove(doc_id);
151        } else {
152            self.false_docs.insert(doc_id);
153            self.true_docs.remove(doc_id);
154        }
155    }
156
157    pub fn remove(&mut self, doc_id: u32) {
158        self.true_docs.remove(doc_id);
159        self.false_docs.remove(doc_id);
160    }
161
162    #[inline]
163    pub fn matches(&self, doc_id: u32, value: bool) -> bool {
164        if value {
165            self.true_docs.contains(doc_id)
166        } else {
167            self.false_docs.contains(doc_id)
168        }
169    }
170
171    pub fn get_true(&self) -> &RoaringBitmap {
172        &self.true_docs
173    }
174
175    pub fn get_false(&self) -> &RoaringBitmap {
176        &self.false_docs
177    }
178
179    /// Serialize to bytes
180    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
181        // Write true_docs bitmap
182        let mut true_bytes = Vec::new();
183        self.true_docs.serialize_into(&mut true_bytes)?;
184        writer.write_all(&(true_bytes.len() as u32).to_le_bytes())?;
185        writer.write_all(&true_bytes)?;
186
187        // Write false_docs bitmap
188        let mut false_bytes = Vec::new();
189        self.false_docs.serialize_into(&mut false_bytes)?;
190        writer.write_all(&(false_bytes.len() as u32).to_le_bytes())?;
191        writer.write_all(&false_bytes)?;
192
193        Ok(())
194    }
195
196    /// Deserialize from bytes
197    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
198        let mut len_buf = [0u8; 4];
199
200        // Read true_docs bitmap
201        reader.read_exact(&mut len_buf)?;
202        let true_len = u32::from_le_bytes(len_buf) as usize;
203        let mut true_buf = vec![0u8; true_len];
204        reader.read_exact(&mut true_buf)?;
205        let true_docs = RoaringBitmap::deserialize_from(&true_buf[..])
206            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
207
208        // Read false_docs bitmap
209        reader.read_exact(&mut len_buf)?;
210        let false_len = u32::from_le_bytes(len_buf) as usize;
211        let mut false_buf = vec![0u8; false_len];
212        reader.read_exact(&mut false_buf)?;
213        let false_docs = RoaringBitmap::deserialize_from(&false_buf[..])
214            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
215
216        Ok(Self {
217            true_docs,
218            false_docs,
219        })
220    }
221}
222
223/// Numeric index for integer/float range queries
224#[derive(Debug, Clone, Default)]
225pub struct NumericIndex {
226    /// Sorted (value, `doc_id`) pairs for range queries
227    entries: Vec<(f64, u32)>,
228    /// Optional: bitmap for common values (equality fast path)
229    common_values: HashMap<i64, RoaringBitmap>,
230}
231
232impl NumericIndex {
233    pub fn new() -> Self {
234        Self::default()
235    }
236
237    pub fn insert(&mut self, doc_id: u32, value: f64) {
238        // Skip NaN values (can't be compared or indexed meaningfully)
239        if value.is_nan() {
240            return;
241        }
242
243        // Add to sorted entries (total_cmp handles -0.0 vs 0.0, NaN already filtered)
244        let pos = self
245            .entries
246            .binary_search_by(|(v, _)| v.total_cmp(&value))
247            .unwrap_or_else(|p| p);
248        self.entries.insert(pos, (value, doc_id));
249
250        // Track common integer values for fast equality
251        if value.fract() == 0.0 && value >= i64::MIN as f64 && value <= i64::MAX as f64 {
252            let int_val = value as i64;
253            self.common_values
254                .entry(int_val)
255                .or_default()
256                .insert(doc_id);
257        }
258    }
259
260    pub fn remove(&mut self, doc_id: u32) {
261        self.entries.retain(|(_, id)| *id != doc_id);
262        for bitmap in self.common_values.values_mut() {
263            bitmap.remove(doc_id);
264        }
265        // Clean up empty bitmaps to prevent memory leak
266        self.common_values.retain(|_, bitmap| !bitmap.is_empty());
267    }
268
269    /// Get documents where value == target (fast path for integers)
270    pub fn get_eq(&self, value: f64) -> Option<&RoaringBitmap> {
271        if value.fract() == 0.0 {
272            self.common_values.get(&(value as i64))
273        } else {
274            None
275        }
276    }
277
278    /// Get documents where value is in range [min, max]
279    pub fn get_range(&self, min: f64, max: f64) -> RoaringBitmap {
280        // Use partition_point to find FIRST position where value >= min
281        // This handles duplicates correctly (binary_search may find any duplicate)
282        let start = self.entries.partition_point(|(v, _)| *v < min);
283
284        let mut result = RoaringBitmap::new();
285        for (val, doc_id) in &self.entries[start..] {
286            if *val > max {
287                break;
288            }
289            result.insert(*doc_id);
290        }
291        result
292    }
293
294    /// Check if document matches value
295    #[inline]
296    pub fn matches_eq(&self, doc_id: u32, value: f64) -> bool {
297        if let Some(bitmap) = self.get_eq(value) {
298            bitmap.contains(doc_id)
299        } else {
300            // Slow path: linear scan
301            self.entries
302                .iter()
303                .any(|(v, id)| *id == doc_id && (*v - value).abs() < f64::EPSILON)
304        }
305    }
306
307    /// Check if document is in range [min, max] (inclusive)
308    #[inline]
309    pub fn matches_range(&self, doc_id: u32, min: f64, max: f64) -> bool {
310        self.entries
311            .iter()
312            .any(|(v, id)| *id == doc_id && *v >= min && *v <= max)
313    }
314
315    /// Check if document value > threshold (strict greater than)
316    #[inline]
317    pub fn matches_gt(&self, doc_id: u32, threshold: f64) -> bool {
318        self.entries
319            .iter()
320            .any(|(v, id)| *id == doc_id && *v > threshold)
321    }
322
323    /// Check if document value < threshold (strict less than)
324    #[inline]
325    pub fn matches_lt(&self, doc_id: u32, threshold: f64) -> bool {
326        self.entries
327            .iter()
328            .any(|(v, id)| *id == doc_id && *v < threshold)
329    }
330
331    /// Serialize to bytes
332    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
333        // Write entries count and data
334        writer.write_all(&(self.entries.len() as u32).to_le_bytes())?;
335        for (value, doc_id) in &self.entries {
336            writer.write_all(&value.to_le_bytes())?;
337            writer.write_all(&doc_id.to_le_bytes())?;
338        }
339
340        // Write common_values count and data
341        writer.write_all(&(self.common_values.len() as u32).to_le_bytes())?;
342        for (int_val, bitmap) in &self.common_values {
343            writer.write_all(&int_val.to_le_bytes())?;
344            let mut bitmap_bytes = Vec::new();
345            bitmap.serialize_into(&mut bitmap_bytes)?;
346            writer.write_all(&(bitmap_bytes.len() as u32).to_le_bytes())?;
347            writer.write_all(&bitmap_bytes)?;
348        }
349
350        Ok(())
351    }
352
353    /// Deserialize from bytes
354    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
355        let mut buf4 = [0u8; 4];
356        let mut buf8 = [0u8; 8];
357
358        // Read entries
359        reader.read_exact(&mut buf4)?;
360        let entries_len = u32::from_le_bytes(buf4) as usize;
361        let mut entries = Vec::with_capacity(entries_len);
362        for _ in 0..entries_len {
363            reader.read_exact(&mut buf8)?;
364            let value = f64::from_le_bytes(buf8);
365            reader.read_exact(&mut buf4)?;
366            let doc_id = u32::from_le_bytes(buf4);
367            entries.push((value, doc_id));
368        }
369
370        // Read common_values
371        reader.read_exact(&mut buf4)?;
372        let common_len = u32::from_le_bytes(buf4) as usize;
373        let mut common_values = HashMap::with_capacity(common_len);
374        for _ in 0..common_len {
375            reader.read_exact(&mut buf8)?;
376            let int_val = i64::from_le_bytes(buf8);
377            reader.read_exact(&mut buf4)?;
378            let bitmap_len = u32::from_le_bytes(buf4) as usize;
379            let mut bitmap_buf = vec![0u8; bitmap_len];
380            reader.read_exact(&mut bitmap_buf)?;
381            let bitmap = RoaringBitmap::deserialize_from(&bitmap_buf[..])
382                .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
383            common_values.insert(int_val, bitmap);
384        }
385
386        Ok(Self {
387            entries,
388            common_values,
389        })
390    }
391}
392
393/// Field index - wraps different index types
394#[derive(Debug, Clone)]
395pub enum FieldIndex {
396    Keyword(KeywordIndex),
397    Boolean(BooleanIndex),
398    Numeric(NumericIndex),
399}
400
401impl FieldIndex {
402    #[must_use]
403    pub fn keyword() -> Self {
404        Self::Keyword(KeywordIndex::new())
405    }
406
407    #[must_use]
408    pub fn boolean() -> Self {
409        Self::Boolean(BooleanIndex::new())
410    }
411
412    #[must_use]
413    pub fn numeric() -> Self {
414        Self::Numeric(NumericIndex::new())
415    }
416}
417
418/// Metadata index - collection of field indexes
419#[derive(Debug, Clone, Default)]
420pub struct MetadataIndex {
421    fields: HashMap<String, FieldIndex>,
422}
423
424impl MetadataIndex {
425    #[must_use]
426    pub fn new() -> Self {
427        Self::default()
428    }
429
430    /// Create or get a keyword index for a field.
431    /// Returns None if the field exists with a different type.
432    pub fn keyword_index(&mut self, field: &str) -> Option<&mut KeywordIndex> {
433        let index = self
434            .fields
435            .entry(field.to_string())
436            .or_insert_with(FieldIndex::keyword);
437        match index {
438            FieldIndex::Keyword(idx) => Some(idx),
439            _ => None, // Field exists with different type
440        }
441    }
442
443    /// Create or get a boolean index for a field.
444    /// Returns None if the field exists with a different type.
445    pub fn boolean_index(&mut self, field: &str) -> Option<&mut BooleanIndex> {
446        let index = self
447            .fields
448            .entry(field.to_string())
449            .or_insert_with(FieldIndex::boolean);
450        match index {
451            FieldIndex::Boolean(idx) => Some(idx),
452            _ => None, // Field exists with different type
453        }
454    }
455
456    /// Create or get a numeric index for a field.
457    /// Returns None if the field exists with a different type.
458    pub fn numeric_index(&mut self, field: &str) -> Option<&mut NumericIndex> {
459        let index = self
460            .fields
461            .entry(field.to_string())
462            .or_insert_with(FieldIndex::numeric);
463        match index {
464            FieldIndex::Numeric(idx) => Some(idx),
465            _ => None, // Field exists with different type
466        }
467    }
468
469    /// Get a field index
470    #[must_use]
471    pub fn get(&self, field: &str) -> Option<&FieldIndex> {
472        self.fields.get(field)
473    }
474
475    /// Index a JSON metadata object.
476    /// Silently skips fields with inconsistent types across documents.
477    pub fn index_json(&mut self, doc_id: u32, metadata: &serde_json::Value) {
478        if let serde_json::Value::Object(map) = metadata {
479            for (key, value) in map {
480                match value {
481                    serde_json::Value::String(s) => {
482                        if let Some(idx) = self.keyword_index(key) {
483                            idx.insert(doc_id, s);
484                        }
485                    }
486                    serde_json::Value::Bool(b) => {
487                        if let Some(idx) = self.boolean_index(key) {
488                            idx.insert(doc_id, *b);
489                        }
490                    }
491                    serde_json::Value::Number(n) => {
492                        if let Some(f) = n.as_f64() {
493                            if let Some(idx) = self.numeric_index(key) {
494                                idx.insert(doc_id, f);
495                            }
496                        }
497                    }
498                    _ => {} // Skip arrays and nested objects
499                }
500            }
501        }
502    }
503
504    /// Remove a document from all indexes
505    pub fn remove(&mut self, doc_id: u32) {
506        for index in self.fields.values_mut() {
507            match index {
508                FieldIndex::Keyword(idx) => idx.remove(doc_id),
509                FieldIndex::Boolean(idx) => idx.remove(doc_id),
510                FieldIndex::Numeric(idx) => idx.remove(doc_id),
511            }
512        }
513    }
514
515    /// Evaluate a filter expression (returns true if matches)
516    #[inline]
517    #[must_use]
518    pub fn matches(&self, doc_id: u32, filter: &Filter) -> bool {
519        match filter {
520            Filter::Eq(field, value) => self.matches_eq(doc_id, field, value),
521            Filter::Ne(field, value) => !self.matches_eq(doc_id, field, value),
522            Filter::Gt(field, value) => self.matches_gt(doc_id, field, *value),
523            Filter::Gte(field, value) => self.matches_gte(doc_id, field, *value),
524            Filter::Lt(field, value) => self.matches_lt(doc_id, field, *value),
525            Filter::Lte(field, value) => self.matches_lte(doc_id, field, *value),
526            Filter::In(field, values) => values.iter().any(|v| self.matches_eq(doc_id, field, v)),
527            Filter::And(filters) => filters.iter().all(|f| self.matches(doc_id, f)),
528            Filter::Or(filters) => filters.iter().any(|f| self.matches(doc_id, f)),
529            Filter::Not(inner) => !self.matches(doc_id, inner),
530        }
531    }
532
533    fn matches_eq(&self, doc_id: u32, field: &str, value: &FilterValue) -> bool {
534        match (self.get(field), value) {
535            (Some(FieldIndex::Keyword(idx)), FilterValue::String(s)) => idx.contains(doc_id, s),
536            (Some(FieldIndex::Boolean(idx)), FilterValue::Bool(b)) => idx.matches(doc_id, *b),
537            (Some(FieldIndex::Numeric(idx)), FilterValue::Number(n)) => idx.matches_eq(doc_id, *n),
538            _ => false,
539        }
540    }
541
542    fn matches_gt(&self, doc_id: u32, field: &str, value: f64) -> bool {
543        match self.get(field) {
544            Some(FieldIndex::Numeric(idx)) => idx.matches_gt(doc_id, value),
545            _ => false,
546        }
547    }
548
549    fn matches_gte(&self, doc_id: u32, field: &str, value: f64) -> bool {
550        match self.get(field) {
551            Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, value, f64::INFINITY),
552            _ => false,
553        }
554    }
555
556    fn matches_lt(&self, doc_id: u32, field: &str, value: f64) -> bool {
557        match self.get(field) {
558            Some(FieldIndex::Numeric(idx)) => idx.matches_lt(doc_id, value),
559            _ => false,
560        }
561    }
562
563    fn matches_lte(&self, doc_id: u32, field: &str, value: f64) -> bool {
564        match self.get(field) {
565            Some(FieldIndex::Numeric(idx)) => idx.matches_range(doc_id, f64::NEG_INFINITY, value),
566            _ => false,
567        }
568    }
569}
570
571/// Filter value types
572#[derive(Debug, Clone)]
573pub enum FilterValue {
574    String(String),
575    Number(f64),
576    Bool(bool),
577}
578
579/// Filter expressions
580#[derive(Debug, Clone)]
581pub enum Filter {
582    Eq(String, FilterValue),
583    Ne(String, FilterValue),
584    Gt(String, f64),
585    Gte(String, f64),
586    Lt(String, f64),
587    Lte(String, f64),
588    In(String, Vec<FilterValue>),
589    And(Vec<Filter>),
590    Or(Vec<Filter>),
591    Not(Box<Filter>),
592}
593
594#[cfg(test)]
595mod tests {
596    use super::*;
597    use serde_json::json;
598
599    #[test]
600    fn test_keyword_index() {
601        let mut idx = KeywordIndex::new();
602        idx.insert(1, "apple");
603        idx.insert(2, "apple");
604        idx.insert(3, "banana");
605
606        assert!(idx.contains(1, "apple"));
607        assert!(idx.contains(2, "apple"));
608        assert!(!idx.contains(3, "apple"));
609        assert!(idx.contains(3, "banana"));
610
611        let bitmap = idx.get("apple").unwrap();
612        assert_eq!(bitmap.len(), 2);
613    }
614
615    #[test]
616    fn test_boolean_index() {
617        let mut idx = BooleanIndex::new();
618        idx.insert(1, true);
619        idx.insert(2, false);
620        idx.insert(3, true);
621
622        assert!(idx.matches(1, true));
623        assert!(!idx.matches(1, false));
624        assert!(idx.matches(2, false));
625        assert!(idx.matches(3, true));
626    }
627
628    #[test]
629    fn test_numeric_index() {
630        let mut idx = NumericIndex::new();
631        idx.insert(1, 10.0);
632        idx.insert(2, 20.0);
633        idx.insert(3, 30.0);
634
635        // Equality
636        assert!(idx.matches_eq(1, 10.0));
637        assert!(!idx.matches_eq(1, 20.0));
638
639        // Range
640        let range = idx.get_range(15.0, 25.0);
641        assert!(range.contains(2));
642        assert!(!range.contains(1));
643        assert!(!range.contains(3));
644    }
645
646    #[test]
647    fn test_metadata_index_json() {
648        let mut idx = MetadataIndex::new();
649
650        idx.index_json(1, &json!({"category": "tech", "score": 85, "active": true}));
651        idx.index_json(
652            2,
653            &json!({"category": "tech", "score": 92, "active": false}),
654        );
655        idx.index_json(
656            3,
657            &json!({"category": "science", "score": 78, "active": true}),
658        );
659
660        // Keyword filter
661        let filter = Filter::Eq("category".into(), FilterValue::String("tech".into()));
662        assert!(idx.matches(1, &filter));
663        assert!(idx.matches(2, &filter));
664        assert!(!idx.matches(3, &filter));
665
666        // Numeric filter
667        let filter = Filter::Gte("score".into(), 80.0);
668        assert!(idx.matches(1, &filter));
669        assert!(idx.matches(2, &filter));
670        assert!(!idx.matches(3, &filter));
671
672        // Boolean filter
673        let filter = Filter::Eq("active".into(), FilterValue::Bool(true));
674        assert!(idx.matches(1, &filter));
675        assert!(!idx.matches(2, &filter));
676        assert!(idx.matches(3, &filter));
677
678        // Combined filter
679        let filter = Filter::And(vec![
680            Filter::Eq("category".into(), FilterValue::String("tech".into())),
681            Filter::Eq("active".into(), FilterValue::Bool(true)),
682        ]);
683        assert!(idx.matches(1, &filter));
684        assert!(!idx.matches(2, &filter));
685        assert!(!idx.matches(3, &filter));
686    }
687
688    #[test]
689    fn test_keyword_serialize_roundtrip() {
690        let mut idx = KeywordIndex::new();
691        idx.insert(1, "apple");
692        idx.insert(2, "apple");
693        idx.insert(3, "banana");
694
695        let mut buf = Vec::new();
696        idx.serialize(&mut buf).unwrap();
697
698        let mut cursor = std::io::Cursor::new(&buf);
699        let idx2 = KeywordIndex::deserialize(&mut cursor).unwrap();
700
701        assert!(idx2.contains(1, "apple"));
702        assert!(idx2.contains(2, "apple"));
703        assert!(idx2.contains(3, "banana"));
704    }
705}