Skip to main content

iqdb_filter/
index.rs

1//! [`MetadataIndex`] — an opt-in, per-field inverted index that turns a
2//! selective predicate into a candidate key set without scanning every row.
3//!
4//! # What it is
5//!
6//! For each **explicitly indexed** field, the index keeps an inverted map from
7//! a metadata value to the keys of the records that carry it. A consumer hands
8//! a validated filter to [`candidates`](MetadataIndex::candidates) and gets back
9//! the keys that *might* match — a **superset** of the true matches — then
10//! re-runs [`FilterEvaluator::evaluate`](crate::FilterEvaluator::evaluate) on
11//! that smaller set for an exact answer. The win is skipping the per-row test
12//! (and, for an approximate index, the distance computation) on the rows the
13//! predicate already excludes.
14//!
15//! # What it resolves
16//!
17//! The index resolves the predicates an inverted index is good at — equality
18//! and membership — over the value types with well-behaved equality:
19//!
20//! - [`Filter::Eq`] / [`Filter::In`] on an **indexed** field whose literal is a
21//!   `String`, `Int`, `Bool`, or `Null`.
22//! - [`Filter::And`] — the intersection of whatever children resolve (any
23//!   unresolved child is left to `evaluate`).
24//! - [`Filter::Or`] — the union, but **only if every child resolves** (an
25//!   unresolved branch could match anything).
26//!
27//! Everything else returns [`None`] from [`candidates`](MetadataIndex::candidates),
28//! meaning "I can't bound this — scan everything": ranges
29//! (`Lt`/`Lte`/`Gt`/`Gte`), [`Filter::Not`], `Float` literals (IEEE equality
30//! makes a float a poor index key, and missing one would drop a real match),
31//! and any field that was not indexed.
32//!
33//! # Correctness contract
34//!
35//! Whenever [`candidates`](MetadataIndex::candidates) returns `Some(set)`, every
36//! record the evaluator would accept is in `set`. False positives are allowed
37//! (the consumer filters them with `evaluate`); false negatives never happen.
38//!
39//! # Why opt-in per field
40//!
41//! Indexing a field costs memory and per-insert work. Most metadata fields are
42//! never filtered on, so the index only tracks the fields a caller names.
43//!
44//! # Example
45//!
46//! ```
47//! use iqdb_filter::{FilterEvaluator, MetadataIndex};
48//! use iqdb_types::{Filter, Metadata, Value};
49//!
50//! # fn main() -> iqdb_types::Result<()> {
51//! fn meta(pairs: &[(&str, Value)]) -> Metadata {
52//!     pairs.iter().map(|(k, v)| ((*k).to_string(), v.clone())).collect()
53//! }
54//!
55//! let rows = [
56//!     (0_usize, meta(&[("lang", Value::String("rust".into()))])),
57//!     (1, meta(&[("lang", Value::String("go".into()))])),
58//!     (2, meta(&[("lang", Value::String("rust".into()))])),
59//! ];
60//!
61//! // Index only the `lang` field.
62//! let index = MetadataIndex::build(&["lang"], rows.iter().map(|(k, m)| (*k, Some(m))));
63//!
64//! let evaluator = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into())))?;
65//! let mut candidates = index.candidates(&evaluator).expect("indexed field resolves");
66//! candidates.sort_unstable();
67//! assert_eq!(candidates, [0, 2]);
68//! # Ok(())
69//! # }
70//! ```
71
72use std::collections::{HashMap, HashSet};
73use std::hash::Hash;
74
75use iqdb_types::{Filter, Metadata, Value};
76
77use crate::FilterEvaluator;
78use crate::selectivity;
79
80/// The subset of [`Value`] variants the index can use as a posting-list key:
81/// everything except `Float`, whose IEEE-754 equality (`NaN != NaN`,
82/// `+0.0 == -0.0`) does not survive a hash-map round-trip without risking a
83/// dropped match.
84#[derive(Debug, Clone, PartialEq, Eq, Hash)]
85enum IndexKey {
86    Str(String),
87    Int(i64),
88    Bool(bool),
89    Null,
90}
91
92impl IndexKey {
93    /// Returns the index key for `value`, or `None` for a `Float` (which the
94    /// index deliberately does not key on).
95    fn from_value(value: &Value) -> Option<Self> {
96        match value {
97            Value::String(s) => Some(IndexKey::Str(s.clone())),
98            Value::Int(i) => Some(IndexKey::Int(*i)),
99            Value::Bool(b) => Some(IndexKey::Bool(*b)),
100            Value::Null => Some(IndexKey::Null),
101            Value::Float(_) => None,
102        }
103    }
104}
105
106/// An opt-in, per-field inverted index over record metadata.
107///
108/// `K` is the caller's row key — a storage index, an [`iqdb_types::VectorId`],
109/// or any `Clone + Eq + Hash` handle. Build one with
110/// [`build`](MetadataIndex::build); it is immutable thereafter (rebuild to
111/// reflect new data). See [`candidates`](MetadataIndex::candidates) for the
112/// resolution rules and the superset contract.
113#[derive(Debug, Clone)]
114pub struct MetadataIndex<K> {
115    /// indexed field -> (value key -> the rows carrying it)
116    fields: HashMap<String, HashMap<IndexKey, Vec<K>>>,
117    /// Total records seen at build time (the denominator for selectivity).
118    total: usize,
119}
120
121impl<K> MetadataIndex<K>
122where
123    K: Clone + Eq + Hash,
124{
125    /// Builds an index over `fields` from `records`.
126    ///
127    /// `fields` is the explicit opt-in list — only these fields are indexed.
128    /// `records` yields `(key, metadata)` pairs; a record with `None` metadata
129    /// (or one missing an indexed field) still counts toward the total but
130    /// contributes no postings. A field named in `fields` that no record
131    /// carries simply has an empty posting set.
132    ///
133    /// # Examples
134    ///
135    /// ```
136    /// use iqdb_filter::MetadataIndex;
137    /// use iqdb_types::{Metadata, Value};
138    ///
139    /// let rows = [(0_u64, [("k".to_string(), Value::Int(1))].into_iter().collect::<Metadata>())];
140    /// let index = MetadataIndex::build(&["k"], rows.iter().map(|(k, m)| (*k, Some(m))));
141    /// assert_eq!(index.len(), 1);
142    /// assert!(index.is_indexed("k"));
143    /// ```
144    pub fn build<'a, I>(fields: &[&str], records: I) -> Self
145    where
146        I: IntoIterator<Item = (K, Option<&'a Metadata>)>,
147    {
148        let mut field_maps: HashMap<String, HashMap<IndexKey, Vec<K>>> = fields
149            .iter()
150            .map(|f| ((*f).to_string(), HashMap::new()))
151            .collect();
152
153        let mut total = 0usize;
154        for (key, metadata) in records {
155            total += 1;
156            let Some(metadata) = metadata else { continue };
157            for (field, postings) in field_maps.iter_mut() {
158                let Some(value) = metadata.get(field) else {
159                    continue;
160                };
161                let Some(index_key) = IndexKey::from_value(value) else {
162                    continue;
163                };
164                postings.entry(index_key).or_default().push(key.clone());
165            }
166        }
167
168        Self {
169            fields: field_maps,
170            total,
171        }
172    }
173
174    /// The number of records the index was built from.
175    #[must_use]
176    pub fn len(&self) -> usize {
177        self.total
178    }
179
180    /// Whether the index was built from zero records.
181    #[must_use]
182    pub fn is_empty(&self) -> bool {
183        self.total == 0
184    }
185
186    /// Whether `field` is one of the indexed fields.
187    #[must_use]
188    pub fn is_indexed(&self, field: &str) -> bool {
189        self.fields.contains_key(field)
190    }
191
192    /// The set of indexed field names, in unspecified order.
193    pub fn indexed_fields(&self) -> impl Iterator<Item = &str> {
194        self.fields.keys().map(String::as_str)
195    }
196
197    /// Resolves `evaluator`'s filter to a candidate key set, or `None` if the
198    /// index cannot bound it (scan everything in that case).
199    ///
200    /// When this returns `Some(keys)`, `keys` is a **superset** of the records
201    /// the evaluator accepts — re-run
202    /// [`evaluate`](crate::FilterEvaluator::evaluate) over them for the exact
203    /// result. The keys are unique and in unspecified order.
204    ///
205    /// Takes a validated [`FilterEvaluator`], so the recursive walk is bounded
206    /// by [`MAX_FILTER_DEPTH`](crate::MAX_FILTER_DEPTH).
207    ///
208    /// # Examples
209    ///
210    /// ```
211    /// use iqdb_filter::{FilterEvaluator, MetadataIndex};
212    /// use iqdb_types::{Filter, Metadata, Value};
213    ///
214    /// # fn main() -> iqdb_types::Result<()> {
215    /// let rows = [
216    ///     (0_usize, [("tier".to_string(), Value::Int(1))].into_iter().collect::<Metadata>()),
217    ///     (1, [("tier".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
218    /// ];
219    /// let index = MetadataIndex::build(&["tier"], rows.iter().map(|(k, m)| (*k, Some(m))));
220    ///
221    /// // Indexed equality resolves.
222    /// let eq = FilterEvaluator::new(Filter::eq("tier", Value::Int(1)))?;
223    /// assert_eq!(index.candidates(&eq), Some(vec![0]));
224    ///
225    /// // A range over an indexed field is left to a full scan.
226    /// let range = FilterEvaluator::new(Filter::gt("tier", Value::Int(1)))?;
227    /// assert_eq!(index.candidates(&range), None);
228    /// # Ok(())
229    /// # }
230    /// ```
231    #[must_use]
232    pub fn candidates(&self, evaluator: &FilterEvaluator) -> Option<Vec<K>> {
233        self.resolve(evaluator.filter())
234            .map(|set| set.into_iter().collect())
235    }
236
237    /// Returns the candidate set for `filter`, or `None` when it cannot be
238    /// bounded by the index.
239    fn resolve(&self, filter: &Filter) -> Option<HashSet<K>> {
240        match filter {
241            Filter::Eq { field, value } => self.postings_for(field, value),
242            Filter::In { field, values } => {
243                let mut acc: HashSet<K> = HashSet::new();
244                for value in values {
245                    // Any non-indexable member makes the union unbounded.
246                    acc.extend(self.postings_for(field, value)?);
247                }
248                Some(acc)
249            }
250            Filter::And(children) => {
251                // Intersect the children that resolve; an unresolved child is
252                // left to `evaluate`. The intersection of resolvable
253                // constraints is still a superset of the true matches.
254                let mut acc: Option<HashSet<K>> = None;
255                for child in children {
256                    if let Some(set) = self.resolve(child) {
257                        acc = Some(match acc {
258                            None => set,
259                            Some(current) => intersect(current, &set),
260                        });
261                    }
262                }
263                acc
264            }
265            Filter::Or(children) => {
266                // A union is only bounded if every branch is.
267                let mut acc: HashSet<K> = HashSet::new();
268                for child in children {
269                    acc.extend(self.resolve(child)?);
270                }
271                Some(acc)
272            }
273            // Negation, inequality, and ranges are anti-selective or
274            // non-equality: the index cannot bound them, so the caller scans.
275            Filter::Neq { .. }
276            | Filter::Not(_)
277            | Filter::Lt { .. }
278            | Filter::Lte { .. }
279            | Filter::Gt { .. }
280            | Filter::Gte { .. } => None,
281        }
282    }
283
284    /// The posting set for `field == value`, or `None` if the field is not
285    /// indexed or the value is not an index key (a `Float`). A `Some(empty)`
286    /// means the field is indexed but nothing carries that value.
287    fn postings_for(&self, field: &str, value: &Value) -> Option<HashSet<K>> {
288        let postings = self.fields.get(field)?;
289        let index_key = IndexKey::from_value(value)?;
290        Some(
291            postings
292                .get(&index_key)
293                .map(|keys| keys.iter().cloned().collect())
294                .unwrap_or_default(),
295        )
296    }
297
298    /// Estimates the fraction of records `evaluator`'s filter passes, in
299    /// `[0.0, 1.0]`, using real posting counts where the index can and the
300    /// structural [`estimate_selectivity`](crate::estimate_selectivity)
301    /// fallback elsewhere.
302    ///
303    /// This is the data-backed counterpart to the structural estimate: an
304    /// indexed `Eq` / `In` leaf contributes its actual `matches / total`
305    /// fraction, so a selector backed by the index makes sharper pre/post
306    /// decisions. With zero records it falls back entirely to the structural
307    /// estimate.
308    ///
309    /// # Examples
310    ///
311    /// ```
312    /// use iqdb_filter::{FilterEvaluator, MetadataIndex};
313    /// use iqdb_types::{Filter, Metadata, Value};
314    ///
315    /// # fn main() -> iqdb_types::Result<()> {
316    /// // 1 of 4 rows has status == 1: the index knows the true 0.25.
317    /// let rows = [
318    ///     (0_usize, [("status".to_string(), Value::Int(1))].into_iter().collect::<Metadata>()),
319    ///     (1, [("status".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
320    ///     (2, [("status".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
321    ///     (3, [("status".to_string(), Value::Int(3))].into_iter().collect::<Metadata>()),
322    /// ];
323    /// let index = MetadataIndex::build(&["status"], rows.iter().map(|(k, m)| (*k, Some(m))));
324    ///
325    /// let eq = FilterEvaluator::new(Filter::eq("status", Value::Int(1)))?;
326    /// assert!((index.estimate_selectivity(&eq) - 0.25).abs() < 1e-9);
327    /// # Ok(())
328    /// # }
329    /// ```
330    #[must_use]
331    pub fn estimate_selectivity(&self, evaluator: &FilterEvaluator) -> f64 {
332        if self.total == 0 {
333            return selectivity::structural(evaluator.filter()).clamp(0.0, 1.0);
334        }
335        self.estimate(evaluator.filter()).clamp(0.0, 1.0)
336    }
337
338    fn estimate(&self, filter: &Filter) -> f64 {
339        match filter {
340            Filter::Eq { field, value } => self
341                .leaf_fraction(field, value)
342                .unwrap_or(selectivity::EQ_SELECTIVITY),
343            Filter::Neq { field, value } => {
344                1.0 - self
345                    .leaf_fraction(field, value)
346                    .unwrap_or(selectivity::EQ_SELECTIVITY)
347            }
348            Filter::In { field, values } => {
349                // Sum the per-value fractions when every member is indexable;
350                // otherwise fall back to the structural estimate for the node.
351                let mut acc = 0.0;
352                for value in values {
353                    match self.leaf_fraction(field, value) {
354                        Some(f) => acc += f,
355                        None => return selectivity::structural(filter),
356                    }
357                }
358                acc.min(1.0)
359            }
360            Filter::Lt { .. } | Filter::Lte { .. } | Filter::Gt { .. } | Filter::Gte { .. } => {
361                selectivity::RANGE_SELECTIVITY
362            }
363            Filter::And(children) => children.iter().map(|c| self.estimate(c)).product(),
364            Filter::Or(children) => {
365                1.0 - children
366                    .iter()
367                    .map(|c| 1.0 - self.estimate(c))
368                    .product::<f64>()
369            }
370            Filter::Not(inner) => 1.0 - self.estimate(inner),
371        }
372    }
373
374    /// The real `matches / total` fraction for `field == value`, or `None` when
375    /// the field is not indexed or the value is not an index key.
376    fn leaf_fraction(&self, field: &str, value: &Value) -> Option<f64> {
377        let postings = self.fields.get(field)?;
378        let index_key = IndexKey::from_value(value)?;
379        let count = postings.get(&index_key).map_or(0, Vec::len);
380        Some(count as f64 / self.total as f64)
381    }
382}
383
384/// Intersection that retains the elements of `a` also present in `b`.
385fn intersect<K: Eq + Hash>(a: HashSet<K>, b: &HashSet<K>) -> HashSet<K> {
386    a.into_iter().filter(|k| b.contains(k)).collect()
387}
388
389#[cfg(test)]
390mod tests {
391    #![allow(clippy::unwrap_used)]
392
393    use super::*;
394    use iqdb_types::Metadata;
395
396    fn meta(pairs: &[(&str, Value)]) -> Metadata {
397        pairs
398            .iter()
399            .map(|(k, v)| ((*k).to_string(), v.clone()))
400            .collect()
401    }
402
403    fn corpus() -> Vec<(usize, Metadata)> {
404        vec![
405            (
406                0,
407                meta(&[
408                    ("lang", Value::String("rust".into())),
409                    ("year", Value::Int(2026)),
410                ]),
411            ),
412            (
413                1,
414                meta(&[
415                    ("lang", Value::String("go".into())),
416                    ("year", Value::Int(2024)),
417                ]),
418            ),
419            (
420                2,
421                meta(&[
422                    ("lang", Value::String("rust".into())),
423                    ("year", Value::Int(2020)),
424                ]),
425            ),
426            (3, meta(&[("lang", Value::String("rust".into()))])),
427        ]
428    }
429
430    fn index() -> MetadataIndex<usize> {
431        let rows = corpus();
432        MetadataIndex::build(&["lang", "year"], rows.iter().map(|(k, m)| (*k, Some(m))))
433    }
434
435    fn sorted(mut v: Vec<usize>) -> Vec<usize> {
436        v.sort_unstable();
437        v
438    }
439
440    fn cands(filter: Filter) -> Option<Vec<usize>> {
441        let evaluator = FilterEvaluator::new(filter).unwrap();
442        index().candidates(&evaluator).map(sorted)
443    }
444
445    #[test]
446    fn build_counts_all_records_and_fields() {
447        let idx = index();
448        assert_eq!(idx.len(), 4);
449        assert!(!idx.is_empty());
450        assert!(idx.is_indexed("lang"));
451        assert!(idx.is_indexed("year"));
452        assert!(!idx.is_indexed("missing"));
453    }
454
455    #[test]
456    fn eq_on_indexed_field_resolves() {
457        assert_eq!(
458            cands(Filter::eq("lang", Value::String("rust".into()))),
459            Some(vec![0, 2, 3])
460        );
461    }
462
463    #[test]
464    fn eq_with_no_postings_is_empty_not_none() {
465        assert_eq!(
466            cands(Filter::eq("lang", Value::String("zig".into()))),
467            Some(vec![])
468        );
469    }
470
471    #[test]
472    fn eq_on_unindexed_field_is_none() {
473        assert_eq!(
474            cands(Filter::eq("author", Value::String("ada".into()))),
475            None
476        );
477    }
478
479    #[test]
480    fn float_literal_is_none() {
481        assert_eq!(cands(Filter::eq("year", Value::Float(2026.0))), None);
482    }
483
484    #[test]
485    fn in_resolves_to_union() {
486        assert_eq!(
487            cands(Filter::is_in(
488                "lang",
489                vec![Value::String("go".into()), Value::String("rust".into())]
490            )),
491            Some(vec![0, 1, 2, 3])
492        );
493    }
494
495    #[test]
496    fn and_intersects_resolvable_children() {
497        // lang == rust AND year == 2026 -> only row 0.
498        assert_eq!(
499            cands(Filter::and(vec![
500                Filter::eq("lang", Value::String("rust".into())),
501                Filter::eq("year", Value::Int(2026)),
502            ])),
503            Some(vec![0])
504        );
505    }
506
507    #[test]
508    fn and_narrows_using_only_the_resolvable_child() {
509        // lang == rust AND (range, unresolved) -> narrowed to rust rows; the
510        // range is left for `evaluate`.
511        assert_eq!(
512            cands(Filter::and(vec![
513                Filter::eq("lang", Value::String("rust".into())),
514                Filter::gt("year", Value::Int(2021)),
515            ])),
516            Some(vec![0, 2, 3])
517        );
518    }
519
520    #[test]
521    fn or_with_unresolvable_child_is_none() {
522        assert_eq!(
523            cands(Filter::or(vec![
524                Filter::eq("lang", Value::String("rust".into())),
525                Filter::gt("year", Value::Int(2021)),
526            ])),
527            None
528        );
529    }
530
531    #[test]
532    fn not_and_ranges_are_none() {
533        assert_eq!(
534            cands(Filter::not(Filter::eq(
535                "lang",
536                Value::String("rust".into())
537            ))),
538            None
539        );
540        assert_eq!(cands(Filter::gt("year", Value::Int(2000))), None);
541    }
542
543    #[test]
544    fn candidates_are_a_superset_of_true_matches() {
545        // For the narrowing-And case, every true match must be present.
546        let rows = corpus();
547        let filter = Filter::and(vec![
548            Filter::eq("lang", Value::String("rust".into())),
549            Filter::gt("year", Value::Int(2021)),
550        ]);
551        let evaluator = FilterEvaluator::new(filter).unwrap();
552        let candidates: std::collections::HashSet<usize> = index()
553            .candidates(&evaluator)
554            .unwrap()
555            .into_iter()
556            .collect();
557
558        for (k, m) in &rows {
559            if evaluator.evaluate(Some(m)) {
560                assert!(
561                    candidates.contains(k),
562                    "true match {k} missing from candidates"
563                );
564            }
565        }
566    }
567
568    #[test]
569    fn index_backed_selectivity_uses_real_counts() {
570        let idx = index();
571        // 3 of 4 rows are rust.
572        let rust = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
573        assert!((idx.estimate_selectivity(&rust) - 0.75).abs() < 1e-9);
574        // 1 of 4 rows has year == 2026.
575        let y = FilterEvaluator::new(Filter::eq("year", Value::Int(2026))).unwrap();
576        assert!((idx.estimate_selectivity(&y) - 0.25).abs() < 1e-9);
577    }
578
579    #[test]
580    fn empty_index_falls_back_to_structural() {
581        let empty: MetadataIndex<usize> = MetadataIndex::build(&["lang"], std::iter::empty());
582        assert!(empty.is_empty());
583        let eq = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
584        // Structural EQ estimate, not a divide-by-zero.
585        let s = empty.estimate_selectivity(&eq);
586        assert!((0.0..=1.0).contains(&s));
587    }
588
589    #[test]
590    fn records_without_metadata_count_but_do_not_post() {
591        let rows: Vec<(usize, Option<&Metadata>)> = vec![(0, None), (1, None)];
592        let idx: MetadataIndex<usize> = MetadataIndex::build(&["lang"], rows);
593        assert_eq!(idx.len(), 2);
594        let eq = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
595        assert_eq!(idx.candidates(&eq), Some(vec![]));
596    }
597}