iqdb_filter/index.rs
1//! [`MetadataIndex`] — an opt-in, per-field inverted index that turns a
2//! selective predicate into a candidate key set without scanning every row.
3//!
4//! # What it is
5//!
6//! For each **explicitly indexed** field, the index keeps an inverted map from
7//! a metadata value to the keys of the records that carry it. A consumer hands
8//! a validated filter to [`candidates`](MetadataIndex::candidates) and gets back
9//! the keys that *might* match — a **superset** of the true matches — then
10//! re-runs [`FilterEvaluator::evaluate`](crate::FilterEvaluator::evaluate) on
11//! that smaller set for an exact answer. The win is skipping the per-row test
12//! (and, for an approximate index, the distance computation) on the rows the
13//! predicate already excludes.
14//!
15//! # What it resolves
16//!
17//! The index resolves the predicates an inverted index is good at — equality
18//! and membership — over the value types with well-behaved equality:
19//!
20//! - [`Filter::Eq`] / [`Filter::In`] on an **indexed** field whose literal is a
21//! `String`, `Int`, `Bool`, or `Null`.
22//! - [`Filter::And`] — the intersection of whatever children resolve (any
23//! unresolved child is left to `evaluate`).
24//! - [`Filter::Or`] — the union, but **only if every child resolves** (an
25//! unresolved branch could match anything).
26//!
27//! Everything else returns [`None`] from [`candidates`](MetadataIndex::candidates),
28//! meaning "I can't bound this — scan everything": ranges
29//! (`Lt`/`Lte`/`Gt`/`Gte`), [`Filter::Not`], `Float` literals (IEEE equality
30//! makes a float a poor index key, and missing one would drop a real match),
31//! and any field that was not indexed.
32//!
33//! # Correctness contract
34//!
35//! Whenever [`candidates`](MetadataIndex::candidates) returns `Some(set)`, every
36//! record the evaluator would accept is in `set`. False positives are allowed
37//! (the consumer filters them with `evaluate`); false negatives never happen.
38//!
39//! # Why opt-in per field
40//!
41//! Indexing a field costs memory and per-insert work. Most metadata fields are
42//! never filtered on, so the index only tracks the fields a caller names.
43//!
44//! # Example
45//!
46//! ```
47//! use iqdb_filter::{FilterEvaluator, MetadataIndex};
48//! use iqdb_types::{Filter, Metadata, Value};
49//!
50//! # fn main() -> iqdb_types::Result<()> {
51//! fn meta(pairs: &[(&str, Value)]) -> Metadata {
52//! pairs.iter().map(|(k, v)| ((*k).to_string(), v.clone())).collect()
53//! }
54//!
55//! let rows = [
56//! (0_usize, meta(&[("lang", Value::String("rust".into()))])),
57//! (1, meta(&[("lang", Value::String("go".into()))])),
58//! (2, meta(&[("lang", Value::String("rust".into()))])),
59//! ];
60//!
61//! // Index only the `lang` field.
62//! let index = MetadataIndex::build(&["lang"], rows.iter().map(|(k, m)| (*k, Some(m))));
63//!
64//! let evaluator = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into())))?;
65//! let mut candidates = index.candidates(&evaluator).expect("indexed field resolves");
66//! candidates.sort_unstable();
67//! assert_eq!(candidates, [0, 2]);
68//! # Ok(())
69//! # }
70//! ```
71
72use std::collections::{HashMap, HashSet};
73use std::hash::Hash;
74
75use iqdb_types::{Filter, Metadata, Value};
76
77use crate::FilterEvaluator;
78use crate::selectivity;
79
80/// The subset of [`Value`] variants the index can use as a posting-list key:
81/// everything except `Float`, whose IEEE-754 equality (`NaN != NaN`,
82/// `+0.0 == -0.0`) does not survive a hash-map round-trip without risking a
83/// dropped match.
84#[derive(Debug, Clone, PartialEq, Eq, Hash)]
85enum IndexKey {
86 Str(String),
87 Int(i64),
88 Bool(bool),
89 Null,
90}
91
92impl IndexKey {
93 /// Returns the index key for `value`, or `None` for a `Float` (which the
94 /// index deliberately does not key on).
95 fn from_value(value: &Value) -> Option<Self> {
96 match value {
97 Value::String(s) => Some(IndexKey::Str(s.clone())),
98 Value::Int(i) => Some(IndexKey::Int(*i)),
99 Value::Bool(b) => Some(IndexKey::Bool(*b)),
100 Value::Null => Some(IndexKey::Null),
101 Value::Float(_) => None,
102 }
103 }
104}
105
106/// An opt-in, per-field inverted index over record metadata.
107///
108/// `K` is the caller's row key — a storage index, an [`iqdb_types::VectorId`],
109/// or any `Clone + Eq + Hash` handle. Build one with
110/// [`build`](MetadataIndex::build); it is immutable thereafter (rebuild to
111/// reflect new data). See [`candidates`](MetadataIndex::candidates) for the
112/// resolution rules and the superset contract.
113#[derive(Debug, Clone)]
114pub struct MetadataIndex<K> {
115 /// indexed field -> (value key -> the rows carrying it)
116 fields: HashMap<String, HashMap<IndexKey, Vec<K>>>,
117 /// Total records seen at build time (the denominator for selectivity).
118 total: usize,
119}
120
121impl<K> MetadataIndex<K>
122where
123 K: Clone + Eq + Hash,
124{
125 /// Builds an index over `fields` from `records`.
126 ///
127 /// `fields` is the explicit opt-in list — only these fields are indexed.
128 /// `records` yields `(key, metadata)` pairs; a record with `None` metadata
129 /// (or one missing an indexed field) still counts toward the total but
130 /// contributes no postings. A field named in `fields` that no record
131 /// carries simply has an empty posting set.
132 ///
133 /// # Examples
134 ///
135 /// ```
136 /// use iqdb_filter::MetadataIndex;
137 /// use iqdb_types::{Metadata, Value};
138 ///
139 /// let rows = [(0_u64, [("k".to_string(), Value::Int(1))].into_iter().collect::<Metadata>())];
140 /// let index = MetadataIndex::build(&["k"], rows.iter().map(|(k, m)| (*k, Some(m))));
141 /// assert_eq!(index.len(), 1);
142 /// assert!(index.is_indexed("k"));
143 /// ```
144 pub fn build<'a, I>(fields: &[&str], records: I) -> Self
145 where
146 I: IntoIterator<Item = (K, Option<&'a Metadata>)>,
147 {
148 let mut field_maps: HashMap<String, HashMap<IndexKey, Vec<K>>> = fields
149 .iter()
150 .map(|f| ((*f).to_string(), HashMap::new()))
151 .collect();
152
153 let mut total = 0usize;
154 for (key, metadata) in records {
155 total += 1;
156 let Some(metadata) = metadata else { continue };
157 for (field, postings) in field_maps.iter_mut() {
158 let Some(value) = metadata.get(field) else {
159 continue;
160 };
161 let Some(index_key) = IndexKey::from_value(value) else {
162 continue;
163 };
164 postings.entry(index_key).or_default().push(key.clone());
165 }
166 }
167
168 Self {
169 fields: field_maps,
170 total,
171 }
172 }
173
174 /// The number of records the index was built from.
175 #[must_use]
176 pub fn len(&self) -> usize {
177 self.total
178 }
179
180 /// Whether the index was built from zero records.
181 #[must_use]
182 pub fn is_empty(&self) -> bool {
183 self.total == 0
184 }
185
186 /// Whether `field` is one of the indexed fields.
187 #[must_use]
188 pub fn is_indexed(&self, field: &str) -> bool {
189 self.fields.contains_key(field)
190 }
191
192 /// The set of indexed field names, in unspecified order.
193 pub fn indexed_fields(&self) -> impl Iterator<Item = &str> {
194 self.fields.keys().map(String::as_str)
195 }
196
197 /// Resolves `evaluator`'s filter to a candidate key set, or `None` if the
198 /// index cannot bound it (scan everything in that case).
199 ///
200 /// When this returns `Some(keys)`, `keys` is a **superset** of the records
201 /// the evaluator accepts — re-run
202 /// [`evaluate`](crate::FilterEvaluator::evaluate) over them for the exact
203 /// result. The keys are unique and in unspecified order.
204 ///
205 /// Takes a validated [`FilterEvaluator`], so the recursive walk is bounded
206 /// by [`MAX_FILTER_DEPTH`](crate::MAX_FILTER_DEPTH).
207 ///
208 /// # Examples
209 ///
210 /// ```
211 /// use iqdb_filter::{FilterEvaluator, MetadataIndex};
212 /// use iqdb_types::{Filter, Metadata, Value};
213 ///
214 /// # fn main() -> iqdb_types::Result<()> {
215 /// let rows = [
216 /// (0_usize, [("tier".to_string(), Value::Int(1))].into_iter().collect::<Metadata>()),
217 /// (1, [("tier".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
218 /// ];
219 /// let index = MetadataIndex::build(&["tier"], rows.iter().map(|(k, m)| (*k, Some(m))));
220 ///
221 /// // Indexed equality resolves.
222 /// let eq = FilterEvaluator::new(Filter::eq("tier", Value::Int(1)))?;
223 /// assert_eq!(index.candidates(&eq), Some(vec![0]));
224 ///
225 /// // A range over an indexed field is left to a full scan.
226 /// let range = FilterEvaluator::new(Filter::gt("tier", Value::Int(1)))?;
227 /// assert_eq!(index.candidates(&range), None);
228 /// # Ok(())
229 /// # }
230 /// ```
231 #[must_use]
232 pub fn candidates(&self, evaluator: &FilterEvaluator) -> Option<Vec<K>> {
233 self.resolve(evaluator.filter())
234 .map(|set| set.into_iter().collect())
235 }
236
237 /// Returns the candidate set for `filter`, or `None` when it cannot be
238 /// bounded by the index.
239 fn resolve(&self, filter: &Filter) -> Option<HashSet<K>> {
240 match filter {
241 Filter::Eq { field, value } => self.postings_for(field, value),
242 Filter::In { field, values } => {
243 let mut acc: HashSet<K> = HashSet::new();
244 for value in values {
245 // Any non-indexable member makes the union unbounded.
246 acc.extend(self.postings_for(field, value)?);
247 }
248 Some(acc)
249 }
250 Filter::And(children) => {
251 // Intersect the children that resolve; an unresolved child is
252 // left to `evaluate`. The intersection of resolvable
253 // constraints is still a superset of the true matches.
254 let mut acc: Option<HashSet<K>> = None;
255 for child in children {
256 if let Some(set) = self.resolve(child) {
257 acc = Some(match acc {
258 None => set,
259 Some(current) => intersect(current, &set),
260 });
261 }
262 }
263 acc
264 }
265 Filter::Or(children) => {
266 // A union is only bounded if every branch is.
267 let mut acc: HashSet<K> = HashSet::new();
268 for child in children {
269 acc.extend(self.resolve(child)?);
270 }
271 Some(acc)
272 }
273 // Negation, inequality, and ranges are anti-selective or
274 // non-equality: the index cannot bound them, so the caller scans.
275 Filter::Neq { .. }
276 | Filter::Not(_)
277 | Filter::Lt { .. }
278 | Filter::Lte { .. }
279 | Filter::Gt { .. }
280 | Filter::Gte { .. } => None,
281 }
282 }
283
284 /// The posting set for `field == value`, or `None` if the field is not
285 /// indexed or the value is not an index key (a `Float`). A `Some(empty)`
286 /// means the field is indexed but nothing carries that value.
287 fn postings_for(&self, field: &str, value: &Value) -> Option<HashSet<K>> {
288 let postings = self.fields.get(field)?;
289 let index_key = IndexKey::from_value(value)?;
290 Some(
291 postings
292 .get(&index_key)
293 .map(|keys| keys.iter().cloned().collect())
294 .unwrap_or_default(),
295 )
296 }
297
298 /// Estimates the fraction of records `evaluator`'s filter passes, in
299 /// `[0.0, 1.0]`, using real posting counts where the index can and the
300 /// structural [`estimate_selectivity`](crate::estimate_selectivity)
301 /// fallback elsewhere.
302 ///
303 /// This is the data-backed counterpart to the structural estimate: an
304 /// indexed `Eq` / `In` leaf contributes its actual `matches / total`
305 /// fraction, so a selector backed by the index makes sharper pre/post
306 /// decisions. With zero records it falls back entirely to the structural
307 /// estimate.
308 ///
309 /// # Examples
310 ///
311 /// ```
312 /// use iqdb_filter::{FilterEvaluator, MetadataIndex};
313 /// use iqdb_types::{Filter, Metadata, Value};
314 ///
315 /// # fn main() -> iqdb_types::Result<()> {
316 /// // 1 of 4 rows has status == 1: the index knows the true 0.25.
317 /// let rows = [
318 /// (0_usize, [("status".to_string(), Value::Int(1))].into_iter().collect::<Metadata>()),
319 /// (1, [("status".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
320 /// (2, [("status".to_string(), Value::Int(2))].into_iter().collect::<Metadata>()),
321 /// (3, [("status".to_string(), Value::Int(3))].into_iter().collect::<Metadata>()),
322 /// ];
323 /// let index = MetadataIndex::build(&["status"], rows.iter().map(|(k, m)| (*k, Some(m))));
324 ///
325 /// let eq = FilterEvaluator::new(Filter::eq("status", Value::Int(1)))?;
326 /// assert!((index.estimate_selectivity(&eq) - 0.25).abs() < 1e-9);
327 /// # Ok(())
328 /// # }
329 /// ```
330 #[must_use]
331 pub fn estimate_selectivity(&self, evaluator: &FilterEvaluator) -> f64 {
332 if self.total == 0 {
333 return selectivity::structural(evaluator.filter()).clamp(0.0, 1.0);
334 }
335 self.estimate(evaluator.filter()).clamp(0.0, 1.0)
336 }
337
338 fn estimate(&self, filter: &Filter) -> f64 {
339 match filter {
340 Filter::Eq { field, value } => self
341 .leaf_fraction(field, value)
342 .unwrap_or(selectivity::EQ_SELECTIVITY),
343 Filter::Neq { field, value } => {
344 1.0 - self
345 .leaf_fraction(field, value)
346 .unwrap_or(selectivity::EQ_SELECTIVITY)
347 }
348 Filter::In { field, values } => {
349 // Sum the per-value fractions when every member is indexable;
350 // otherwise fall back to the structural estimate for the node.
351 let mut acc = 0.0;
352 for value in values {
353 match self.leaf_fraction(field, value) {
354 Some(f) => acc += f,
355 None => return selectivity::structural(filter),
356 }
357 }
358 acc.min(1.0)
359 }
360 Filter::Lt { .. } | Filter::Lte { .. } | Filter::Gt { .. } | Filter::Gte { .. } => {
361 selectivity::RANGE_SELECTIVITY
362 }
363 Filter::And(children) => children.iter().map(|c| self.estimate(c)).product(),
364 Filter::Or(children) => {
365 1.0 - children
366 .iter()
367 .map(|c| 1.0 - self.estimate(c))
368 .product::<f64>()
369 }
370 Filter::Not(inner) => 1.0 - self.estimate(inner),
371 }
372 }
373
374 /// The real `matches / total` fraction for `field == value`, or `None` when
375 /// the field is not indexed or the value is not an index key.
376 fn leaf_fraction(&self, field: &str, value: &Value) -> Option<f64> {
377 let postings = self.fields.get(field)?;
378 let index_key = IndexKey::from_value(value)?;
379 let count = postings.get(&index_key).map_or(0, Vec::len);
380 Some(count as f64 / self.total as f64)
381 }
382}
383
384/// Intersection that retains the elements of `a` also present in `b`.
385fn intersect<K: Eq + Hash>(a: HashSet<K>, b: &HashSet<K>) -> HashSet<K> {
386 a.into_iter().filter(|k| b.contains(k)).collect()
387}
388
389#[cfg(test)]
390mod tests {
391 #![allow(clippy::unwrap_used)]
392
393 use super::*;
394 use iqdb_types::Metadata;
395
396 fn meta(pairs: &[(&str, Value)]) -> Metadata {
397 pairs
398 .iter()
399 .map(|(k, v)| ((*k).to_string(), v.clone()))
400 .collect()
401 }
402
403 fn corpus() -> Vec<(usize, Metadata)> {
404 vec![
405 (
406 0,
407 meta(&[
408 ("lang", Value::String("rust".into())),
409 ("year", Value::Int(2026)),
410 ]),
411 ),
412 (
413 1,
414 meta(&[
415 ("lang", Value::String("go".into())),
416 ("year", Value::Int(2024)),
417 ]),
418 ),
419 (
420 2,
421 meta(&[
422 ("lang", Value::String("rust".into())),
423 ("year", Value::Int(2020)),
424 ]),
425 ),
426 (3, meta(&[("lang", Value::String("rust".into()))])),
427 ]
428 }
429
430 fn index() -> MetadataIndex<usize> {
431 let rows = corpus();
432 MetadataIndex::build(&["lang", "year"], rows.iter().map(|(k, m)| (*k, Some(m))))
433 }
434
435 fn sorted(mut v: Vec<usize>) -> Vec<usize> {
436 v.sort_unstable();
437 v
438 }
439
440 fn cands(filter: Filter) -> Option<Vec<usize>> {
441 let evaluator = FilterEvaluator::new(filter).unwrap();
442 index().candidates(&evaluator).map(sorted)
443 }
444
445 #[test]
446 fn build_counts_all_records_and_fields() {
447 let idx = index();
448 assert_eq!(idx.len(), 4);
449 assert!(!idx.is_empty());
450 assert!(idx.is_indexed("lang"));
451 assert!(idx.is_indexed("year"));
452 assert!(!idx.is_indexed("missing"));
453 }
454
455 #[test]
456 fn eq_on_indexed_field_resolves() {
457 assert_eq!(
458 cands(Filter::eq("lang", Value::String("rust".into()))),
459 Some(vec![0, 2, 3])
460 );
461 }
462
463 #[test]
464 fn eq_with_no_postings_is_empty_not_none() {
465 assert_eq!(
466 cands(Filter::eq("lang", Value::String("zig".into()))),
467 Some(vec![])
468 );
469 }
470
471 #[test]
472 fn eq_on_unindexed_field_is_none() {
473 assert_eq!(
474 cands(Filter::eq("author", Value::String("ada".into()))),
475 None
476 );
477 }
478
479 #[test]
480 fn float_literal_is_none() {
481 assert_eq!(cands(Filter::eq("year", Value::Float(2026.0))), None);
482 }
483
484 #[test]
485 fn in_resolves_to_union() {
486 assert_eq!(
487 cands(Filter::is_in(
488 "lang",
489 vec![Value::String("go".into()), Value::String("rust".into())]
490 )),
491 Some(vec![0, 1, 2, 3])
492 );
493 }
494
495 #[test]
496 fn and_intersects_resolvable_children() {
497 // lang == rust AND year == 2026 -> only row 0.
498 assert_eq!(
499 cands(Filter::and(vec![
500 Filter::eq("lang", Value::String("rust".into())),
501 Filter::eq("year", Value::Int(2026)),
502 ])),
503 Some(vec![0])
504 );
505 }
506
507 #[test]
508 fn and_narrows_using_only_the_resolvable_child() {
509 // lang == rust AND (range, unresolved) -> narrowed to rust rows; the
510 // range is left for `evaluate`.
511 assert_eq!(
512 cands(Filter::and(vec![
513 Filter::eq("lang", Value::String("rust".into())),
514 Filter::gt("year", Value::Int(2021)),
515 ])),
516 Some(vec![0, 2, 3])
517 );
518 }
519
520 #[test]
521 fn or_with_unresolvable_child_is_none() {
522 assert_eq!(
523 cands(Filter::or(vec![
524 Filter::eq("lang", Value::String("rust".into())),
525 Filter::gt("year", Value::Int(2021)),
526 ])),
527 None
528 );
529 }
530
531 #[test]
532 fn not_and_ranges_are_none() {
533 assert_eq!(
534 cands(Filter::not(Filter::eq(
535 "lang",
536 Value::String("rust".into())
537 ))),
538 None
539 );
540 assert_eq!(cands(Filter::gt("year", Value::Int(2000))), None);
541 }
542
543 #[test]
544 fn candidates_are_a_superset_of_true_matches() {
545 // For the narrowing-And case, every true match must be present.
546 let rows = corpus();
547 let filter = Filter::and(vec![
548 Filter::eq("lang", Value::String("rust".into())),
549 Filter::gt("year", Value::Int(2021)),
550 ]);
551 let evaluator = FilterEvaluator::new(filter).unwrap();
552 let candidates: std::collections::HashSet<usize> = index()
553 .candidates(&evaluator)
554 .unwrap()
555 .into_iter()
556 .collect();
557
558 for (k, m) in &rows {
559 if evaluator.evaluate(Some(m)) {
560 assert!(
561 candidates.contains(k),
562 "true match {k} missing from candidates"
563 );
564 }
565 }
566 }
567
568 #[test]
569 fn index_backed_selectivity_uses_real_counts() {
570 let idx = index();
571 // 3 of 4 rows are rust.
572 let rust = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
573 assert!((idx.estimate_selectivity(&rust) - 0.75).abs() < 1e-9);
574 // 1 of 4 rows has year == 2026.
575 let y = FilterEvaluator::new(Filter::eq("year", Value::Int(2026))).unwrap();
576 assert!((idx.estimate_selectivity(&y) - 0.25).abs() < 1e-9);
577 }
578
579 #[test]
580 fn empty_index_falls_back_to_structural() {
581 let empty: MetadataIndex<usize> = MetadataIndex::build(&["lang"], std::iter::empty());
582 assert!(empty.is_empty());
583 let eq = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
584 // Structural EQ estimate, not a divide-by-zero.
585 let s = empty.estimate_selectivity(&eq);
586 assert!((0.0..=1.0).contains(&s));
587 }
588
589 #[test]
590 fn records_without_metadata_count_but_do_not_post() {
591 let rows: Vec<(usize, Option<&Metadata>)> = vec![(0, None), (1, None)];
592 let idx: MetadataIndex<usize> = MetadataIndex::build(&["lang"], rows);
593 assert_eq!(idx.len(), 2);
594 let eq = FilterEvaluator::new(Filter::eq("lang", Value::String("rust".into()))).unwrap();
595 assert_eq!(idx.candidates(&eq), Some(vec![]));
596 }
597}