Skip to main content

kora_doc/
index.rs

1//! Secondary index data structures for document collections.
2//!
3//! Four index types cover the query patterns a document engine needs:
4//!
5//! | Type | Struct | Backing store | Use case |
6//! |------|--------|---------------|----------|
7//! | `Hash` | [`HashIndex`] | `HashMap<u32, Vec<DocId>>` | Equality lookups (`field = value`) |
8//! | `Sorted` | [`SortedIndex`] | `BTreeMap<OrderedF64, Vec<DocId>>` | Numeric range queries (`field >= N`) |
9//! | `Array` | (reuses [`HashIndex`]) | `HashMap<u32, Vec<DocId>>` | Array membership (`field CONTAINS value`) |
10//! | `Unique` | [`UniqueIndex`] | `HashMap<u32, Vec<DocId>>` | Unique-constraint enforcement |
11//!
12//! All hash-based indexes key on 32-bit FNV-1a hashes of the field value's
13//! canonical byte representation. Buckets store `DocId` lists in sorted order
14//! so that set intersection and union can be performed with merge-join
15//! ([`intersect_sorted`], [`union_sorted`]).
16//!
17//! [`CollectionIndexes`] groups all index instances for a single collection
18//! behind a unified accessor API, creating index structures lazily on first
19//! access. [`IndexConfig`] records which fields have indexes and of what type,
20//! driving the engine's index-maintenance loop on writes.
21//!
22//! ## Unique Index Semantics
23//!
24//! [`UniqueIndex`] is structurally identical to [`HashIndex`] but carries
25//! different semantics: the engine must verify exact-value uniqueness against
26//! the candidate list before inserting, because 32-bit hash collisions are
27//! expected.
28
29use std::collections::{BTreeMap, HashMap};
30
31use thiserror::Error;
32
33use crate::registry::{DocId, FieldId};
34
35/// Type of secondary index on a collection field.
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
37pub enum IndexType {
38    /// Hash-based equality index for fast point lookups.
39    Hash,
40    /// B-tree sorted index for range queries on numeric fields.
41    Sorted,
42    /// Hash-based index for array element membership queries.
43    Array,
44    /// Hash-based unique constraint index.
45    Unique,
46}
47
48/// Errors from index operations.
49#[derive(Debug, Error, PartialEq, Eq)]
50pub enum IndexError {
51    /// An index already exists for this field.
52    #[error("index already exists for field {0}")]
53    AlreadyExists(FieldId),
54    /// No index exists for this field.
55    #[error("no index found for field {0}")]
56    NotFound(FieldId),
57    /// A unique constraint violation occurred.
58    #[error("unique index violation: hash {hash} already maps to doc_id {existing_doc_id}")]
59    UniqueViolation {
60        /// The colliding hash bucket key.
61        hash: u32,
62        /// The document that already occupies the slot.
63        existing_doc_id: DocId,
64    },
65}
66
67/// Per-collection mapping of field IDs to their configured index types.
68#[derive(Debug, Default)]
69pub struct IndexConfig {
70    entries: HashMap<FieldId, IndexType>,
71}
72
73impl IndexConfig {
74    /// Create an empty index configuration.
75    #[must_use]
76    pub fn new() -> Self {
77        Self::default()
78    }
79
80    /// Register an index for a field. Returns an error if one already exists.
81    pub fn add(&mut self, field_id: FieldId, index_type: IndexType) -> Result<(), IndexError> {
82        if self.entries.contains_key(&field_id) {
83            return Err(IndexError::AlreadyExists(field_id));
84        }
85        self.entries.insert(field_id, index_type);
86        Ok(())
87    }
88
89    /// Remove an index configuration for a field. Returns an error if not found.
90    pub fn remove(&mut self, field_id: FieldId) -> Result<IndexType, IndexError> {
91        self.entries
92            .remove(&field_id)
93            .ok_or(IndexError::NotFound(field_id))
94    }
95
96    /// Look up the index type for a field.
97    #[must_use]
98    pub fn lookup(&self, field_id: FieldId) -> Option<IndexType> {
99        self.entries.get(&field_id).copied()
100    }
101
102    /// Return all configured field-to-index-type pairs.
103    #[must_use]
104    pub fn entries(&self) -> &HashMap<FieldId, IndexType> {
105        &self.entries
106    }
107}
108
109/// FNV-1a 32-bit hash for index bucket keys.
110#[must_use]
111pub fn hash32(data: &[u8]) -> u32 {
112    let mut hash: u32 = 0x811c_9dc5;
113    for &byte in data {
114        hash ^= u32::from(byte);
115        hash = hash.wrapping_mul(0x0100_0193);
116    }
117    hash
118}
119
120/// Hash-based equality index mapping FNV-1a bucket keys to sorted doc ID lists.
121#[derive(Debug, Default)]
122pub struct HashIndex {
123    buckets: HashMap<u32, Vec<DocId>>,
124}
125
126impl HashIndex {
127    /// Create an empty hash index.
128    #[must_use]
129    pub fn new() -> Self {
130        Self::default()
131    }
132
133    /// Add a document to a hash bucket, maintaining sorted order. Duplicate adds are idempotent.
134    pub fn add(&mut self, hash: u32, doc_id: DocId) {
135        let bucket = self.buckets.entry(hash).or_default();
136        match bucket.binary_search(&doc_id) {
137            Ok(_) => {}
138            Err(pos) => bucket.insert(pos, doc_id),
139        }
140    }
141
142    /// Remove a document from a hash bucket. No-op if not present.
143    pub fn remove(&mut self, hash: u32, doc_id: DocId) {
144        if let Some(bucket) = self.buckets.get_mut(&hash) {
145            if let Ok(pos) = bucket.binary_search(&doc_id) {
146                bucket.remove(pos);
147            }
148            if bucket.is_empty() {
149                self.buckets.remove(&hash);
150            }
151        }
152    }
153
154    /// Look up all document IDs in a hash bucket.
155    #[must_use]
156    pub fn lookup(&self, hash: u32) -> &[DocId] {
157        self.buckets.get(&hash).map_or(&[], Vec::as_slice)
158    }
159
160    /// Remove all entries from the index.
161    pub fn clear(&mut self) {
162        self.buckets.clear();
163    }
164}
165
166/// Wrapper around `f64` providing total ordering suitable for `BTreeMap` keys.
167///
168/// Ordering: all negative values (ordered normally) < negative zero < positive zero <
169/// all positive values (ordered normally) < NaN.
170#[derive(Debug, Clone, Copy)]
171pub struct OrderedF64(f64);
172
173impl OrderedF64 {
174    /// Wrap a raw `f64` value.
175    #[must_use]
176    pub fn new(value: f64) -> Self {
177        Self(value)
178    }
179
180    /// Return the inner `f64` value.
181    #[must_use]
182    pub fn value(self) -> f64 {
183        self.0
184    }
185
186    fn to_sort_key(self) -> u64 {
187        let bits = self.0.to_bits();
188        if self.0.is_nan() {
189            return u64::MAX;
190        }
191        if bits >> 63 == 1 {
192            !bits
193        } else {
194            bits | (1 << 63)
195        }
196    }
197}
198
199impl PartialEq for OrderedF64 {
200    fn eq(&self, other: &Self) -> bool {
201        self.to_sort_key() == other.to_sort_key()
202    }
203}
204
205impl Eq for OrderedF64 {}
206
207impl PartialOrd for OrderedF64 {
208    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
209        Some(self.cmp(other))
210    }
211}
212
213impl Ord for OrderedF64 {
214    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
215        self.to_sort_key().cmp(&other.to_sort_key())
216    }
217}
218
219/// Sorted index backed by a `BTreeMap` for numeric range queries.
220#[derive(Debug, Default)]
221pub struct SortedIndex {
222    tree: BTreeMap<OrderedF64, Vec<DocId>>,
223}
224
225impl SortedIndex {
226    /// Create an empty sorted index.
227    #[must_use]
228    pub fn new() -> Self {
229        Self::default()
230    }
231
232    /// Add a document to a sorted bucket, maintaining sorted order within the bucket.
233    pub fn add(&mut self, key: f64, doc_id: DocId) {
234        let bucket = self.tree.entry(OrderedF64::new(key)).or_default();
235        match bucket.binary_search(&doc_id) {
236            Ok(_) => {}
237            Err(pos) => bucket.insert(pos, doc_id),
238        }
239    }
240
241    /// Remove a document from a sorted bucket. No-op if not present.
242    pub fn remove(&mut self, key: f64, doc_id: DocId) {
243        let ordered = OrderedF64::new(key);
244        if let Some(bucket) = self.tree.get_mut(&ordered) {
245            if let Ok(pos) = bucket.binary_search(&doc_id) {
246                bucket.remove(pos);
247            }
248            if bucket.is_empty() {
249                self.tree.remove(&ordered);
250            }
251        }
252    }
253
254    /// Return all document IDs whose keys fall within `[min, max]` (inclusive), sorted.
255    #[must_use]
256    pub fn range_query(&self, min: f64, max: f64) -> Vec<DocId> {
257        let lo = OrderedF64::new(min);
258        let hi = OrderedF64::new(max);
259        let mut result = Vec::new();
260        for (_key, bucket) in self.tree.range(lo..=hi) {
261            result.extend_from_slice(bucket);
262        }
263        result.sort_unstable();
264        result.dedup();
265        result
266    }
267
268    /// Remove all entries from the index.
269    pub fn clear(&mut self) {
270        self.tree.clear();
271    }
272}
273
274/// Unique-constraint helper index mapping hash bucket keys to sorted doc ID candidates.
275///
276/// Collisions are expected with 32-bit hashes; caller-side value verification must enforce
277/// exact-value uniqueness before insert.
278#[derive(Debug, Default)]
279pub struct UniqueIndex {
280    entries: HashMap<u32, Vec<DocId>>,
281}
282
283impl UniqueIndex {
284    /// Create an empty unique index.
285    #[must_use]
286    pub fn new() -> Self {
287        Self::default()
288    }
289
290    /// Add a document candidate to a hash bucket, maintaining sorted order.
291    pub fn add(&mut self, hash: u32, doc_id: DocId) {
292        let bucket = self.entries.entry(hash).or_default();
293        match bucket.binary_search(&doc_id) {
294            Ok(_) => {}
295            Err(pos) => bucket.insert(pos, doc_id),
296        }
297    }
298
299    /// Remove a document from a hash bucket. No-op if not present.
300    pub fn remove(&mut self, hash: u32, doc_id: DocId) {
301        if let Some(bucket) = self.entries.get_mut(&hash) {
302            if let Ok(pos) = bucket.binary_search(&doc_id) {
303                bucket.remove(pos);
304            }
305            if bucket.is_empty() {
306                self.entries.remove(&hash);
307            }
308        }
309    }
310
311    /// Look up all candidate documents occupying a hash bucket.
312    #[must_use]
313    pub fn lookup(&self, hash: u32) -> &[DocId] {
314        self.entries.get(&hash).map_or(&[], Vec::as_slice)
315    }
316
317    /// Remove all entries from the index.
318    pub fn clear(&mut self) {
319        self.entries.clear();
320    }
321}
322
323/// Container holding all index instances for a single collection.
324#[derive(Debug, Default)]
325pub struct CollectionIndexes {
326    hash_indexes: HashMap<FieldId, HashIndex>,
327    sorted_indexes: HashMap<FieldId, SortedIndex>,
328    array_indexes: HashMap<FieldId, HashIndex>,
329    unique_indexes: HashMap<FieldId, UniqueIndex>,
330}
331
332impl CollectionIndexes {
333    /// Create an empty collection indexes container.
334    #[must_use]
335    pub fn new() -> Self {
336        Self::default()
337    }
338
339    /// Return a mutable reference to the hash index for a field, creating one if absent.
340    pub fn get_or_create_hash(&mut self, field_id: FieldId) -> &mut HashIndex {
341        self.hash_indexes.entry(field_id).or_default()
342    }
343
344    /// Return a mutable reference to the sorted index for a field, creating one if absent.
345    pub fn get_or_create_sorted(&mut self, field_id: FieldId) -> &mut SortedIndex {
346        self.sorted_indexes.entry(field_id).or_default()
347    }
348
349    /// Return a mutable reference to the array index for a field, creating one if absent.
350    pub fn get_or_create_array(&mut self, field_id: FieldId) -> &mut HashIndex {
351        self.array_indexes.entry(field_id).or_default()
352    }
353
354    /// Return a mutable reference to the unique index for a field, creating one if absent.
355    pub fn get_or_create_unique(&mut self, field_id: FieldId) -> &mut UniqueIndex {
356        self.unique_indexes.entry(field_id).or_default()
357    }
358
359    /// Return an immutable reference to the hash index for a field.
360    #[must_use]
361    pub fn hash(&self, field_id: FieldId) -> Option<&HashIndex> {
362        self.hash_indexes.get(&field_id)
363    }
364
365    /// Return an immutable reference to the sorted index for a field.
366    #[must_use]
367    pub fn sorted(&self, field_id: FieldId) -> Option<&SortedIndex> {
368        self.sorted_indexes.get(&field_id)
369    }
370
371    /// Return an immutable reference to the array index for a field.
372    #[must_use]
373    pub fn array(&self, field_id: FieldId) -> Option<&HashIndex> {
374        self.array_indexes.get(&field_id)
375    }
376
377    /// Return an immutable reference to the unique index for a field.
378    #[must_use]
379    pub fn unique(&self, field_id: FieldId) -> Option<&UniqueIndex> {
380        self.unique_indexes.get(&field_id)
381    }
382
383    /// Remove all indexes for a field across all index types.
384    pub fn remove_field(&mut self, field_id: FieldId) {
385        self.hash_indexes.remove(&field_id);
386        self.sorted_indexes.remove(&field_id);
387        self.array_indexes.remove(&field_id);
388        self.unique_indexes.remove(&field_id);
389    }
390}
391
392/// Compute the sorted intersection of two sorted `DocId` slices (merge-join).
393#[must_use]
394pub fn intersect_sorted(a: &[DocId], b: &[DocId]) -> Vec<DocId> {
395    let mut result = Vec::new();
396    let (mut i, mut j) = (0, 0);
397    while i < a.len() && j < b.len() {
398        match a[i].cmp(&b[j]) {
399            std::cmp::Ordering::Less => i += 1,
400            std::cmp::Ordering::Greater => j += 1,
401            std::cmp::Ordering::Equal => {
402                result.push(a[i]);
403                i += 1;
404                j += 1;
405            }
406        }
407    }
408    result
409}
410
411/// Compute the sorted union of two sorted `DocId` slices (merge-union).
412#[must_use]
413pub fn union_sorted(a: &[DocId], b: &[DocId]) -> Vec<DocId> {
414    let mut result = Vec::with_capacity(a.len() + b.len());
415    let (mut i, mut j) = (0, 0);
416    while i < a.len() && j < b.len() {
417        match a[i].cmp(&b[j]) {
418            std::cmp::Ordering::Less => {
419                result.push(a[i]);
420                i += 1;
421            }
422            std::cmp::Ordering::Greater => {
423                result.push(b[j]);
424                j += 1;
425            }
426            std::cmp::Ordering::Equal => {
427                result.push(a[i]);
428                i += 1;
429                j += 1;
430            }
431        }
432    }
433    result.extend_from_slice(&a[i..]);
434    result.extend_from_slice(&b[j..]);
435    result
436}
437
438#[cfg(test)]
439mod tests {
440    use super::*;
441
442    #[test]
443    fn hash_index_add_remove_lookup() {
444        let mut idx = HashIndex::new();
445        idx.add(100, 1);
446        idx.add(100, 3);
447        idx.add(100, 2);
448        assert_eq!(idx.lookup(100), &[1, 2, 3]);
449
450        idx.remove(100, 2);
451        assert_eq!(idx.lookup(100), &[1, 3]);
452
453        idx.remove(100, 1);
454        idx.remove(100, 3);
455        assert_eq!(idx.lookup(100), &[] as &[DocId]);
456    }
457
458    #[test]
459    fn hash_index_duplicate_add_is_idempotent() {
460        let mut idx = HashIndex::new();
461        idx.add(42, 7);
462        idx.add(42, 7);
463        idx.add(42, 7);
464        assert_eq!(idx.lookup(42), &[7]);
465    }
466
467    #[test]
468    fn hash_index_remove_nonexistent_is_noop() {
469        let mut idx = HashIndex::new();
470        idx.remove(999, 1);
471        idx.add(10, 5);
472        idx.remove(10, 99);
473        assert_eq!(idx.lookup(10), &[5]);
474    }
475
476    #[test]
477    fn sorted_index_add_remove_range_query() {
478        let mut idx = SortedIndex::new();
479        idx.add(1.0, 10);
480        idx.add(3.0, 30);
481        idx.add(2.0, 20);
482        idx.add(2.0, 25);
483
484        let result = idx.range_query(1.5, 3.0);
485        assert_eq!(result, vec![20, 25, 30]);
486
487        idx.remove(2.0, 20);
488        let result = idx.range_query(1.5, 3.0);
489        assert_eq!(result, vec![25, 30]);
490    }
491
492    #[test]
493    fn ordered_f64_ordering_negative_zero_positive() {
494        let neg = OrderedF64::new(-5.0);
495        let zero = OrderedF64::new(0.0);
496        let pos = OrderedF64::new(10.0);
497        let nan = OrderedF64::new(f64::NAN);
498
499        assert!(neg < zero);
500        assert!(zero < pos);
501        assert!(pos < nan);
502        assert!(neg < pos);
503        assert!(neg < nan);
504    }
505
506    #[test]
507    fn unique_index_add_lookup() {
508        let mut idx = UniqueIndex::new();
509        idx.add(42, 1);
510        assert_eq!(idx.lookup(42), &[1]);
511    }
512
513    #[test]
514    fn unique_index_allows_hash_collisions() {
515        let mut idx = UniqueIndex::new();
516        idx.add(42, 1);
517        idx.add(42, 2);
518        assert_eq!(idx.lookup(42), &[1, 2]);
519    }
520
521    #[test]
522    fn unique_index_same_doc_id_same_hash_is_ok() {
523        let mut idx = UniqueIndex::new();
524        idx.add(42, 1);
525        idx.add(42, 1);
526        assert_eq!(idx.lookup(42), &[1]);
527    }
528
529    #[test]
530    fn unique_index_remove_doc_id_only() {
531        let mut idx = UniqueIndex::new();
532        idx.add(42, 1);
533        idx.add(42, 2);
534        idx.remove(42, 1);
535        assert_eq!(idx.lookup(42), &[2]);
536        idx.remove(42, 2);
537        assert_eq!(idx.lookup(42), &[] as &[DocId]);
538    }
539
540    #[test]
541    fn index_config_add_remove_entries() {
542        let mut config = IndexConfig::new();
543        config.add(0, IndexType::Hash).expect("add should succeed");
544        config
545            .add(1, IndexType::Sorted)
546            .expect("add should succeed");
547
548        assert_eq!(config.lookup(0), Some(IndexType::Hash));
549        assert_eq!(config.lookup(1), Some(IndexType::Sorted));
550        assert_eq!(config.entries().len(), 2);
551
552        let removed = config.remove(0).expect("remove should succeed");
553        assert_eq!(removed, IndexType::Hash);
554        assert_eq!(config.lookup(0), None);
555    }
556
557    #[test]
558    fn index_config_duplicate_add_rejected() {
559        let mut config = IndexConfig::new();
560        config.add(0, IndexType::Hash).expect("add should succeed");
561        let err = config
562            .add(0, IndexType::Sorted)
563            .expect_err("duplicate add must fail");
564        assert!(matches!(err, IndexError::AlreadyExists(0)));
565    }
566
567    #[test]
568    fn index_config_remove_nonexistent_rejected() {
569        let mut config = IndexConfig::new();
570        let err = config
571            .remove(99)
572            .expect_err("remove non-existent must fail");
573        assert!(matches!(err, IndexError::NotFound(99)));
574    }
575
576    #[test]
577    fn intersect_sorted_disjoint() {
578        let result = intersect_sorted(&[1, 3, 5], &[2, 4, 6]);
579        assert!(result.is_empty());
580    }
581
582    #[test]
583    fn intersect_sorted_overlapping() {
584        let result = intersect_sorted(&[1, 2, 3, 5], &[2, 3, 4, 5]);
585        assert_eq!(result, vec![2, 3, 5]);
586    }
587
588    #[test]
589    fn intersect_sorted_empty_input() {
590        assert!(intersect_sorted(&[], &[1, 2, 3]).is_empty());
591        assert!(intersect_sorted(&[1, 2, 3], &[]).is_empty());
592        assert!(intersect_sorted(&[], &[]).is_empty());
593    }
594
595    #[test]
596    fn union_sorted_disjoint() {
597        let result = union_sorted(&[1, 3, 5], &[2, 4, 6]);
598        assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
599    }
600
601    #[test]
602    fn union_sorted_overlapping() {
603        let result = union_sorted(&[1, 2, 3], &[2, 3, 4]);
604        assert_eq!(result, vec![1, 2, 3, 4]);
605    }
606
607    #[test]
608    fn union_sorted_empty_input() {
609        assert_eq!(union_sorted(&[], &[1, 2]), vec![1, 2]);
610        assert_eq!(union_sorted(&[1, 2], &[]), vec![1, 2]);
611        assert!(union_sorted(&[], &[]).is_empty());
612    }
613}