Skip to main content

nodedb_vector/sieve/
collection.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! SieveCollection — a keyed map of specialized HNSW subindices, one per
4//! stable predicate signature (e.g. "tenant_id=42", "lang=en").
5
6use std::collections::HashMap;
7
8use crate::error::VectorError;
9use crate::hnsw::graph::HnswIndex;
10use nodedb_types::hnsw::HnswParams;
11use nodedb_types::vector_distance::DistanceMetric;
12
13/// A stable predicate signature string, e.g. `"tenant_id=42"` or `"lang=en"`.
14pub type PredicateSignature = String;
15
16/// A collection of specialized HNSW subindices, one per stable predicate.
17///
18/// Each subindex is built over the subset of vectors that match its predicate.
19/// Smaller datasets mean a lower `sub_m` is sufficient, keeping memory overhead
20/// proportional to the per-predicate population rather than the global index.
21pub struct SieveCollection {
22    /// Map from predicate signature → specialized HNSW subindex.
23    subindices: HashMap<PredicateSignature, HnswIndex>,
24    /// `M` parameter used when creating subindices.
25    /// Smaller than the global M because each subindex covers fewer vectors.
26    sub_m: usize,
27}
28
29impl SieveCollection {
30    /// Create an empty collection.  `sub_m` is the HNSW `M` parameter used for
31    /// every subindex built by this collection.
32    pub fn new(sub_m: usize) -> Self {
33        Self {
34            subindices: HashMap::new(),
35            sub_m,
36        }
37    }
38
39    /// Build (or rebuild) a specialized subindex for `signature` from the
40    /// provided `(id, vector)` pairs.
41    ///
42    /// All vectors must have length `dim`.  The subindex uses `metric` for
43    /// distance computation.
44    ///
45    /// # Errors
46    ///
47    /// Returns `VectorError` if any insertion fails (e.g. dimension mismatch).
48    pub fn build_subindex(
49        &mut self,
50        signature: PredicateSignature,
51        vectors: &[(u32, Vec<f32>)],
52        dim: usize,
53        metric: DistanceMetric,
54    ) -> Result<(), VectorError> {
55        let params = HnswParams {
56            m: self.sub_m,
57            m0: self.sub_m * 2,
58            ef_construction: 200,
59            metric,
60        };
61        let mut index = HnswIndex::new(dim, params);
62        for (_, vec) in vectors {
63            index.insert(vec.clone())?;
64        }
65        self.subindices.insert(signature, index);
66        Ok(())
67    }
68
69    /// Returns `true` if a subindex exists for the given signature.
70    pub fn has(&self, signature: &PredicateSignature) -> bool {
71        self.subindices.contains_key(signature)
72    }
73
74    /// Returns a shared reference to the subindex for `signature`, or `None`.
75    pub fn get(&self, signature: &PredicateSignature) -> Option<&HnswIndex> {
76        self.subindices.get(signature)
77    }
78
79    /// Remove the subindex for `signature`, freeing its memory.
80    pub fn drop(&mut self, signature: &PredicateSignature) {
81        self.subindices.remove(signature);
82    }
83
84    /// All predicate signatures currently held in this collection.
85    pub fn signatures(&self) -> Vec<&PredicateSignature> {
86        self.subindices.keys().collect()
87    }
88}
89
90// Expose SearchResult at crate level through this module for callers that
91// import from `sieve::collection`.
92pub use crate::hnsw::graph::SearchResult as SubindexSearchResult;
93
94#[cfg(test)]
95mod tests {
96    use super::*;
97
98    fn sample_vectors(n: usize, dim: usize) -> Vec<(u32, Vec<f32>)> {
99        (0..n).map(|i| (i as u32, vec![i as f32; dim])).collect()
100    }
101
102    #[test]
103    fn build_subindex_has_and_get() {
104        let mut coll = SieveCollection::new(8);
105        let vecs = sample_vectors(5, 3);
106        coll.build_subindex("tenant_id=42".to_string(), &vecs, 3, DistanceMetric::L2)
107            .expect("build should succeed");
108
109        assert!(coll.has(&"tenant_id=42".to_string()));
110        let idx = coll.get(&"tenant_id=42".to_string());
111        assert!(idx.is_some());
112        assert_eq!(idx.unwrap().len(), 5);
113    }
114
115    #[test]
116    fn drop_removes_subindex() {
117        let mut coll = SieveCollection::new(8);
118        let vecs = sample_vectors(5, 3);
119        coll.build_subindex("lang=en".to_string(), &vecs, 3, DistanceMetric::Cosine)
120            .expect("build should succeed");
121        assert!(coll.has(&"lang=en".to_string()));
122
123        coll.drop(&"lang=en".to_string());
124        assert!(!coll.has(&"lang=en".to_string()));
125        assert!(coll.get(&"lang=en".to_string()).is_none());
126    }
127
128    #[test]
129    fn signatures_lists_all_keys() {
130        let mut coll = SieveCollection::new(8);
131        let vecs = sample_vectors(3, 2);
132        coll.build_subindex("a".to_string(), &vecs, 2, DistanceMetric::L2)
133            .expect("build a");
134        coll.build_subindex("b".to_string(), &vecs, 2, DistanceMetric::L2)
135            .expect("build b");
136
137        let mut sigs: Vec<String> = coll.signatures().into_iter().cloned().collect();
138        sigs.sort();
139        assert_eq!(sigs, vec!["a".to_string(), "b".to_string()]);
140    }
141
142    #[test]
143    fn search_on_subindex() {
144        let mut coll = SieveCollection::new(8);
145        let vecs = sample_vectors(5, 3);
146        coll.build_subindex("tenant_id=1".to_string(), &vecs, 3, DistanceMetric::L2)
147            .expect("build");
148
149        let idx = coll.get(&"tenant_id=1".to_string()).unwrap();
150        let results = idx.search(&[2.0, 2.0, 2.0], 2, 32);
151        assert!(!results.is_empty());
152    }
153}