Skip to main content

grafeo_engine/database/
index.rs

1//! Index management for GrafeoDB (property, vector, and text indexes).
2
3use grafeo_common::grafeo_info;
4#[cfg(any(feature = "vector-index", feature = "text-index"))]
5use std::sync::Arc;
6
7#[cfg(feature = "text-index")]
8use parking_lot::RwLock;
9
10use grafeo_common::utils::error::Result;
11
12impl super::GrafeoDB {
13    // =========================================================================
14    // PROPERTY INDEX API
15    // =========================================================================
16
17    /// Creates an index on a node property for O(1) lookups by value.
18    ///
19    /// After creating an index, calls to [`Self::find_nodes_by_property`] will be
20    /// O(1) instead of O(n) for this property. The index is automatically
21    /// maintained when properties are set or removed.
22    ///
23    /// # Example
24    ///
25    /// ```no_run
26    /// # use grafeo_engine::GrafeoDB;
27    /// # use grafeo_common::types::Value;
28    /// # let db = GrafeoDB::new_in_memory();
29    /// // Create an index on the 'email' property
30    /// db.create_property_index("email");
31    ///
32    /// // Now lookups by email are O(1)
33    /// let nodes = db.find_nodes_by_property("email", &Value::from("alix@example.com"));
34    /// ```
35    pub fn create_property_index(&self, property: &str) {
36        self.lpg_store().create_property_index(property);
37    }
38
39    /// Drops an index on a node property.
40    ///
41    /// Returns `true` if the index existed and was removed.
42    pub fn drop_property_index(&self, property: &str) -> bool {
43        self.lpg_store().drop_property_index(property)
44    }
45
46    /// Returns `true` if the property has an index.
47    #[must_use]
48    pub fn has_property_index(&self, property: &str) -> bool {
49        self.lpg_store().has_property_index(property)
50    }
51
52    /// Finds all nodes that have a specific property value.
53    ///
54    /// If the property is indexed, this is O(1). Otherwise, it scans all nodes
55    /// which is O(n). Use [`Self::create_property_index`] for frequently queried properties.
56    ///
57    /// # Example
58    ///
59    /// ```no_run
60    /// # use grafeo_engine::GrafeoDB;
61    /// # use grafeo_common::types::Value;
62    /// # let db = GrafeoDB::new_in_memory();
63    /// // Create index for fast lookups (optional but recommended)
64    /// db.create_property_index("city");
65    ///
66    /// // Find all nodes where city = "NYC"
67    /// let nyc_nodes = db.find_nodes_by_property("city", &Value::from("NYC"));
68    /// ```
69    #[must_use]
70    pub fn find_nodes_by_property(
71        &self,
72        property: &str,
73        value: &grafeo_common::types::Value,
74    ) -> Vec<grafeo_common::types::NodeId> {
75        self.lpg_store().find_nodes_by_property(property, value)
76    }
77
78    // =========================================================================
79    // VECTOR INDEX API
80    // =========================================================================
81
82    /// Creates a vector similarity index on a node property.
83    ///
84    /// This enables efficient approximate nearest-neighbor search on vector
85    /// properties. Currently validates the index parameters and scans existing
86    /// nodes to verify the property contains vectors of the expected dimensions.
87    ///
88    /// # Arguments
89    ///
90    /// * `label` - Node label to index (e.g., `"Doc"`)
91    /// * `property` - Property containing vector embeddings (e.g., `"embedding"`)
92    /// * `dimensions` - Expected vector dimensions (inferred from data if `None`)
93    /// * `metric` - Distance metric: `"cosine"` (default), `"euclidean"`, `"dot_product"`, `"manhattan"`
94    /// * `m` - HNSW links per node (default: 16). Higher = better recall, more memory.
95    /// * `ef_construction` - Construction beam width (default: 128). Higher = better index quality, slower build.
96    /// * `quantization` - Quantization mode: `None` (default), `"scalar"`, `"binary"`, or `"product"`.
97    ///   Quantized indexes use less memory at the cost of slightly lower recall.
98    ///
99    /// # Errors
100    ///
101    /// Returns an error if the metric is invalid, no vectors are found, or
102    /// dimensions don't match.
103    #[allow(clippy::too_many_arguments)]
104    pub fn create_vector_index(
105        &self,
106        label: &str,
107        property: &str,
108        dimensions: Option<usize>,
109        metric: Option<&str>,
110        m: Option<usize>,
111        ef_construction: Option<usize>,
112        quantization: Option<&str>,
113    ) -> Result<()> {
114        use grafeo_common::types::{PropertyKey, Value};
115        use grafeo_core::index::vector::DistanceMetric;
116
117        let metric = match metric {
118            Some(m) => DistanceMetric::from_str(m).ok_or_else(|| {
119                grafeo_common::utils::error::Error::Internal(format!(
120                    "Unknown distance metric '{}'. Use: cosine, euclidean, dot_product, manhattan",
121                    m
122                ))
123            })?,
124            None => DistanceMetric::Cosine,
125        };
126
127        #[cfg(feature = "vector-index")]
128        let quantization_type = Self::parse_quantization(quantization)?;
129        #[cfg(not(feature = "vector-index"))]
130        let _ = quantization;
131
132        // Scan nodes to validate vectors exist and check dimensions
133        let prop_key = PropertyKey::new(property);
134        let mut found_dims: Option<usize> = dimensions;
135        let mut vector_count = 0usize;
136
137        #[cfg(feature = "vector-index")]
138        let mut vectors: Vec<(grafeo_common::types::NodeId, Vec<f32>)> = Vec::new();
139
140        for node in self.lpg_store().nodes_with_label(label) {
141            if let Some(Value::Vector(v)) = node.properties.get(&prop_key) {
142                if let Some(expected) = found_dims {
143                    if v.len() != expected {
144                        return Err(grafeo_common::utils::error::Error::Internal(format!(
145                            "Vector dimension mismatch: expected {}, found {} on node {}",
146                            expected,
147                            v.len(),
148                            node.id.0
149                        )));
150                    }
151                } else {
152                    found_dims = Some(v.len());
153                }
154                vector_count += 1;
155                #[cfg(feature = "vector-index")]
156                vectors.push((node.id, v.to_vec()));
157            }
158        }
159
160        let Some(dims) = found_dims else {
161            // No vectors found yet: caller must have supplied explicit dimensions
162            // so we can create an empty index that auto-populates via set_node_property.
163            return if let Some(d) = dimensions {
164                #[cfg(feature = "vector-index")]
165                {
166                    let index = Self::build_vector_index(
167                        d,
168                        metric,
169                        m,
170                        ef_construction,
171                        quantization_type,
172                        0,
173                    );
174                    self.lpg_store()
175                        .add_vector_index(label, property, Arc::new(index));
176                }
177
178                let _ = (m, ef_construction);
179                grafeo_info!(
180                    "Empty vector index created: :{label}({property}) - 0 vectors, {d} dimensions, metric={metric_name}",
181                    metric_name = metric.name()
182                );
183                Ok(())
184            } else {
185                Err(grafeo_common::utils::error::Error::Internal(format!(
186                    "No vector properties found on :{label}({property}) and no dimensions specified"
187                )))
188            };
189        };
190
191        // Build and populate the vector index
192        #[cfg(feature = "vector-index")]
193        {
194            use grafeo_core::index::vector::VectorIndexKind;
195
196            let index = Self::build_vector_index(
197                dims,
198                metric,
199                m,
200                ef_construction,
201                quantization_type,
202                vectors.len(),
203            );
204
205            match &index {
206                VectorIndexKind::Hnsw(_) => {
207                    let accessor = grafeo_core::index::vector::PropertyVectorAccessor::new(
208                        &**self.lpg_store(),
209                        property,
210                    );
211                    for (node_id, vec) in &vectors {
212                        index.insert(*node_id, vec, &accessor);
213                    }
214                }
215                VectorIndexKind::Quantized(q_idx) => {
216                    for (node_id, vec) in &vectors {
217                        q_idx.insert(*node_id, vec);
218                    }
219                }
220            }
221
222            self.lpg_store()
223                .add_vector_index(label, property, Arc::new(index));
224        }
225
226        // Suppress unused variable warnings when vector-index is off
227        let _ = (m, ef_construction);
228
229        grafeo_info!(
230            "Vector index created: :{label}({property}) - {vector_count} vectors, {dims} dimensions, metric={metric_name}",
231            metric_name = metric.name()
232        );
233
234        Ok(())
235    }
236
237    /// Parses a quantization string into a [`QuantizationType`].
238    #[cfg(feature = "vector-index")]
239    fn parse_quantization(
240        quantization: Option<&str>,
241    ) -> Result<grafeo_core::index::vector::QuantizationType> {
242        use grafeo_core::index::vector::QuantizationType;
243        match quantization {
244            None | Some("none") => Ok(QuantizationType::None),
245            Some("scalar") => Ok(QuantizationType::Scalar),
246            Some("binary") => Ok(QuantizationType::Binary),
247            Some("product") => Ok(QuantizationType::Product { num_subvectors: 8 }),
248            Some(other) => Err(grafeo_common::utils::error::Error::Internal(format!(
249                "Unknown quantization type '{other}'. Use: scalar, binary, product"
250            ))),
251        }
252    }
253
254    /// Builds a [`VectorIndexKind`] from the given parameters.
255    #[cfg(feature = "vector-index")]
256    fn build_vector_index(
257        dims: usize,
258        metric: grafeo_core::index::vector::DistanceMetric,
259        m: Option<usize>,
260        ef_construction: Option<usize>,
261        quantization: grafeo_core::index::vector::QuantizationType,
262        capacity: usize,
263    ) -> grafeo_core::index::vector::VectorIndexKind {
264        use grafeo_core::index::vector::{
265            HnswConfig, HnswIndex, QuantizationType, QuantizedHnswIndex, VectorIndexKind,
266        };
267
268        let mut config = HnswConfig::new(dims, metric);
269        if let Some(m_val) = m {
270            config = config.with_m(m_val);
271        }
272        if let Some(ef_c) = ef_construction {
273            config = config.with_ef_construction(ef_c);
274        }
275
276        match quantization {
277            QuantizationType::None => {
278                VectorIndexKind::Hnsw(HnswIndex::with_capacity(config, capacity))
279            }
280            _ => VectorIndexKind::Quantized(QuantizedHnswIndex::new(config, quantization)),
281        }
282    }
283
284    /// Drops a vector index for the given label and property.
285    ///
286    /// Returns `true` if the index existed and was removed, `false` if no
287    /// index was found.
288    ///
289    /// After dropping, [`vector_search`](Self::vector_search) for this
290    /// label+property pair will return an error.
291    #[cfg(feature = "vector-index")]
292    pub fn drop_vector_index(&self, label: &str, property: &str) -> bool {
293        let removed = self.lpg_store().remove_vector_index(label, property);
294        if removed {
295            grafeo_info!("Vector index dropped: :{label}({property})");
296        }
297        removed
298    }
299
300    /// Drops and recreates a vector index, rescanning all matching nodes.
301    ///
302    /// In normal usage you do **not** need to call this. Vector indexes
303    /// auto-sync when nodes are created or updated via
304    /// [`set_node_property`](Self::set_node_property),
305    /// [`batch_create_nodes`](Self::batch_create_nodes), or
306    /// [`batch_create_nodes_with_props`](Self::batch_create_nodes_with_props).
307    ///
308    /// Use `rebuild_vector_index` only when:
309    /// - Data was loaded through non-standard paths (e.g., persistence
310    ///   restore or direct store manipulation) before the index existed.
311    /// - You want to compact the index after many deletions (HNSW does
312    ///   not reclaim deleted-node slots automatically).
313    /// - The index configuration needs to be refreshed after upgrading.
314    ///
315    /// When the index still exists, the previous configuration (dimensions,
316    /// metric, M, ef\_construction) is preserved. When it has already been
317    /// dropped, dimensions are inferred from existing data and default
318    /// parameters are used.
319    ///
320    /// # Errors
321    ///
322    /// Returns an error if the rebuild fails (e.g., no matching vectors found
323    /// and no dimensions can be inferred).
324    #[cfg(feature = "vector-index")]
325    pub fn rebuild_vector_index(&self, label: &str, property: &str) -> Result<()> {
326        // Preserve config and quantization type from existing index if available
327        let existing = self.lpg_store().get_vector_index(label, property);
328
329        let (config, quantization_name) = if let Some(ref idx) = existing {
330            let qt = match idx.quantization_type() {
331                Some(grafeo_core::index::vector::QuantizationType::Scalar) => Some("scalar"),
332                Some(grafeo_core::index::vector::QuantizationType::Binary) => Some("binary"),
333                Some(grafeo_core::index::vector::QuantizationType::Product { .. }) => {
334                    Some("product")
335                }
336                _ => None,
337            };
338            (Some(idx.config().clone()), qt)
339        } else {
340            (None, None)
341        };
342
343        self.lpg_store().remove_vector_index(label, property);
344
345        if let Some(config) = config {
346            self.create_vector_index(
347                label,
348                property,
349                Some(config.dimensions),
350                Some(config.metric.name()),
351                Some(config.m),
352                Some(config.ef_construction),
353                quantization_name,
354            )
355        } else {
356            // Index was already dropped: infer dimensions from data
357            self.create_vector_index(label, property, None, None, None, None, None)
358        }
359    }
360
361    // =========================================================================
362    // TEXT INDEX API
363    // =========================================================================
364
365    /// Creates a BM25 text index on a node property for full-text search.
366    ///
367    /// Indexes all existing nodes with the given label and property.
368    /// The index stays in sync automatically as nodes are created, updated,
369    /// or deleted. Use [`rebuild_text_index`](Self::rebuild_text_index) only
370    /// if the index was created before existing data was loaded.
371    ///
372    /// # Errors
373    ///
374    /// Returns an error if the label has no nodes or the property contains no text values.
375    #[cfg(feature = "text-index")]
376    pub fn create_text_index(&self, label: &str, property: &str) -> Result<()> {
377        use grafeo_common::types::{PropertyKey, Value};
378        use grafeo_core::index::text::{BM25Config, InvertedIndex};
379
380        let mut index = InvertedIndex::new(BM25Config::default());
381        let prop_key = PropertyKey::new(property);
382
383        // Index all existing nodes with this label + property
384        let nodes = self.lpg_store().nodes_by_label(label);
385        for node_id in nodes {
386            if let Some(Value::String(text)) =
387                self.lpg_store().get_node_property(node_id, &prop_key)
388            {
389                index.insert(node_id, text.as_str());
390            }
391        }
392
393        self.lpg_store()
394            .add_text_index(label, property, Arc::new(RwLock::new(index)));
395        Ok(())
396    }
397
398    /// Drops a text index on a label+property pair.
399    ///
400    /// Returns `true` if the index existed and was removed.
401    #[cfg(feature = "text-index")]
402    pub fn drop_text_index(&self, label: &str, property: &str) -> bool {
403        self.lpg_store().remove_text_index(label, property)
404    }
405
406    /// Rebuilds a text index by re-scanning all matching nodes.
407    ///
408    /// Use after bulk property updates to keep the index current.
409    ///
410    /// # Errors
411    ///
412    /// Returns an error if no text index exists for this label+property.
413    #[cfg(feature = "text-index")]
414    pub fn rebuild_text_index(&self, label: &str, property: &str) -> Result<()> {
415        self.lpg_store().remove_text_index(label, property);
416        self.create_text_index(label, property)
417    }
418}