Skip to main content

grafeo_engine/database/
index.rs

1//! Index management for GrafeoDB (property, vector, and text indexes).
2
3use grafeo_common::grafeo_info;
4#[cfg(any(feature = "vector-index", feature = "text-index"))]
5use std::sync::Arc;
6
7#[cfg(feature = "text-index")]
8use parking_lot::RwLock;
9
10use grafeo_common::utils::error::Result;
11
12impl super::GrafeoDB {
13    // =========================================================================
14    // PROPERTY INDEX API
15    // =========================================================================
16
17    /// Creates an index on a node property for O(1) lookups by value.
18    ///
19    /// After creating an index, calls to [`Self::find_nodes_by_property`] will be
20    /// O(1) instead of O(n) for this property. The index is automatically
21    /// maintained when properties are set or removed.
22    ///
23    /// # Example
24    ///
25    /// ```no_run
26    /// # use grafeo_engine::GrafeoDB;
27    /// # use grafeo_common::types::Value;
28    /// # let db = GrafeoDB::new_in_memory();
29    /// // Create an index on the 'email' property
30    /// db.create_property_index("email");
31    ///
32    /// // Now lookups by email are O(1)
33    /// let nodes = db.find_nodes_by_property("email", &Value::from("alix@example.com"));
34    /// ```
35    pub fn create_property_index(&self, property: &str) {
36        self.lpg_store().create_property_index(property);
37    }
38
39    /// Drops an index on a node property.
40    ///
41    /// Returns `true` if the index existed and was removed.
42    pub fn drop_property_index(&self, property: &str) -> bool {
43        self.lpg_store().drop_property_index(property)
44    }
45
46    /// Returns `true` if the property has an index.
47    #[must_use]
48    pub fn has_property_index(&self, property: &str) -> bool {
49        self.lpg_store().has_property_index(property)
50    }
51
52    /// Finds all nodes that have a specific property value.
53    ///
54    /// If the property is indexed, this is O(1). Otherwise, it scans all nodes
55    /// which is O(n). Use [`Self::create_property_index`] for frequently queried properties.
56    ///
57    /// # Example
58    ///
59    /// ```no_run
60    /// # use grafeo_engine::GrafeoDB;
61    /// # use grafeo_common::types::Value;
62    /// # let db = GrafeoDB::new_in_memory();
63    /// // Create index for fast lookups (optional but recommended)
64    /// db.create_property_index("city");
65    ///
66    /// // Find all nodes where city = "NYC"
67    /// let nyc_nodes = db.find_nodes_by_property("city", &Value::from("NYC"));
68    /// ```
69    #[must_use]
70    pub fn find_nodes_by_property(
71        &self,
72        property: &str,
73        value: &grafeo_common::types::Value,
74    ) -> Vec<grafeo_common::types::NodeId> {
75        self.lpg_store().find_nodes_by_property(property, value)
76    }
77
78    // =========================================================================
79    // VECTOR INDEX API
80    // =========================================================================
81
82    /// Creates a vector similarity index on a node property.
83    ///
84    /// This enables efficient approximate nearest-neighbor search on vector
85    /// properties. Currently validates the index parameters and scans existing
86    /// nodes to verify the property contains vectors of the expected dimensions.
87    ///
88    /// # Arguments
89    ///
90    /// * `label` - Node label to index (e.g., `"Doc"`)
91    /// * `property` - Property containing vector embeddings (e.g., `"embedding"`)
92    /// * `dimensions` - Expected vector dimensions (inferred from data if `None`)
93    /// * `metric` - Distance metric: `"cosine"` (default), `"euclidean"`, `"dot_product"`, `"manhattan"`
94    /// * `m` - HNSW links per node (default: 16). Higher = better recall, more memory.
95    /// * `ef_construction` - Construction beam width (default: 128). Higher = better index quality, slower build.
96    /// * `quantization` - Quantization mode: `None` (default), `"scalar"`, `"binary"`, or `"product"`.
97    ///   Quantized indexes use less memory at the cost of slightly lower recall.
98    ///
99    /// # Errors
100    ///
101    /// Returns an error if the metric is invalid, no vectors are found, or
102    /// dimensions don't match.
103    #[allow(clippy::too_many_arguments)]
104    pub fn create_vector_index(
105        &self,
106        label: &str,
107        property: &str,
108        dimensions: Option<usize>,
109        metric: Option<&str>,
110        m: Option<usize>,
111        ef_construction: Option<usize>,
112        quantization: Option<&str>,
113    ) -> Result<()> {
114        use grafeo_common::types::{PropertyKey, Value};
115        use grafeo_core::index::vector::DistanceMetric;
116
117        let metric = match metric {
118            Some(m) => DistanceMetric::from_str(m).ok_or_else(|| {
119                grafeo_common::utils::error::Error::Internal(format!(
120                    "Unknown distance metric '{}'. Use: cosine, euclidean, dot_product, manhattan",
121                    m
122                ))
123            })?,
124            None => DistanceMetric::Cosine,
125        };
126
127        #[cfg(feature = "vector-index")]
128        let quantization_type = Self::parse_quantization(quantization)?;
129        #[cfg(not(feature = "vector-index"))]
130        let _ = quantization;
131
132        // Scan nodes to validate vectors exist and check dimensions
133        let prop_key = PropertyKey::new(property);
134        let mut found_dims: Option<usize> = dimensions;
135        let mut vector_count = 0usize;
136
137        #[cfg(feature = "vector-index")]
138        let mut vectors: Vec<(grafeo_common::types::NodeId, Vec<f32>)> = Vec::new();
139
140        let graph = self.graph_store();
141        for node_id in graph.nodes_by_label(label) {
142            if let Some(Value::Vector(v)) = graph.get_node_property(node_id, &prop_key) {
143                if let Some(expected) = found_dims {
144                    if v.len() != expected {
145                        return Err(grafeo_common::utils::error::Error::Internal(format!(
146                            "Vector dimension mismatch: expected {}, found {} on node {}",
147                            expected,
148                            v.len(),
149                            node_id.0
150                        )));
151                    }
152                } else {
153                    found_dims = Some(v.len());
154                }
155                vector_count += 1;
156                #[cfg(feature = "vector-index")]
157                vectors.push((node_id, v.to_vec()));
158            }
159        }
160
161        let Some(dims) = found_dims else {
162            // No vectors found yet: caller must have supplied explicit dimensions
163            // so we can create an empty index that auto-populates via set_node_property.
164            return if let Some(d) = dimensions {
165                #[cfg(feature = "vector-index")]
166                {
167                    let index = Self::build_vector_index(
168                        d,
169                        metric,
170                        m,
171                        ef_construction,
172                        quantization_type,
173                        0,
174                    );
175                    self.lpg_store()
176                        .add_vector_index(label, property, Arc::new(index));
177                }
178
179                let _ = (m, ef_construction);
180                grafeo_info!(
181                    "Empty vector index created: :{label}({property}) - 0 vectors, {d} dimensions, metric={metric_name}",
182                    metric_name = metric.name()
183                );
184                Ok(())
185            } else {
186                Err(grafeo_common::utils::error::Error::Internal(format!(
187                    "No vector properties found on :{label}({property}) and no dimensions specified"
188                )))
189            };
190        };
191
192        // Build and populate the vector index
193        #[cfg(feature = "vector-index")]
194        {
195            use grafeo_core::index::vector::VectorIndexKind;
196
197            let index = Self::build_vector_index(
198                dims,
199                metric,
200                m,
201                ef_construction,
202                quantization_type,
203                vectors.len(),
204            );
205
206            match &index {
207                VectorIndexKind::Hnsw(_) => {
208                    let graph = self.graph_store();
209                    let accessor =
210                        grafeo_core::index::vector::PropertyVectorAccessor::new(&*graph, property);
211                    for (node_id, vec) in &vectors {
212                        index.insert(*node_id, vec, &accessor);
213                    }
214                }
215                VectorIndexKind::Quantized(q_idx) => {
216                    for (node_id, vec) in &vectors {
217                        q_idx.insert(*node_id, vec);
218                    }
219                }
220            }
221
222            self.lpg_store()
223                .add_vector_index(label, property, Arc::new(index));
224        }
225
226        // Suppress unused variable warnings when vector-index is off
227        let _ = (m, ef_construction);
228
229        grafeo_info!(
230            "Vector index created: :{label}({property}) - {vector_count} vectors, {dims} dimensions, metric={metric_name}",
231            metric_name = metric.name()
232        );
233
234        Ok(())
235    }
236
237    /// Parses a quantization string into a [`QuantizationType`].
238    #[cfg(feature = "vector-index")]
239    fn parse_quantization(
240        quantization: Option<&str>,
241    ) -> Result<grafeo_core::index::vector::QuantizationType> {
242        use grafeo_core::index::vector::QuantizationType;
243        match quantization {
244            None | Some("none") => Ok(QuantizationType::None),
245            Some("scalar") => Ok(QuantizationType::Scalar),
246            Some("binary") => Ok(QuantizationType::Binary),
247            Some("product") => Ok(QuantizationType::Product { num_subvectors: 8 }),
248            Some(other) => Err(grafeo_common::utils::error::Error::Internal(format!(
249                "Unknown quantization type '{other}'. Use: scalar, binary, product"
250            ))),
251        }
252    }
253
254    /// Builds a [`VectorIndexKind`] from the given parameters.
255    #[cfg(feature = "vector-index")]
256    fn build_vector_index(
257        dims: usize,
258        metric: grafeo_core::index::vector::DistanceMetric,
259        m: Option<usize>,
260        ef_construction: Option<usize>,
261        quantization: grafeo_core::index::vector::QuantizationType,
262        capacity: usize,
263    ) -> grafeo_core::index::vector::VectorIndexKind {
264        use grafeo_core::index::vector::{
265            HnswConfig, HnswIndex, QuantizationType, QuantizedHnswIndex, VectorIndexKind,
266        };
267
268        let mut config = HnswConfig::new(dims, metric);
269        if let Some(m_val) = m {
270            config = config.with_m(m_val);
271        }
272        if let Some(ef_c) = ef_construction {
273            config = config.with_ef_construction(ef_c);
274        }
275
276        match quantization {
277            QuantizationType::None => {
278                VectorIndexKind::Hnsw(HnswIndex::with_capacity(config, capacity))
279            }
280            _ => VectorIndexKind::Quantized(QuantizedHnswIndex::new(config, quantization)),
281        }
282    }
283
284    /// Drops a vector index for the given label and property.
285    ///
286    /// Returns `true` if the index existed and was removed, `false` if no
287    /// index was found.
288    ///
289    /// After dropping, [`vector_search`](Self::vector_search) for this
290    /// label+property pair will return an error.
291    #[cfg(feature = "vector-index")]
292    pub fn drop_vector_index(&self, label: &str, property: &str) -> bool {
293        let removed = self.lpg_store().remove_vector_index(label, property);
294        if removed {
295            grafeo_info!("Vector index dropped: :{label}({property})");
296        }
297        removed
298    }
299
300    /// Drops and recreates a vector index, rescanning all matching nodes.
301    ///
302    /// In normal usage you do **not** need to call this. Vector indexes
303    /// auto-sync when nodes are created or updated via
304    /// [`set_node_property`](Self::set_node_property),
305    /// [`batch_create_nodes`](Self::batch_create_nodes), or
306    /// [`batch_create_nodes_with_props`](Self::batch_create_nodes_with_props).
307    ///
308    /// Use `rebuild_vector_index` only when:
309    /// - Data was loaded through non-standard paths (e.g., persistence
310    ///   restore or direct store manipulation) before the index existed.
311    /// - You want to compact the index after many deletions (HNSW does
312    ///   not reclaim deleted-node slots automatically).
313    /// - The index configuration needs to be refreshed after upgrading.
314    ///
315    /// When the index still exists, the previous configuration (dimensions,
316    /// metric, M, ef\_construction) is preserved. When it has already been
317    /// dropped, dimensions are inferred from existing data and default
318    /// parameters are used.
319    ///
320    /// # Errors
321    ///
322    /// Returns an error if the rebuild fails (e.g., no matching vectors found
323    /// and no dimensions can be inferred).
324    #[cfg(feature = "vector-index")]
325    pub fn rebuild_vector_index(&self, label: &str, property: &str) -> Result<()> {
326        // Preserve config and quantization type from existing index if available
327        let existing = self.lpg_store().get_vector_index(label, property);
328
329        let (config, quantization_name) = if let Some(ref idx) = existing {
330            let qt = match idx.quantization_type() {
331                Some(grafeo_core::index::vector::QuantizationType::Scalar) => Some("scalar"),
332                Some(grafeo_core::index::vector::QuantizationType::Binary) => Some("binary"),
333                Some(grafeo_core::index::vector::QuantizationType::Product { .. }) => {
334                    Some("product")
335                }
336                _ => None,
337            };
338            (Some(idx.config().clone()), qt)
339        } else {
340            (None, None)
341        };
342
343        self.lpg_store().remove_vector_index(label, property);
344
345        if let Some(config) = config {
346            self.create_vector_index(
347                label,
348                property,
349                Some(config.dimensions),
350                Some(config.metric.name()),
351                Some(config.m),
352                Some(config.ef_construction),
353                quantization_name,
354            )
355        } else {
356            // Index was already dropped: infer dimensions from data
357            self.create_vector_index(label, property, None, None, None, None, None)
358        }
359    }
360
361    // =========================================================================
362    // TEXT INDEX API
363    // =========================================================================
364
365    /// Creates a BM25 text index on a node property for full-text search.
366    ///
367    /// Indexes all existing nodes with the given label and property.
368    /// The index stays in sync automatically as nodes are created, updated,
369    /// or deleted. Use [`rebuild_text_index`](Self::rebuild_text_index) only
370    /// if the index was created before existing data was loaded.
371    ///
372    /// # Errors
373    ///
374    /// Returns an error if the label has no nodes or the property contains no text values.
375    #[cfg(feature = "text-index")]
376    pub fn create_text_index(&self, label: &str, property: &str) -> Result<()> {
377        use grafeo_common::types::{PropertyKey, Value};
378        use grafeo_core::index::text::{BM25Config, InvertedIndex};
379
380        let mut index = InvertedIndex::new(BM25Config::default());
381        let prop_key = PropertyKey::new(property);
382
383        // Index all existing nodes with this label + property
384        let graph = self.graph_store();
385        let nodes = graph.nodes_by_label(label);
386        for node_id in nodes {
387            if let Some(Value::String(text)) = graph.get_node_property(node_id, &prop_key) {
388                index.insert(node_id, text.as_str());
389            }
390        }
391
392        self.lpg_store()
393            .add_text_index(label, property, Arc::new(RwLock::new(index)));
394        Ok(())
395    }
396
397    /// Drops a text index on a label+property pair.
398    ///
399    /// Returns `true` if the index existed and was removed.
400    #[cfg(feature = "text-index")]
401    pub fn drop_text_index(&self, label: &str, property: &str) -> bool {
402        self.lpg_store().remove_text_index(label, property)
403    }
404
405    /// Rebuilds a text index by re-scanning all matching nodes.
406    ///
407    /// Use after bulk property updates to keep the index current.
408    ///
409    /// # Errors
410    ///
411    /// Returns an error if no text index exists for this label+property.
412    #[cfg(feature = "text-index")]
413    pub fn rebuild_text_index(&self, label: &str, property: &str) -> Result<()> {
414        self.lpg_store().remove_text_index(label, property);
415        self.create_text_index(label, property)
416    }
417}