grafeo_engine/database/index.rs
1//! Index management for GrafeoDB (property, vector, and text indexes).
2
3use grafeo_common::grafeo_info;
4#[cfg(any(feature = "vector-index", feature = "text-index"))]
5use std::sync::Arc;
6
7#[cfg(feature = "text-index")]
8use parking_lot::RwLock;
9
10use grafeo_common::utils::error::Result;
11
12impl super::GrafeoDB {
13 // =========================================================================
14 // PROPERTY INDEX API
15 // =========================================================================
16
17 /// Creates an index on a node property for O(1) lookups by value.
18 ///
19 /// After creating an index, calls to [`Self::find_nodes_by_property`] will be
20 /// O(1) instead of O(n) for this property. The index is automatically
21 /// maintained when properties are set or removed.
22 ///
23 /// # Example
24 ///
25 /// ```no_run
26 /// # use grafeo_engine::GrafeoDB;
27 /// # use grafeo_common::types::Value;
28 /// # let db = GrafeoDB::new_in_memory();
29 /// // Create an index on the 'email' property
30 /// db.create_property_index("email");
31 ///
32 /// // Now lookups by email are O(1)
33 /// let nodes = db.find_nodes_by_property("email", &Value::from("alix@example.com"));
34 /// ```
35 pub fn create_property_index(&self, property: &str) {
36 self.lpg_store().create_property_index(property);
37 }
38
39 /// Drops an index on a node property.
40 ///
41 /// Returns `true` if the index existed and was removed.
42 pub fn drop_property_index(&self, property: &str) -> bool {
43 self.lpg_store().drop_property_index(property)
44 }
45
46 /// Returns `true` if the property has an index.
47 #[must_use]
48 pub fn has_property_index(&self, property: &str) -> bool {
49 self.lpg_store().has_property_index(property)
50 }
51
52 /// Finds all nodes that have a specific property value.
53 ///
54 /// If the property is indexed, this is O(1). Otherwise, it scans all nodes
55 /// which is O(n). Use [`Self::create_property_index`] for frequently queried properties.
56 ///
57 /// # Example
58 ///
59 /// ```no_run
60 /// # use grafeo_engine::GrafeoDB;
61 /// # use grafeo_common::types::Value;
62 /// # let db = GrafeoDB::new_in_memory();
63 /// // Create index for fast lookups (optional but recommended)
64 /// db.create_property_index("city");
65 ///
66 /// // Find all nodes where city = "NYC"
67 /// let nyc_nodes = db.find_nodes_by_property("city", &Value::from("NYC"));
68 /// ```
69 #[must_use]
70 pub fn find_nodes_by_property(
71 &self,
72 property: &str,
73 value: &grafeo_common::types::Value,
74 ) -> Vec<grafeo_common::types::NodeId> {
75 self.lpg_store().find_nodes_by_property(property, value)
76 }
77
78 // =========================================================================
79 // VECTOR INDEX API
80 // =========================================================================
81
82 /// Creates a vector similarity index on a node property.
83 ///
84 /// This enables efficient approximate nearest-neighbor search on vector
85 /// properties. Currently validates the index parameters and scans existing
86 /// nodes to verify the property contains vectors of the expected dimensions.
87 ///
88 /// # Arguments
89 ///
90 /// * `label` - Node label to index (e.g., `"Doc"`)
91 /// * `property` - Property containing vector embeddings (e.g., `"embedding"`)
92 /// * `dimensions` - Expected vector dimensions (inferred from data if `None`)
93 /// * `metric` - Distance metric: `"cosine"` (default), `"euclidean"`, `"dot_product"`, `"manhattan"`
94 /// * `m` - HNSW links per node (default: 16). Higher = better recall, more memory.
95 /// * `ef_construction` - Construction beam width (default: 128). Higher = better index quality, slower build.
96 /// * `quantization` - Quantization mode: `None` (default), `"scalar"`, `"binary"`, or `"product"`.
97 /// Quantized indexes use less memory at the cost of slightly lower recall.
98 ///
99 /// # Errors
100 ///
101 /// Returns an error if the metric is invalid, no vectors are found, or
102 /// dimensions don't match.
103 #[allow(clippy::too_many_arguments)]
104 pub fn create_vector_index(
105 &self,
106 label: &str,
107 property: &str,
108 dimensions: Option<usize>,
109 metric: Option<&str>,
110 m: Option<usize>,
111 ef_construction: Option<usize>,
112 quantization: Option<&str>,
113 ) -> Result<()> {
114 use grafeo_common::types::{PropertyKey, Value};
115 use grafeo_core::index::vector::DistanceMetric;
116
117 let metric = match metric {
118 Some(m) => DistanceMetric::from_str(m).ok_or_else(|| {
119 grafeo_common::utils::error::Error::Internal(format!(
120 "Unknown distance metric '{}'. Use: cosine, euclidean, dot_product, manhattan",
121 m
122 ))
123 })?,
124 None => DistanceMetric::Cosine,
125 };
126
127 #[cfg(feature = "vector-index")]
128 let quantization_type = Self::parse_quantization(quantization)?;
129 #[cfg(not(feature = "vector-index"))]
130 let _ = quantization;
131
132 // Scan nodes to validate vectors exist and check dimensions
133 let prop_key = PropertyKey::new(property);
134 let mut found_dims: Option<usize> = dimensions;
135 let mut vector_count = 0usize;
136
137 #[cfg(feature = "vector-index")]
138 let mut vectors: Vec<(grafeo_common::types::NodeId, Vec<f32>)> = Vec::new();
139
140 let graph = self.graph_store();
141 for node_id in graph.nodes_by_label(label) {
142 if let Some(Value::Vector(v)) = graph.get_node_property(node_id, &prop_key) {
143 if let Some(expected) = found_dims {
144 if v.len() != expected {
145 return Err(grafeo_common::utils::error::Error::Internal(format!(
146 "Vector dimension mismatch: expected {}, found {} on node {}",
147 expected,
148 v.len(),
149 node_id.0
150 )));
151 }
152 } else {
153 found_dims = Some(v.len());
154 }
155 vector_count += 1;
156 #[cfg(feature = "vector-index")]
157 vectors.push((node_id, v.to_vec()));
158 }
159 }
160
161 let Some(dims) = found_dims else {
162 // No vectors found yet: caller must have supplied explicit dimensions
163 // so we can create an empty index that auto-populates via set_node_property.
164 return if let Some(d) = dimensions {
165 #[cfg(feature = "vector-index")]
166 {
167 let index = Self::build_vector_index(
168 d,
169 metric,
170 m,
171 ef_construction,
172 quantization_type,
173 0,
174 );
175 self.lpg_store()
176 .add_vector_index(label, property, Arc::new(index));
177 }
178
179 let _ = (m, ef_construction);
180 grafeo_info!(
181 "Empty vector index created: :{label}({property}) - 0 vectors, {d} dimensions, metric={metric_name}",
182 metric_name = metric.name()
183 );
184 Ok(())
185 } else {
186 Err(grafeo_common::utils::error::Error::Internal(format!(
187 "No vector properties found on :{label}({property}) and no dimensions specified"
188 )))
189 };
190 };
191
192 // Build and populate the vector index
193 #[cfg(feature = "vector-index")]
194 {
195 use grafeo_core::index::vector::VectorIndexKind;
196
197 let index = Self::build_vector_index(
198 dims,
199 metric,
200 m,
201 ef_construction,
202 quantization_type,
203 vectors.len(),
204 );
205
206 match &index {
207 VectorIndexKind::Hnsw(_) => {
208 let graph = self.graph_store();
209 let accessor =
210 grafeo_core::index::vector::PropertyVectorAccessor::new(&*graph, property);
211 for (node_id, vec) in &vectors {
212 index.insert(*node_id, vec, &accessor);
213 }
214 }
215 VectorIndexKind::Quantized(q_idx) => {
216 for (node_id, vec) in &vectors {
217 q_idx.insert(*node_id, vec);
218 }
219 }
220 }
221
222 self.lpg_store()
223 .add_vector_index(label, property, Arc::new(index));
224 }
225
226 // Suppress unused variable warnings when vector-index is off
227 let _ = (m, ef_construction);
228
229 grafeo_info!(
230 "Vector index created: :{label}({property}) - {vector_count} vectors, {dims} dimensions, metric={metric_name}",
231 metric_name = metric.name()
232 );
233
234 Ok(())
235 }
236
/// Translates an optional quantization name into a `QuantizationType`.
///
/// A missing name and the literal `"none"` both select the unquantized
/// variant; `"product"` is created with 8 sub-vectors.
///
/// # Errors
///
/// Returns an error for any name other than `"none"`, `"scalar"`,
/// `"binary"`, or `"product"`.
#[cfg(feature = "vector-index")]
fn parse_quantization(
    quantization: Option<&str>,
) -> Result<grafeo_core::index::vector::QuantizationType> {
    use grafeo_core::index::vector::QuantizationType;

    let Some(name) = quantization else {
        return Ok(QuantizationType::None);
    };

    let parsed = match name {
        "none" => QuantizationType::None,
        "scalar" => QuantizationType::Scalar,
        "binary" => QuantizationType::Binary,
        "product" => QuantizationType::Product { num_subvectors: 8 },
        other => {
            return Err(grafeo_common::utils::error::Error::Internal(format!(
                "Unknown quantization type '{other}'. Use: scalar, binary, product"
            )));
        }
    };
    Ok(parsed)
}
253
/// Constructs an empty `VectorIndexKind` for the given settings.
///
/// `m` and `ef_construction` override the corresponding `HnswConfig`
/// values only when they are `Some`. An unquantized request yields a plain
/// HNSW index pre-sized with `capacity`; any other quantization mode yields
/// a quantized HNSW index (for which `capacity` is not used).
#[cfg(feature = "vector-index")]
fn build_vector_index(
    dims: usize,
    metric: grafeo_core::index::vector::DistanceMetric,
    m: Option<usize>,
    ef_construction: Option<usize>,
    quantization: grafeo_core::index::vector::QuantizationType,
    capacity: usize,
) -> grafeo_core::index::vector::VectorIndexKind {
    use grafeo_core::index::vector::{
        HnswConfig, HnswIndex, QuantizationType, QuantizedHnswIndex, VectorIndexKind,
    };

    // Start from the defaults and layer the optional overrides on top.
    let defaults = HnswConfig::new(dims, metric);
    let with_links = match m {
        Some(links) => defaults.with_m(links),
        None => defaults,
    };
    let config = match ef_construction {
        Some(beam_width) => with_links.with_ef_construction(beam_width),
        None => with_links,
    };

    if matches!(quantization, QuantizationType::None) {
        return VectorIndexKind::Hnsw(HnswIndex::with_capacity(config, capacity));
    }
    VectorIndexKind::Quantized(QuantizedHnswIndex::new(config, quantization))
}
283
/// Removes the vector index registered for `label` + `property`.
///
/// Returns `true` when an index existed and was removed, and `false` when
/// there was nothing to remove. A successful removal is logged.
///
/// After dropping, [`vector_search`](Self::vector_search) for this
/// label+property pair will return an error.
#[cfg(feature = "vector-index")]
pub fn drop_vector_index(&self, label: &str, property: &str) -> bool {
    if !self.lpg_store().remove_vector_index(label, property) {
        return false;
    }
    grafeo_info!("Vector index dropped: :{label}({property})");
    true
}
299
/// Drops a vector index and builds it again from the nodes currently stored.
///
/// In normal usage you do **not** need to call this. Vector indexes
/// auto-sync when nodes are created or updated via
/// [`set_node_property`](Self::set_node_property),
/// [`batch_create_nodes`](Self::batch_create_nodes), or
/// [`batch_create_nodes_with_props`](Self::batch_create_nodes_with_props).
///
/// Use `rebuild_vector_index` only when:
/// - Data was loaded through non-standard paths (e.g., persistence
///   restore or direct store manipulation) before the index existed.
/// - You want to compact the index after many deletions (HNSW does
///   not reclaim deleted-node slots automatically).
/// - The index configuration needs to be refreshed after upgrading.
///
/// If the index still exists, its dimensions, metric, M, ef\_construction
/// and quantization mode are read from it and reused for the new index.
/// If it has already been dropped, dimensions are inferred from existing
/// data and default parameters are used.
///
/// # Errors
///
/// Returns whatever error [`Self::create_vector_index`] produces, e.g. when
/// no matching vectors are found and no dimensions can be inferred.
#[cfg(feature = "vector-index")]
pub fn rebuild_vector_index(&self, label: &str, property: &str) -> Result<()> {
    use grafeo_core::index::vector::QuantizationType;

    // Capture the current index's settings (if any) before it is removed.
    let existing = self.lpg_store().get_vector_index(label, property);
    let preserved = existing.as_ref().map(|idx| {
        let quantization_name = match idx.quantization_type() {
            Some(QuantizationType::Scalar) => Some("scalar"),
            Some(QuantizationType::Binary) => Some("binary"),
            Some(QuantizationType::Product { .. }) => Some("product"),
            _ => None,
        };
        (idx.config().clone(), quantization_name)
    });

    self.lpg_store().remove_vector_index(label, property);

    match preserved {
        Some((config, quantization_name)) => self.create_vector_index(
            label,
            property,
            Some(config.dimensions),
            Some(config.metric.name()),
            Some(config.m),
            Some(config.ef_construction),
            quantization_name,
        ),
        // Index was already dropped: infer dimensions from data
        None => self.create_vector_index(label, property, None, None, None, None, None),
    }
}
360
361 // =========================================================================
362 // TEXT INDEX API
363 // =========================================================================
364
/// Creates a BM25 text index on a node property for full-text search.
///
/// Indexes all existing nodes with the given label whose `property` holds a
/// string value; nodes without the property, or with a non-string value,
/// are skipped. The resulting index is registered with the store, even if
/// it ends up empty.
///
/// The index stays in sync automatically as nodes are created, updated,
/// or deleted. Use [`rebuild_text_index`](Self::rebuild_text_index) only
/// if the index was created before existing data was loaded.
///
/// # Errors
///
/// This method currently always returns `Ok(())`. A label without nodes, or
/// a property without any string values, yields an empty index rather than
/// an error. The `Result` return type is kept for API compatibility.
#[cfg(feature = "text-index")]
pub fn create_text_index(&self, label: &str, property: &str) -> Result<()> {
    use grafeo_common::types::{PropertyKey, Value};
    use grafeo_core::index::text::{BM25Config, InvertedIndex};

    let mut index = InvertedIndex::new(BM25Config::default());
    let prop_key = PropertyKey::new(property);

    // Index all existing nodes with this label + property
    let graph = self.graph_store();
    for node_id in graph.nodes_by_label(label) {
        if let Some(Value::String(text)) = graph.get_node_property(node_id, &prop_key) {
            index.insert(node_id, text.as_str());
        }
    }

    self.lpg_store()
        .add_text_index(label, property, Arc::new(RwLock::new(index)));
    Ok(())
}
396
/// Removes the text index registered for a label+property pair.
///
/// Returns `true` when an index existed and was removed.
#[cfg(feature = "text-index")]
pub fn drop_text_index(&self, label: &str, property: &str) -> bool {
    let store = self.lpg_store();
    store.remove_text_index(label, property)
}
404
/// Rebuilds a text index by re-scanning all matching nodes.
///
/// Any existing index for the label+property pair is removed and a fresh
/// one is built with [`create_text_index`](Self::create_text_index). If no
/// index existed beforehand, one is created anyway.
///
/// Intended for situations where the index may be out of date with the
/// stored data.
///
/// # Errors
///
/// Propagates any error returned by
/// [`create_text_index`](Self::create_text_index). A missing index is
/// **not** treated as an error.
#[cfg(feature = "text-index")]
pub fn rebuild_text_index(&self, label: &str, property: &str) -> Result<()> {
    // The removal result is intentionally ignored: whether or not an index
    // existed, the outcome is a freshly built one.
    let _ = self.lpg_store().remove_text_index(label, property);
    self.create_text_index(label, property)
}
417}