pulsedb/vector/mod.rs
//! Vector index abstractions for semantic search.
//!
//! This module provides a trait-based abstraction over vector indexes,
//! allowing different ANN (Approximate Nearest Neighbor) backends.
//! The primary implementation uses [`hnsw_rs`] (pure Rust, ADR-005).
//!
//! # Architecture
//!
//! ```text
//! ┌──────────────────────────────────┐
//! │        VectorIndex trait         │
//! └──────────┬───────────────────────┘
//!            │
//!    ┌───────┴────────┐
//!    │   HnswIndex    │  (hnsw_rs wrapper)
//!    └────────────────┘
//! ```
//!
//! Embeddings stored in redb are the **source of truth**. The HNSW index
//! is a derived, rebuildable structure — if files are missing or corrupt,
//! rebuild from stored embeddings.
22
23mod hnsw;
24
25pub use hnsw::HnswIndex;
26
27use std::path::Path;
28
29use crate::error::Result;
30
31/// Vector index trait for approximate nearest neighbor search.
32///
33/// Implementations must be `Send + Sync` for use inside `PulseDB`.
34/// IDs are `usize` to align with hnsw_rs's `DataId` type.
35///
36/// All mutating methods (`insert`, `delete`) take `&self` and use
37/// interior mutability. This enables concurrent reads during search
38/// while writes are serialized internally.
39pub trait VectorIndex: Send + Sync {
40 /// Inserts a single vector with the given ID.
41 fn insert(&self, id: usize, embedding: &[f32]) -> Result<()>;
42
43 /// Inserts a batch of vectors.
44 ///
45 /// More efficient than individual inserts for large batches
46 /// due to reduced locking overhead and parallel insertion.
47 fn insert_batch(&self, items: &[(&Vec<f32>, usize)]) -> Result<()>;
48
49 /// Searches for the k nearest neighbors to the query vector.
50 ///
51 /// Returns `(id, distance)` pairs sorted by distance ascending
52 /// (closest first). Distance metric is cosine distance:
53 /// 0.0 = identical, 2.0 = opposite.
54 fn search(&self, query: &[f32], k: usize, ef_search: usize) -> Result<Vec<(usize, f32)>>;
55
56 /// Searches with a filter predicate applied during traversal.
57 ///
58 /// Only points where `filter(id)` returns `true` are considered.
59 /// This is filter-during-traversal, NOT post-filtering — critical
60 /// for maintaining result count when many points are filtered.
61 ///
62 /// The filter must implement `hnsw_rs::FilterT` (closures do
63 /// automatically via blanket impl).
64 fn search_filtered(
65 &self,
66 query: &[f32],
67 k: usize,
68 ef_search: usize,
69 filter: &(dyn Fn(&usize) -> bool + Sync),
70 ) -> Result<Vec<(usize, f32)>>;
71
72 /// Marks an ID as deleted (soft-delete).
73 ///
74 /// The vector remains in the graph but is excluded from search
75 /// results. HNSW graphs don't support point removal — removing
76 /// nodes breaks proximity edges that other nodes rely on.
77 fn delete(&self, id: usize) -> Result<()>;
78
79 /// Returns true if the given ID is marked as deleted.
80 fn is_deleted(&self, id: usize) -> bool;
81
82 /// Returns the number of active (non-deleted) vectors.
83 fn len(&self) -> usize;
84
85 /// Returns true if the index has no active vectors.
86 fn is_empty(&self) -> bool {
87 self.len() == 0
88 }
89
90 /// Persists index metadata to disk.
91 fn save(&self, dir: &Path, name: &str) -> Result<()>;
92}