Skip to main content

pulsedb/vector/
mod.rs

//! Vector index abstractions for semantic search.
//!
//! This module provides a trait-based abstraction over vector indexes,
//! allowing different ANN (Approximate Nearest Neighbor) backends.
//! The primary implementation uses [`hnsw_rs`] (pure Rust, ADR-005).
//!
//! # Architecture
//!
//! ```text
//! ┌──────────────────────────────────┐
//! │         VectorIndex trait         │
//! └──────────┬───────────────────────┘
//!            │
//!    ┌───────┴────────┐
//!    │   HnswIndex    │  (hnsw_rs wrapper)
//!    └────────────────┘
//! ```
//!
//! Embeddings stored in redb are the **source of truth**. The HNSW index
//! is a derived, rebuildable structure — if its files are missing or
//! corrupt, rebuild it from the stored embeddings.

23mod hnsw;
24
25pub use hnsw::HnswIndex;
26
27use std::path::Path;
28
29use crate::error::Result;
30
/// Abstraction over an approximate-nearest-neighbor (ANN) vector index.
///
/// Implementors must be `Send + Sync` (enforced as a supertrait bound) so
/// an index can be used inside `PulseDB`. IDs are `usize` to align with
/// hnsw_rs's `DataId` type.
///
/// Every method — including the mutating ones (`insert`, `insert_batch`,
/// `delete`) — takes `&self`. Implementations are expected to rely on
/// interior mutability, so searches can run concurrently while writes are
/// serialized internally.
pub trait VectorIndex: Send + Sync {
    /// Inserts a single vector under the given ID.
    ///
    /// # Errors
    ///
    /// Returns an error if the backend rejects the insertion; the exact
    /// conditions are implementation-defined.
    fn insert(&self, id: usize, embedding: &[f32]) -> Result<()>;

    /// Inserts a batch of `(embedding, id)` pairs.
    ///
    /// Intended to be more efficient than repeated [`insert`](Self::insert)
    /// calls for large batches, thanks to reduced locking overhead and
    /// parallel insertion.
    ///
    /// # Errors
    ///
    /// Returns an error if the backend rejects the batch; the exact
    /// conditions are implementation-defined.
    fn insert_batch(&self, items: &[(&Vec<f32>, usize)]) -> Result<()>;

    /// Searches for the `k` nearest neighbors of `query`.
    ///
    /// `ef_search` is the search-breadth knob forwarded to the backend
    /// (presumably the HNSW candidate-list size — confirm against the
    /// implementation).
    ///
    /// Returns `(id, distance)` pairs sorted by ascending distance
    /// (closest first). The metric is cosine distance:
    /// 0.0 = identical, 2.0 = opposite.
    ///
    /// # Errors
    ///
    /// Returns an error if the backend cannot perform the search; the exact
    /// conditions are implementation-defined.
    fn search(&self, query: &[f32], k: usize, ef_search: usize) -> Result<Vec<(usize, f32)>>;

    /// Searches with a filter predicate applied during traversal.
    ///
    /// Only points for which `filter(&id)` returns `true` are considered.
    /// This is filter-during-traversal, NOT post-filtering — critical for
    /// maintaining the result count when many points are filtered out.
    ///
    /// The predicate is passed as a `Sync` closure trait object taking the
    /// candidate ID by reference. Such closures satisfy `hnsw_rs::FilterT`
    /// through that crate's blanket impl, so callers never implement
    /// `FilterT` themselves.
    ///
    /// Parameters and return value otherwise match [`search`](Self::search).
    ///
    /// # Errors
    ///
    /// Returns an error if the backend cannot perform the search; the exact
    /// conditions are implementation-defined.
    fn search_filtered(
        &self,
        query: &[f32],
        k: usize,
        ef_search: usize,
        filter: &(dyn Fn(&usize) -> bool + Sync),
    ) -> Result<Vec<(usize, f32)>>;

    /// Marks an ID as deleted (soft-delete).
    ///
    /// The vector remains in the graph but is excluded from search results.
    /// HNSW graphs don't support point removal — removing nodes would break
    /// the proximity edges that other nodes rely on.
    ///
    /// # Errors
    ///
    /// Returns an error if the deletion cannot be recorded; the exact
    /// conditions are implementation-defined.
    fn delete(&self, id: usize) -> Result<()>;

    /// Returns `true` if the given ID is marked as deleted.
    fn is_deleted(&self, id: usize) -> bool;

    /// Returns the number of active (non-deleted) vectors.
    fn len(&self) -> usize;

    /// Returns `true` if the index has no active vectors.
    ///
    /// Default implementation: derived from [`len`](Self::len).
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Persists index metadata to disk.
    ///
    /// `dir` is the target directory and `name` identifies the index within
    /// it (presumably used as a file base name — confirm against the
    /// implementation).
    ///
    /// # Errors
    ///
    /// Returns an error if the data cannot be written; the exact conditions
    /// are implementation-defined.
    fn save(&self, dir: &Path, name: &str) -> Result<()>;
}