Skip to main content

ruvector_core/
integration.rs

1//! Cross-integration helpers for ruvnet crate ecosystem.
2//!
3//! This module provides ergonomic adapters that make it straightforward to use
4//! `ruvector-core` as a dependency from other ruvnet crates:
5//!
6//! - **ruv-FANN**: neural-network weights can be stored and retrieved via
7//!   [`FannAdapter`] using cosine similarity search across layer embeddings.
8//! - **sparc / semantic file search**: [`SemanticSearchAdapter`] wraps
9//!   [`VectorDB`] with file-path metadata so sparc can locate relevant source
10//!   files by embedding query strings.
11//!
//! Both adapters are thin wrappers around [`VectorDB`]: apart from the
//! database handle itself, the only extra state is the embedding dimension
//! that [`SemanticSearchAdapter`] keeps for validating its inputs.
14
15use crate::error::{Result, RuvectorError};
16use crate::types::{DbOptions, DistanceMetric, HnswConfig, SearchQuery, SearchResult, VectorEntry};
17use crate::vector_db::VectorDB;
18use std::collections::HashMap;
19
20// ── ruv-FANN integration ────────────────────────────────────────────────────
21
22/// Adapter that lets ruv-FANN store and retrieve layer-weight embeddings.
23///
24/// Each neural-network layer can be fingerprinted as a flat `f32` embedding
25/// (e.g. the flattened weight matrix or its PCA projection).  Storing these
26/// fingerprints in RuVector enables fast recall of "similar layers" across
27/// model checkpoints.
28///
29/// # Example
30/// ```no_run
31/// use ruvector_core::integration::FannAdapter;
32///
33/// let mut adapter = FannAdapter::new(128, "./fann_index.db").unwrap();
34/// adapter.store_layer("model_v1/layer_0", &[0.1f32; 128], None).unwrap();
35/// let similar = adapter.find_similar_layers(&[0.1f32; 128], 5).unwrap();
36/// ```
37pub struct FannAdapter {
38    db: VectorDB,
39}
40
41impl FannAdapter {
42    /// Create a new adapter backed by a RuVector database.
43    ///
44    /// `dimensions` must match the size of the layer embeddings you intend
45    /// to store.  Cosine distance is used because weight embeddings are
46    /// typically meaningful up to scale.
47    pub fn new(dimensions: usize, storage_path: impl Into<String>) -> Result<Self> {
48        let options = DbOptions {
49            dimensions,
50            distance_metric: DistanceMetric::Cosine,
51            storage_path: storage_path.into(),
52            hnsw_config: Some(HnswConfig {
53                m: 16,
54                ef_construction: 100,
55                ef_search: 100,
56                max_elements: 100_000,
57            }),
58            quantization: None,
59        };
60        Ok(Self {
61            db: VectorDB::new(options)?,
62        })
63    }
64
65    /// Store a layer embedding identified by `layer_id`.
66    ///
67    /// `metadata` can carry arbitrary JSON-serialisable key-value pairs
68    /// (e.g. model name, checkpoint step, layer type).
69    pub fn store_layer(
70        &self,
71        layer_id: impl Into<String>,
72        embedding: &[f32],
73        metadata: Option<HashMap<String, serde_json::Value>>,
74    ) -> Result<String> {
75        let id = layer_id.into();
76        self.db.insert(VectorEntry {
77            id: Some(id),
78            vector: embedding.to_vec(),
79            metadata,
80        })
81    }
82
83    /// Find the `k` most similar layer embeddings to `query`.
84    ///
85    /// Returns results sorted by ascending cosine distance.
86    pub fn find_similar_layers(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
87        self.db.search(SearchQuery {
88            vector: query.to_vec(),
89            k,
90            filter: None,
91            ef_search: None,
92        })
93    }
94
95    /// Find similar layers with a filter on metadata fields.
96    ///
97    /// Only results where every `(key, value)` in `filter` matches are returned.
98    pub fn find_similar_layers_filtered(
99        &self,
100        query: &[f32],
101        k: usize,
102        filter: HashMap<String, serde_json::Value>,
103    ) -> Result<Vec<SearchResult>> {
104        self.db.search(SearchQuery {
105            vector: query.to_vec(),
106            k,
107            filter: Some(filter),
108            ef_search: None,
109        })
110    }
111
112    /// Delete a layer embedding by ID.
113    pub fn delete_layer(&self, layer_id: &str) -> Result<bool> {
114        self.db.delete(layer_id)
115    }
116
117    /// Total number of stored layer embeddings.
118    pub fn len(&self) -> Result<usize> {
119        self.db.len()
120    }
121
122    /// Returns `true` if no embeddings have been stored yet.
123    pub fn is_empty(&self) -> Result<bool> {
124        self.db.is_empty()
125    }
126}
127
128// ── sparc / semantic file search integration ────────────────────────────────
129
/// A file-path entry as indexed by [`SemanticSearchAdapter`].
///
/// This is a plain data record, so it derives the comparison and hashing
/// traits in addition to `Debug`/`Clone`; entries can therefore be compared
/// in assertions, de-duplicated, or used as `HashSet`/`HashMap` keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FileEntry {
    /// Absolute or relative path to the source file.
    pub path: String,
    /// Brief human-readable description of the file's contents.
    pub description: String,
    /// The embedding dimension used to index this file.
    pub dimensions: usize,
}
140
141/// Adapter for sparc-style semantic file search.
142///
143/// sparc needs to locate relevant source files given a natural-language query
144/// string.  This adapter stores one embedding per file (derived externally,
145/// e.g. from an ONNX all-MiniLM model) and retrieves the closest matches
146/// using HNSW approximate nearest-neighbour search.
147///
148/// # Example
149/// ```no_run
150/// use ruvector_core::integration::SemanticSearchAdapter;
151///
152/// let mut adapter = SemanticSearchAdapter::new(384, "./sparc_index.db").unwrap();
153///
154/// // Index source files (embeddings produced by your embedding pipeline)
155/// adapter.index_file("src/auth/service.rs", "authentication service", &[0.0f32; 384]).unwrap();
156/// adapter.index_file("src/user/model.rs", "user data model", &[0.1f32; 384]).unwrap();
157///
158/// // Query with a natural-language description
159/// let results = adapter.search("jwt token validation", &[0.05f32; 384], 5).unwrap();
160/// for r in results {
161///     println!("  {} (score={:.4})", r.id, r.score);
162/// }
163/// ```
164pub struct SemanticSearchAdapter {
165    db: VectorDB,
166    dimensions: usize,
167}
168
169impl SemanticSearchAdapter {
170    /// Create a new adapter.
171    ///
172    /// `dimensions` is the embedding dimension of your model (e.g. 384 for
173    /// all-MiniLM-L6-v2, 768 for BERT-base).
174    pub fn new(dimensions: usize, storage_path: impl Into<String>) -> Result<Self> {
175        let options = DbOptions {
176            dimensions,
177            distance_metric: DistanceMetric::Cosine,
178            storage_path: storage_path.into(),
179            hnsw_config: Some(HnswConfig {
180                m: 16,
181                ef_construction: 100,
182                ef_search: 100,
183                max_elements: 500_000,
184            }),
185            quantization: None,
186        };
187        Ok(Self {
188            db: VectorDB::new(options)?,
189            dimensions,
190        })
191    }
192
193    /// Index a source file.
194    ///
195    /// The file `path` is used as the vector ID so look-ups are O(1).
196    /// `description` is stored in metadata for debugging / display.
197    /// `embedding` must have the same length as the adapter's `dimensions`.
198    pub fn index_file(
199        &self,
200        path: impl Into<String>,
201        description: impl Into<String>,
202        embedding: &[f32],
203    ) -> Result<String> {
204        let path_str = path.into();
205        if embedding.len() != self.dimensions {
206            return Err(RuvectorError::DimensionMismatch {
207                expected: self.dimensions,
208                actual: embedding.len(),
209            });
210        }
211
212        let mut metadata = HashMap::new();
213        metadata.insert(
214            "description".to_string(),
215            serde_json::Value::String(description.into()),
216        );
217        metadata.insert(
218            "path".to_string(),
219            serde_json::Value::String(path_str.clone()),
220        );
221
222        self.db.insert(VectorEntry {
223            id: Some(path_str),
224            vector: embedding.to_vec(),
225            metadata: Some(metadata),
226        })
227    }
228
229    /// Remove a previously indexed file.
230    pub fn remove_file(&self, path: &str) -> Result<bool> {
231        self.db.delete(path)
232    }
233
234    /// Search for source files semantically related to `query_embedding`.
235    ///
236    /// Returns up to `k` results sorted by ascending cosine distance
237    /// (most relevant first).  Each [`SearchResult`] has `.id` set to the
238    /// file path and `.metadata` containing the description.
239    pub fn search(
240        &self,
241        _query_text: &str,
242        query_embedding: &[f32],
243        k: usize,
244    ) -> Result<Vec<SearchResult>> {
245        if query_embedding.len() != self.dimensions {
246            return Err(RuvectorError::DimensionMismatch {
247                expected: self.dimensions,
248                actual: query_embedding.len(),
249            });
250        }
251        self.db.search(SearchQuery {
252            vector: query_embedding.to_vec(),
253            k,
254            filter: None,
255            ef_search: None,
256        })
257    }
258
259    /// Total number of indexed files.
260    pub fn len(&self) -> Result<usize> {
261        self.db.len()
262    }
263
264    /// Returns `true` if no files have been indexed yet.
265    pub fn is_empty(&self) -> Result<bool> {
266        self.db.is_empty()
267    }
268
269    /// List all indexed file paths.
270    pub fn list_files(&self) -> Result<Vec<String>> {
271        self.db.keys()
272    }
273}
274
275// ── Shared utility ──────────────────────────────────────────────────────────
276
/// Normalise a vector to unit length for cosine-distance workloads.
///
/// The squared norm is accumulated in `f64`, so vectors with very small
/// components (whose squares would underflow the old `f32::EPSILON`
/// threshold) and very large components (whose squares would overflow `f32`)
/// are still scaled correctly.
///
/// # Returns
/// A new vector pointing in the same direction as `v` with Euclidean norm 1.
/// The input is returned unchanged when no direction can be derived from it,
/// i.e. when `v` is empty, all-zero, or contains a non-finite component
/// (NaN / infinity).
#[inline]
pub fn normalize(v: &[f32]) -> Vec<f32> {
    // Every f32 square fits comfortably in f64, so this sum can neither
    // underflow to zero for a non-zero input nor overflow for a finite one.
    let norm = v
        .iter()
        .map(|&x| {
            let x = f64::from(x);
            x * x
        })
        .sum::<f64>()
        .sqrt();

    // Zero norm would divide by zero; NaN/inf norm would poison every element.
    if !(norm > 0.0 && norm.is_finite()) {
        return v.to_vec();
    }

    v.iter().map(|&x| (f64::from(x) / norm) as f32).collect()
}
290
/// Compute the cosine similarity in [−1, 1] between two vectors.
///
/// Both inputs are treated as raw (un-normalised) vectors.  The dot product
/// and both squared norms are accumulated in `f64`, which keeps the result
/// scale-invariant for very small and very large magnitudes, and the final
/// value is clamped so rounding can never push it outside [−1, 1].
///
/// # Returns
/// `0.0` when the similarity is undefined: either input is empty or
/// all-zero, or any component is NaN / infinite.
///
/// # Panics
/// In debug builds only, panics if `a` and `b` differ in length.  In release
/// builds the extra trailing elements of the longer slice are ignored.
#[inline]
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    debug_assert_eq!(a.len(), b.len(), "cosine_similarity: length mismatch");

    let (dot, norm_a_sq, norm_b_sq) = a.iter().zip(b.iter()).fold(
        (0.0f64, 0.0f64, 0.0f64),
        |(dot, na, nb), (&ai, &bi)| {
            let (ai, bi) = (f64::from(ai), f64::from(bi));
            (dot + ai * bi, na + ai * ai, nb + bi * bi)
        },
    );

    // Take the roots separately so the product stays within f64 range.
    let denom = norm_a_sq.sqrt() * norm_b_sq.sqrt();

    // `denom > 0.0` is false for NaN, so this also rejects NaN inputs.
    if denom > 0.0 && denom.is_finite() {
        (dot / denom).clamp(-1.0, 1.0) as f32
    } else {
        0.0
    }
}
311
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// A 3-4-5 vector must come out of `normalize` with unit length.
    #[test]
    fn test_normalize_unit_vector() {
        let normalized = normalize(&[3.0f32, 4.0]);
        let norm = normalized
            .iter()
            .fold(0.0f32, |acc, component| acc + component * component)
            .sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-6,
            "Expected unit norm, got {}",
            norm
        );
    }

    /// An all-zero vector has no direction and is handed back untouched.
    #[test]
    fn test_normalize_zero_vector() {
        let zeros = vec![0.0f32; 3];
        assert_eq!(
            normalize(&zeros),
            zeros,
            "Zero vector should be returned unchanged"
        );
    }

    /// A vector compared with itself has similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let sample = [1.0f32, 2.0, 3.0];
        let sim = cosine_similarity(&sample, &sample);
        assert!(
            (sim - 1.0).abs() < 1e-5,
            "Identical vectors: expected 1.0, got {}",
            sim
        );
    }

    /// Perpendicular axes have similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let x_axis = [1.0f32, 0.0];
        let y_axis = [0.0f32, 1.0];
        let sim = cosine_similarity(&x_axis, &y_axis);
        assert!(
            sim.abs() < 1e-5,
            "Orthogonal vectors: expected 0.0, got {}",
            sim
        );
    }

    /// Index three files, then check the closest hit for the first one.
    #[test]
    fn test_semantic_search_adapter_roundtrip() {
        let workdir = tempdir().unwrap();
        let db_path = workdir
            .path()
            .join("sparc.db")
            .to_string_lossy()
            .into_owned();
        let adapter = SemanticSearchAdapter::new(4, db_path).unwrap();

        // hnsw_rs requires at least 2 elements before searching.
        let fixtures = [
            ("src/auth.rs", "authentication", normalize(&[1.0, 0.0, 0.0, 0.0])),
            ("src/user.rs", "user model", normalize(&[0.0, 1.0, 0.0, 0.0])),
            ("src/storage.rs", "storage layer", normalize(&[0.0, 0.0, 1.0, 0.0])),
        ];
        for (file, summary, embedding) in fixtures.iter() {
            adapter.index_file(*file, *summary, embedding).unwrap();
        }

        assert_eq!(adapter.len().unwrap(), 3);

        // Querying with the first embedding should rank src/auth.rs on top.
        let hits = adapter.search("auth", &fixtures[0].2, 2).unwrap();
        assert!(!hits.is_empty());
        assert_eq!(hits[0].id, "src/auth.rs");
    }

    /// Store three layers, then check the closest hit for the first one.
    #[test]
    fn test_fann_adapter_store_and_retrieve() {
        let workdir = tempdir().unwrap();
        let db_path = workdir
            .path()
            .join("fann.db")
            .to_string_lossy()
            .into_owned();
        let adapter = FannAdapter::new(4, db_path).unwrap();

        // hnsw_rs requires at least 2 elements before searching.
        let layers = [
            ("model_v1/layer_0", normalize(&[1.0, 1.0, 0.0, 0.0])),
            ("model_v1/layer_1", normalize(&[0.0, 0.0, 1.0, 1.0])),
            ("model_v1/layer_2", normalize(&[1.0, 0.0, 1.0, 0.0])),
        ];
        for (layer_id, embedding) in layers.iter() {
            adapter.store_layer(*layer_id, embedding, None).unwrap();
        }

        let hits = adapter.find_similar_layers(&layers[0].1, 1).unwrap();
        assert!(!hits.is_empty());
        assert_eq!(hits[0].id, "model_v1/layer_0");
    }
}
413}