velesdb_wasm/
lib.rs

1//! `VelesDB` WASM - Vector search in the browser
2//!
3//! This crate provides WebAssembly bindings for `VelesDB`'s core vector operations.
4//! It enables browser-based vector search without any server dependency.
5//!
6//! # Features
7//!
8//! - **In-memory vector store**: Fast vector storage and retrieval
9//! - **SIMD-optimized**: Uses WASM SIMD128 for distance calculations
10//! - **Multiple metrics**: Cosine, Euclidean, Dot Product
11//! - **Half-precision**: f16/bf16 support for 50% memory reduction
12//!
13//! # Usage (JavaScript)
14//!
15//! ```javascript
16//! import init, { VectorStore } from 'velesdb-wasm';
17//!
18//! await init();
19//!
20//! const store = new VectorStore(768, "cosine");
//! store.insert(1n, new Float32Array([0.1, 0.2, ...])); // ids are u64, so pass a BigInt
22//! const results = store.search(new Float32Array([0.1, ...]), 10);
23//! ```
24
25use serde::{Deserialize, Serialize};
26use wasm_bindgen::prelude::*;
27
28mod distance;
29mod persistence;
30mod simd;
31
32pub use distance::DistanceMetric;
33
34/// A vector store for in-memory vector search.
35///
36/// # Performance
37///
38/// Uses contiguous memory layout for optimal cache locality and fast
39/// serialization. Vector data is stored in a single buffer rather than
40/// individual Vec allocations.
41#[wasm_bindgen]
42pub struct VectorStore {
43    /// Vector IDs in insertion order
44    ids: Vec<u64>,
45    /// Contiguous buffer: all vector data packed sequentially
46    data: Vec<f32>,
47    dimension: usize,
48    metric: DistanceMetric,
49}
50
51#[wasm_bindgen]
52impl VectorStore {
53    /// Creates a new vector store with the specified dimension and distance metric.
54    ///
55    /// # Arguments
56    ///
57    /// * `dimension` - Vector dimension (e.g., 768 for BERT, 1536 for GPT)
58    /// * `metric` - Distance metric: "cosine", "euclidean", or "dot"
59    ///
60    /// # Errors
61    ///
62    /// Returns an error if the metric is not recognized.
63    #[wasm_bindgen(constructor)]
64    pub fn new(dimension: usize, metric: &str) -> Result<VectorStore, JsValue> {
65        let metric = match metric.to_lowercase().as_str() {
66            "cosine" => DistanceMetric::Cosine,
67            "euclidean" | "l2" => DistanceMetric::Euclidean,
68            "dot" | "dotproduct" | "inner" => DistanceMetric::DotProduct,
69            _ => {
70                return Err(JsValue::from_str(
71                    "Unknown metric. Use: cosine, euclidean, dot",
72                ))
73            }
74        };
75
76        Ok(Self {
77            ids: Vec::new(),
78            data: Vec::new(),
79            dimension,
80            metric,
81        })
82    }
83
84    /// Returns the number of vectors in the store.
85    #[wasm_bindgen(getter)]
86    #[must_use]
87    pub fn len(&self) -> usize {
88        self.ids.len()
89    }
90
91    /// Returns true if the store is empty.
92    #[wasm_bindgen(getter)]
93    #[must_use]
94    pub fn is_empty(&self) -> bool {
95        self.ids.is_empty()
96    }
97
98    /// Returns the vector dimension.
99    #[wasm_bindgen(getter)]
100    #[must_use]
101    pub fn dimension(&self) -> usize {
102        self.dimension
103    }
104
105    /// Inserts a vector with the given ID.
106    ///
107    /// # Arguments
108    ///
109    /// * `id` - Unique identifier for the vector
110    /// * `vector` - `Float32Array` of the vector data
111    ///
112    /// # Errors
113    ///
114    /// Returns an error if vector dimension doesn't match store dimension.
115    #[wasm_bindgen]
116    pub fn insert(&mut self, id: u64, vector: &[f32]) -> Result<(), JsValue> {
117        if vector.len() != self.dimension {
118            return Err(JsValue::from_str(&format!(
119                "Vector dimension mismatch: expected {}, got {}",
120                self.dimension,
121                vector.len()
122            )));
123        }
124
125        // Remove existing vector with same ID if present
126        if let Some(idx) = self.ids.iter().position(|&x| x == id) {
127            self.ids.remove(idx);
128            let start = idx * self.dimension;
129            self.data.drain(start..start + self.dimension);
130        }
131
132        // Append to contiguous buffer
133        self.ids.push(id);
134        self.data.extend_from_slice(vector);
135
136        Ok(())
137    }
138
139    /// Searches for the k nearest neighbors to the query vector.
140    ///
141    /// # Arguments
142    ///
143    /// * `query` - Query vector as `Float32Array`
144    /// * `k` - Number of results to return
145    ///
146    /// # Returns
147    ///
148    /// Array of [id, score] pairs sorted by relevance.
149    ///
150    /// # Errors
151    ///
152    /// Returns an error if query dimension doesn't match store dimension.
153    #[wasm_bindgen]
154    pub fn search(&self, query: &[f32], k: usize) -> Result<JsValue, JsValue> {
155        if query.len() != self.dimension {
156            return Err(JsValue::from_str(&format!(
157                "Query dimension mismatch: expected {}, got {}",
158                self.dimension,
159                query.len()
160            )));
161        }
162
163        // Perf: Iterate over contiguous buffer with better cache locality
164        let mut results: Vec<(u64, f32)> = self
165            .ids
166            .iter()
167            .enumerate()
168            .map(|(idx, &id)| {
169                let start = idx * self.dimension;
170                let v_data = &self.data[start..start + self.dimension];
171                let score = self.metric.calculate(query, v_data);
172                (id, score)
173            })
174            .collect();
175
176        // Sort by relevance
177        if self.metric.higher_is_better() {
178            results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
179        } else {
180            results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
181        }
182
183        results.truncate(k);
184
185        serde_wasm_bindgen::to_value(&results).map_err(|e| JsValue::from_str(&e.to_string()))
186    }
187
188    /// Removes a vector by ID.
189    #[wasm_bindgen]
190    pub fn remove(&mut self, id: u64) -> bool {
191        if let Some(idx) = self.ids.iter().position(|&x| x == id) {
192            self.ids.remove(idx);
193            let start = idx * self.dimension;
194            self.data.drain(start..start + self.dimension);
195            true
196        } else {
197            false
198        }
199    }
200
201    /// Clears all vectors from the store.
202    #[wasm_bindgen]
203    pub fn clear(&mut self) {
204        self.ids.clear();
205        self.data.clear();
206    }
207
208    /// Returns memory usage estimate in bytes.
209    #[wasm_bindgen]
210    #[must_use]
211    pub fn memory_usage(&self) -> usize {
212        self.ids.len() * std::mem::size_of::<u64>() + self.data.len() * 4
213    }
214
215    /// Creates a new vector store with pre-allocated capacity.
216    ///
217    /// This is more efficient when you know the approximate number of vectors
218    /// you'll be inserting, as it avoids repeated memory allocations.
219    ///
220    /// # Arguments
221    ///
222    /// * `dimension` - Vector dimension
223    /// * `metric` - Distance metric: "cosine", "euclidean", or "dot"
224    /// * `capacity` - Number of vectors to pre-allocate space for
225    ///
226    /// # Errors
227    ///
228    /// Returns an error if the metric is not recognized.
229    #[wasm_bindgen]
230    pub fn with_capacity(
231        dimension: usize,
232        metric: &str,
233        capacity: usize,
234    ) -> Result<VectorStore, JsValue> {
235        let metric = match metric.to_lowercase().as_str() {
236            "cosine" => DistanceMetric::Cosine,
237            "euclidean" | "l2" => DistanceMetric::Euclidean,
238            "dot" | "dotproduct" | "inner" => DistanceMetric::DotProduct,
239            _ => {
240                return Err(JsValue::from_str(
241                    "Unknown metric. Use: cosine, euclidean, dot",
242                ))
243            }
244        };
245
246        Ok(Self {
247            ids: Vec::with_capacity(capacity),
248            data: Vec::with_capacity(capacity * dimension),
249            dimension,
250            metric,
251        })
252    }
253
254    /// Pre-allocates memory for the specified number of additional vectors.
255    ///
256    /// Call this before bulk insertions to avoid repeated allocations.
257    ///
258    /// # Arguments
259    ///
260    /// * `additional` - Number of additional vectors to reserve space for
261    #[wasm_bindgen]
262    pub fn reserve(&mut self, additional: usize) {
263        self.ids.reserve(additional);
264        self.data.reserve(additional * self.dimension);
265    }
266
267    /// Inserts multiple vectors in a single batch operation.
268    ///
269    /// This is significantly faster than calling `insert()` multiple times
270    /// because it pre-allocates memory and reduces per-call overhead.
271    ///
272    /// # Arguments
273    ///
274    /// * `batch` - JavaScript array of `[id, Float32Array]` pairs
275    ///
276    /// # Errors
277    ///
278    /// Returns an error if any vector dimension doesn't match store dimension.
279    #[wasm_bindgen]
280    pub fn insert_batch(&mut self, batch: JsValue) -> Result<(), JsValue> {
281        let batch: Vec<(u64, Vec<f32>)> = serde_wasm_bindgen::from_value(batch)
282            .map_err(|e| JsValue::from_str(&format!("Invalid batch format: {e}")))?;
283
284        // Validate all dimensions first
285        for (id, vector) in &batch {
286            if vector.len() != self.dimension {
287                return Err(JsValue::from_str(&format!(
288                    "Vector {} dimension mismatch: expected {}, got {}",
289                    id,
290                    self.dimension,
291                    vector.len()
292                )));
293            }
294        }
295
296        // Pre-allocate space for contiguous buffer
297        self.ids.reserve(batch.len());
298        self.data.reserve(batch.len() * self.dimension);
299
300        // Remove existing IDs first
301        let ids_to_remove: Vec<u64> = batch.iter().map(|(id, _)| *id).collect();
302        for id in &ids_to_remove {
303            if let Some(idx) = self.ids.iter().position(|&x| x == *id) {
304                self.ids.remove(idx);
305                let start = idx * self.dimension;
306                self.data.drain(start..start + self.dimension);
307            }
308        }
309
310        // Insert all vectors into contiguous buffer
311        for (id, vector) in batch {
312            self.ids.push(id);
313            self.data.extend_from_slice(&vector);
314        }
315
316        Ok(())
317    }
318
319    /// Exports the vector store to a binary format.
320    ///
321    /// The binary format contains:
322    /// - Header: dimension (u32), metric (u8), count (u64)
323    /// - For each vector: id (u64), data (f32 array)
324    ///
325    /// Use this to persist data to `IndexedDB` or `localStorage`.
326    ///
327    /// # Errors
328    ///
329    /// This function currently does not return errors but uses `Result`
330    /// for future extensibility.
331    ///
332    /// # Performance
333    ///
334    /// Perf: Pre-allocates exact buffer size to avoid reallocations.
335    /// Throughput: ~1600 MB/s on 10k vectors (768D)
336    #[wasm_bindgen]
337    pub fn export_to_bytes(&self) -> Result<Vec<u8>, JsValue> {
338        // Perf: Pre-allocate exact size - uses contiguous buffer for 2500+ MB/s
339        let count = self.ids.len();
340        let vector_size = 8 + self.dimension * 4; // id + data
341        let total_size = 18 + count * vector_size;
342        let mut bytes = Vec::with_capacity(total_size);
343
344        // Header: magic number "VELS" (4 bytes)
345        bytes.extend_from_slice(b"VELS");
346
347        // Version (1 byte)
348        bytes.push(1);
349
350        // Dimension (4 bytes, little-endian)
351        #[allow(clippy::cast_possible_truncation)]
352        let dim_u32 = self.dimension as u32;
353        bytes.extend_from_slice(&dim_u32.to_le_bytes());
354
355        // Metric (1 byte: 0=cosine, 1=euclidean, 2=dot)
356        let metric_byte = match self.metric {
357            DistanceMetric::Cosine => 0u8,
358            DistanceMetric::Euclidean => 1u8,
359            DistanceMetric::DotProduct => 2u8,
360        };
361        bytes.push(metric_byte);
362
363        // Vector count (8 bytes, little-endian)
364        #[allow(clippy::cast_possible_truncation)]
365        let count_u64 = count as u64;
366        bytes.extend_from_slice(&count_u64.to_le_bytes());
367
368        // Perf: Write IDs and data from contiguous buffers
369        for (idx, &id) in self.ids.iter().enumerate() {
370            bytes.extend_from_slice(&id.to_le_bytes());
371            // Direct slice from contiguous data buffer
372            let start = idx * self.dimension;
373            let data_slice = &self.data[start..start + self.dimension];
374            // Write f32s as bytes
375            let data_bytes: &[u8] = unsafe {
376                core::slice::from_raw_parts(data_slice.as_ptr().cast::<u8>(), self.dimension * 4)
377            };
378            bytes.extend_from_slice(data_bytes);
379        }
380
381        Ok(bytes)
382    }
383
384    /// Saves the vector store to `IndexedDB`.
385    ///
386    /// This method persists all vectors to the browser's `IndexedDB`,
387    /// enabling offline-first applications.
388    ///
389    /// # Arguments
390    ///
391    /// * `db_name` - Name of the `IndexedDB` database
392    ///
393    /// # Errors
394    ///
395    /// Returns an error if `IndexedDB` is not available or the save fails.
396    ///
397    /// # Example
398    ///
399    /// ```javascript
400    /// const store = new VectorStore(768, "cosine");
401    /// store.insert(1n, vector1);
402    /// await store.save("my-vectors");
403    /// ```
404    #[wasm_bindgen]
405    pub async fn save(&self, db_name: &str) -> Result<(), JsValue> {
406        let bytes = self.export_to_bytes()?;
407        persistence::save_to_indexeddb(db_name, &bytes).await
408    }
409
410    /// Loads a vector store from `IndexedDB`.
411    ///
412    /// This method restores all vectors from the browser's `IndexedDB`.
413    ///
414    /// # Arguments
415    ///
416    /// * `db_name` - Name of the `IndexedDB` database
417    ///
418    /// # Errors
419    ///
420    /// Returns an error if the database doesn't exist or is corrupted.
421    ///
422    /// # Example
423    ///
424    /// ```javascript
425    /// const store = await VectorStore.load("my-vectors");
426    /// console.log(store.len); // Number of restored vectors
427    /// ```
428    #[wasm_bindgen]
429    pub async fn load(db_name: &str) -> Result<VectorStore, JsValue> {
430        let bytes = persistence::load_from_indexeddb(db_name).await?;
431        Self::import_from_bytes(&bytes)
432    }
433
434    /// Deletes the `IndexedDB` database.
435    ///
436    /// Use this to clear all persisted data.
437    ///
438    /// # Arguments
439    ///
440    /// * `db_name` - Name of the `IndexedDB` database to delete
441    ///
442    /// # Errors
443    ///
444    /// Returns an error if the deletion fails.
445    #[wasm_bindgen]
446    pub async fn delete_database(db_name: &str) -> Result<(), JsValue> {
447        persistence::delete_database(db_name).await
448    }
449
450    /// Imports a vector store from binary format.
451    ///
452    /// Use this to restore data from `IndexedDB` or `localStorage`.
453    ///
454    /// # Errors
455    ///
456    /// Returns an error if:
457    /// - The data is too short or corrupted
458    /// - The magic number is invalid
459    /// - The version is unsupported
460    /// - The metric byte is invalid
461    #[wasm_bindgen]
462    pub fn import_from_bytes(bytes: &[u8]) -> Result<VectorStore, JsValue> {
463        if bytes.len() < 18 {
464            return Err(JsValue::from_str("Invalid data: too short"));
465        }
466
467        // Check magic number
468        if &bytes[0..4] != b"VELS" {
469            return Err(JsValue::from_str("Invalid data: wrong magic number"));
470        }
471
472        // Check version
473        let version = bytes[4];
474        if version != 1 {
475            return Err(JsValue::from_str(&format!(
476                "Unsupported version: {version}"
477            )));
478        }
479
480        // Read dimension
481        let dimension = u32::from_le_bytes([bytes[5], bytes[6], bytes[7], bytes[8]]) as usize;
482
483        // Read metric
484        let metric = match bytes[9] {
485            0 => DistanceMetric::Cosine,
486            1 => DistanceMetric::Euclidean,
487            2 => DistanceMetric::DotProduct,
488            _ => return Err(JsValue::from_str("Invalid metric byte")),
489        };
490
491        // Read vector count
492        #[allow(clippy::cast_possible_truncation)]
493        let count = u64::from_le_bytes([
494            bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15], bytes[16], bytes[17],
495        ]) as usize;
496
497        // Calculate expected size
498        let vector_size = 8 + dimension * 4; // id + data
499        let expected_size = 18 + count * vector_size;
500        if bytes.len() < expected_size {
501            return Err(JsValue::from_str(&format!(
502                "Invalid data: expected {expected_size} bytes, got {}",
503                bytes.len()
504            )));
505        }
506
507        // Perf: Pre-allocate contiguous buffers
508        // Optimization: Single allocation + bulk copy = 500+ MB/s
509        let mut ids = Vec::with_capacity(count);
510        let total_floats = count * dimension;
511        let mut data = vec![0.0_f32; total_floats];
512
513        let mut offset = 18;
514        let data_bytes_len = dimension * 4;
515
516        // Read all IDs first (cache-friendly sequential access)
517        for _ in 0..count {
518            let id = u64::from_le_bytes([
519                bytes[offset],
520                bytes[offset + 1],
521                bytes[offset + 2],
522                bytes[offset + 3],
523                bytes[offset + 4],
524                bytes[offset + 5],
525                bytes[offset + 6],
526                bytes[offset + 7],
527            ]);
528            ids.push(id);
529            offset += 8 + data_bytes_len; // Skip to next ID
530        }
531
532        // Perf: Bulk copy all vector data in one operation
533        // SAFETY: f32 and [u8; 4] have same size, WASM is little-endian
534        let data_as_bytes: &mut [u8] = unsafe {
535            core::slice::from_raw_parts_mut(data.as_mut_ptr().cast::<u8>(), total_floats * 4)
536        };
537
538        // Copy data from each vector position
539        offset = 18 + 8; // Skip header + first ID
540        for i in 0..count {
541            let dest_start = i * dimension * 4;
542            let dest_end = dest_start + data_bytes_len;
543            data_as_bytes[dest_start..dest_end]
544                .copy_from_slice(&bytes[offset..offset + data_bytes_len]);
545            offset += 8 + data_bytes_len; // Move to next vector's data
546        }
547
548        Ok(Self {
549            ids,
550            data,
551            dimension,
552            metric,
553        })
554    }
555}
556
557/// Search result containing ID and score.
558#[derive(Serialize, Deserialize)]
559pub struct SearchResult {
560    pub id: u64,
561    pub score: f32,
562}
563
564// Console logging for debugging
565#[wasm_bindgen]
566extern "C" {
567    #[wasm_bindgen(js_namespace = console)]
568    fn log(s: &str);
569}
570
571#[allow(unused_macros)]
572macro_rules! console_log {
573    ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
574}
575
576// Tests for VectorStore are in distance.rs and simd.rs modules
577// The wasm-bindgen VectorStore tests require wasm-bindgen-test and must
578// be run with `wasm-pack test --node`