use std::collections::HashMap;
use std::fs;
use std::hash::{BuildHasher, Hasher};
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use serde::{Deserialize, Serialize};
use crate::error::MemoryError;
use crate::vector_backend::{VectorBackend, VectorHit, VectorIndexConfig};
use usearch::ffi::{IndexOptions, MetricKind, ScalarKind};
use usearch::Index;
/// Scalar type passed as `IndexOptions::quantization` whenever this file
/// constructs a usearch index.
const SCALAR_KIND: ScalarKind = ScalarKind::F32;
/// Reserved id for the keymap sidecar file: `save_to_disk` appends it as the
/// final line, and `load` never maps it to a key.
const KEYMAP_SENTINEL: u64 = u64::MAX;
/// Manifest (version 1) describing one saved generation of the usearch
/// sidecar files.
///
/// `UsearchBackend::save_to_disk` serializes it as pretty-printed JSON and
/// `UsearchBackend::load` parses it back before touching the other files.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct UsearchSidecarManifestV1 {
// Written as `1` by `save_to_disk`.
schema_version: u32,
// Identifier produced at save time: `gen-` followed by a hex timestamp.
generation_id: String,
// Basename the sidecar file names were derived from.
basename: String,
// File name of this manifest.
manifest_file_name: String,
// File name of the serialized index; `load` joins it onto the sidecar dir.
data_file_name: String,
// File name of the id/key map; `load` joins it onto the sidecar dir.
keys_file_name: String,
// Placeholder: `save_to_disk` stores a fixed "n/a (usearch format opaque)" string.
graph_digest: String,
// BLAKE3 hex digest of the serialized index bytes.
data_digest: String,
// BLAKE3 hex digest of the keymap file contents.
keys_digest: String,
// Vector dimensionality taken from the backend configuration at save time.
dimensions: usize,
// Index size reported by usearch at save time.
vector_count: u64,
// Sidecar layout version; `load` accepts only `1`.
hnsw_sidecar_format_version: u32,
// `backend_kind` must be "usearch" for `load` to accept the sidecar;
// `backend_version` is a version string hard-coded in `save_to_disk`.
backend_kind: String, backend_version: String,
// `save_to_disk` stores the current Unix time in seconds here.
// NOTE(review): the name suggests a SQLite epoch — confirm intended meaning.
source_sqlite_epoch: Option<u64>,
// RFC 3339 UTC timestamp taken at save time.
created_at: String,
}
/// Vector backend that stores vectors in a usearch index and keeps a
/// bidirectional map between caller-supplied string keys and the `u64` ids
/// the index works with.
pub struct UsearchBackend {
// The usearch index. Mutating calls (`add`, `remove`) are issued through a
// shared (read) guard in this file.
index: RwLock<Index>,
// Caller key -> numeric id stored in the index.
key_to_id: RwLock<HashMap<String, u64>>,
// Numeric id -> caller key; used to translate search results back to keys.
id_to_key: RwLock<HashMap<u64, String>>,
// Initialized to 0 by the constructors.
// NOTE(review): never read or updated in the visible code — confirm whether it is still needed.
next_hash_seed: AtomicU64,
// Configuration the index was built with (dimensions, HNSW parameters, capacity).
config: VectorIndexConfig,
// Set after a successful insert/delete and cleared by `save_to_disk`.
dirty: std::sync::atomic::AtomicBool,
}
impl UsearchBackend {
/// Creates an empty backend whose index is configured from `config` and has
/// room reserved for `config.max_elements` vectors.
///
/// # Errors
/// Returns `MemoryError::HnswError` when `config.dimensions` is zero or when
/// usearch fails to create the index or reserve capacity.
pub fn new(config: VectorIndexConfig) -> Result<Self, MemoryError> {
    validate_dimensions(config.dimensions)?;

    let graph = match Index::new(&IndexOptions {
        dimensions: config.dimensions,
        metric: MetricKind::Cos,
        quantization: SCALAR_KIND,
        connectivity: config.m,
        expansion_add: config.ef_construction,
        expansion_search: config.ef_search,
        multi: false,
    }) {
        Ok(graph) => graph,
        Err(e) => {
            return Err(MemoryError::HnswError(format!(
                "usearch::Index::new failed: {e:?}"
            )))
        }
    };

    if let Err(e) = graph.reserve(config.max_elements) {
        return Err(MemoryError::HnswError(format!(
            "usearch::Index::reserve failed: {e:?}"
        )));
    }

    let backend = Self {
        index: RwLock::new(graph),
        key_to_id: RwLock::new(HashMap::new()),
        id_to_key: RwLock::new(HashMap::new()),
        next_hash_seed: AtomicU64::new(0),
        config,
        dirty: std::sync::atomic::AtomicBool::new(false),
    };
    Ok(backend)
}
pub fn load(
dir: &Path,
basename: &str,
config: VectorIndexConfig,
) -> Result<Self, MemoryError> {
let manifest_path = manifest_path(dir, basename);
let manifest_bytes = fs::read(&manifest_path).map_err(|e| {
MemoryError::StorageError(format!(
"usearch sidecar manifest read failed at {:?}: {e}",
manifest_path
))
})?;
let manifest: UsearchSidecarManifestV1 =
serde_json::from_slice(&manifest_bytes).map_err(|e| {
MemoryError::StorageError(format!(
"usearch sidecar manifest parse failed: {e}"
))
})?;
if manifest.backend_kind != "usearch" {
return Err(MemoryError::StorageError(format!(
"sidecar was written by '{}', not usearch. Rejecting to avoid data corruption.",
manifest.backend_kind
)));
}
if manifest.hnsw_sidecar_format_version != 1 {
return Err(MemoryError::StorageError(format!(
"unsupported sidecar format version: {}",
manifest.hnsw_sidecar_format_version
)));
}
let options = IndexOptions {
dimensions: config.dimensions,
metric: MetricKind::Cos,
quantization: SCALAR_KIND,
connectivity: config.m,
expansion_add: config.ef_construction,
expansion_search: config.ef_search,
multi: false,
};
let index = Index::new(&options).map_err(|e| {
MemoryError::HnswError(format!("usearch::Index::new failed during load: {e:?}"))
})?;
let data_path = dir.join(&manifest.data_file_name);
let data_path_str = data_path.to_str().ok_or_else(|| {
MemoryError::StorageError(format!("non-UTF8 data path: {:?}", data_path))
})?;
index.load(data_path_str).map_err(|e| {
MemoryError::StorageError(format!("usearch::Index::load failed: {e:?}"))
})?;
let keys_path = dir.join(&manifest.keys_file_name);
let keymap_raw = fs::read_to_string(&keys_path).map_err(|e| {
MemoryError::StorageError(format!(
"usearch keymap read failed at {:?}: {e}",
keys_path
))
})?;
let mut key_to_id = HashMap::new();
let mut id_to_key = HashMap::new();
for line in keymap_raw.lines() {
if line.is_empty() {
continue;
}
if let Some((id_str, key)) = line.split_once('\t') {
if let Ok(id) = id_str.parse::<u64>() {
if id == KEYMAP_SENTINEL {
continue;
}
key_to_id.insert(key.to_string(), id);
id_to_key.insert(id, key.to_string());
}
}
}
Ok(Self {
index: RwLock::new(index),
key_to_id: RwLock::new(key_to_id),
id_to_key: RwLock::new(id_to_key),
next_hash_seed: AtomicU64::new(0),
config,
dirty: std::sync::atomic::AtomicBool::new(false),
})
}
/// Derives the numeric index id for `key` by feeding its UTF-8 bytes to the
/// standard library's default hasher.
///
/// NOTE(review): the standard library does not guarantee that
/// `DefaultHasher` produces the same values across Rust releases, while the
/// ids produced here are persisted in the keymap file — confirm this is
/// acceptable.
fn hash_key(&self, key: &str) -> u64 {
    let mut state = std::hash::DefaultHasher::new();
    Hasher::write(&mut state, key.as_bytes());
    Hasher::finish(&state)
}
pub fn save_to_disk(&self, dir: &Path, basename: &str) -> Result<(), MemoryError> {
fs::create_dir_all(dir).map_err(|e| {
MemoryError::StorageError(format!(
"usearch sidecar dir create failed: {:?}: {e}",
dir
))
})?;
let data_path = dir.join(data_file_name(basename));
let _data_path_str = data_path.to_str().ok_or_else(|| {
MemoryError::StorageError(format!("non-UTF8 data path: {:?}", data_path))
})?;
let index = self.index.read().unwrap_or_else(|e| e.into_inner());
let buf_len = index.serialized_length();
let mut bytes = vec![0u8; buf_len];
let written = {
let len = bytes.len();
index.save_to_buffer(&mut bytes).map_err(|e| {
MemoryError::StorageError(format!("usearch save_to_buffer failed: {e:?}"))
})?;
len
};
let _ = written; let data_tmp = dir.join(format!("{}.tmp", data_file_name(basename)));
fs::write(&data_tmp, &bytes).map_err(|e| {
MemoryError::StorageError(format!(
"usearch data tmp write failed: {:?}: {e}",
data_tmp
))
})?;
fs::rename(&data_tmp, &data_path).map_err(|e| {
MemoryError::StorageError(format!(
"usearch data rename failed: {:?} → {:?}: {e}",
data_tmp, data_path
))
})?;
drop(index);
let keys_path = dir.join(keys_file_name(basename));
let keymap_raw = {
let id_to_key = self.id_to_key.read().unwrap_or_else(|e| e.into_inner());
let mut s = String::new();
for (id, key) in id_to_key.iter() {
s.push_str(&format!("{}\t{}\n", id, key));
}
s.push_str(&format!("{}\n", KEYMAP_SENTINEL));
s
};
let keys_tmp = dir.join(format!("{}.tmp", keys_file_name(basename)));
fs::write(&keys_tmp, &keymap_raw).map_err(|e| {
MemoryError::StorageError(format!(
"usearch keys tmp write failed: {:?}: {e}",
keys_tmp
))
})?;
fs::rename(&keys_tmp, &keys_path).map_err(|e| {
MemoryError::StorageError(format!(
"usearch keys rename failed: {:?} → {:?}: {e}",
keys_tmp, keys_path
))
})?;
let manifest = UsearchSidecarManifestV1 {
schema_version: 1,
generation_id: generate_generation_id(),
basename: basename.to_string(),
manifest_file_name: manifest_file_name(basename),
data_file_name: data_file_name(basename),
keys_file_name: keys_file_name(basename),
graph_digest: "n/a (usearch format opaque)".to_string(),
data_digest: blake3_digest_hex(&bytes),
keys_digest: blake3_digest_hex(keymap_raw.as_bytes()),
dimensions: self.config.dimensions,
vector_count: {
let idx = self.index.read().unwrap_or_else(|e| e.into_inner());
idx.size() as u64
},
hnsw_sidecar_format_version: 1,
backend_kind: "usearch".to_string(),
backend_version: "2.25.3".to_string(),
source_sqlite_epoch: Some(current_epoch_secs()),
created_at: chrono::Utc::now().to_rfc3339(),
};
let manifest_path = manifest_path(dir, basename);
let manifest_json = serde_json::to_vec_pretty(&manifest).map_err(|e| {
MemoryError::StorageError(format!("manifest serialize failed: {e}"))
})?;
let manifest_tmp = dir.join(format!("{}.tmp", manifest_file_name(basename)));
fs::write(&manifest_tmp, &manifest_json).map_err(|e| {
MemoryError::StorageError(format!(
"manifest tmp write failed: {:?}: {e}",
manifest_tmp
))
})?;
fs::rename(&manifest_tmp, &manifest_path).map_err(|e| {
MemoryError::StorageError(format!(
"manifest rename failed: {:?} → {:?}: {e}",
manifest_tmp, manifest_path
))
})?;
self.dirty.store(false, std::sync::atomic::Ordering::SeqCst);
Ok(())
}
pub fn config(&self) -> &VectorIndexConfig {
&self.config
}
pub fn is_dirty(&self) -> bool {
self.dirty.load(std::sync::atomic::Ordering::SeqCst)
}
}
/// Debug output lists the configured index parameters, the dirty flag and the
/// current index size; the vectors and key maps are not printed.
impl std::fmt::Debug for UsearchBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let cfg = &self.config;
        let dirty = self.dirty.load(std::sync::atomic::Ordering::SeqCst);
        // A poisoned lock is still readable; only the size is needed here.
        let size = match self.index.read() {
            Ok(guard) => guard.size(),
            Err(poisoned) => poisoned.into_inner().size(),
        };

        let mut out = f.debug_struct("UsearchBackend");
        out.field("dimensions", &cfg.dimensions);
        out.field("m", &cfg.m);
        out.field("ef_construction", &cfg.ef_construction);
        out.field("ef_search", &cfg.ef_search);
        out.field("dirty", &dirty);
        out.field("size", &size);
        out.finish()
    }
}
impl VectorBackend for UsearchBackend {
/// Adds `vector` under `key`.
///
/// The id is the one already recorded for `key` if there is one (for example
/// restored from a saved keymap); otherwise it is derived with `hash_key`.
/// A different key that already owns that id is reported as a collision
/// instead of being overwritten. If the index rejects the vector, map
/// entries created by this call are rolled back so the maps never point at a
/// vector that was not stored.
///
/// # Errors
/// `MemoryError::HnswError` when the vector has the wrong number of
/// dimensions, on an id collision, or when `usearch::Index::add` fails.
fn insert(&self, key: String, vector: &[f32]) -> Result<(), MemoryError> {
    validate_dimensions_vs_config(vector.len(), self.config.dimensions)?;

    // Resolve the id and publish the mapping under both map locks
    // (always taken in key_to_id -> id_to_key order).
    let (id, key_was_new, id_was_new) = {
        let mut key_to_id = self.key_to_id.write().unwrap_or_else(|e| e.into_inner());
        let mut id_to_key = self.id_to_key.write().unwrap_or_else(|e| e.into_inner());

        let id = key_to_id
            .get(&key)
            .copied()
            .unwrap_or_else(|| self.hash_key(&key));

        if let Some(owner) = id_to_key.get(&id) {
            if *owner != key {
                return Err(MemoryError::HnswError(format!(
                    "usearch key collision: '{key}' and '{owner}' both map to id {id}"
                )));
            }
        }

        let key_was_new = key_to_id.insert(key.clone(), id).is_none();
        let id_was_new = id_to_key.insert(id, key.clone()).is_none();
        (id, key_was_new, id_was_new)
    };

    // The map locks are released before touching the index, as before.
    let failure = {
        let index = self.index.read().unwrap_or_else(|e| e.into_inner());
        match index.add(id, vector) {
            Ok(_) => None,
            Err(e) => Some((
                format!("usearch::Index::add failed: {e:?}"),
                index.contains(id),
            )),
        }
    };

    if let Some((message, vector_present)) = failure {
        // Undo only what this call added, and only if the index really holds
        // no vector for the id (a concurrent insert of the same key may have
        // stored one).
        if !vector_present {
            if key_was_new {
                let mut key_to_id = self.key_to_id.write().unwrap_or_else(|e| e.into_inner());
                key_to_id.remove(&key);
            }
            if id_was_new {
                let mut id_to_key = self.id_to_key.write().unwrap_or_else(|e| e.into_inner());
                id_to_key.remove(&id);
            }
        }
        return Err(MemoryError::HnswError(message));
    }

    self.dirty.store(true, std::sync::atomic::Ordering::SeqCst);
    Ok(())
}
/// Removes `key` and its vector.
///
/// The id comes from the key map when the key is known, so keys restored
/// from a saved keymap are removed under the id they were stored with. For
/// an unknown key the hashed id is tried, unless that id belongs to a
/// different key, in which case nothing is removed. The vector is removed
/// from the index first; the maps are only updated once that succeeded.
///
/// # Errors
/// `MemoryError::HnswError` when `usearch::Index::remove` fails; the key
/// maps are left untouched in that case.
fn delete(&self, key: &str) -> Result<(), MemoryError> {
    let recorded = {
        let key_to_id = self.key_to_id.read().unwrap_or_else(|e| e.into_inner());
        key_to_id.get(key).copied()
    };
    let id = match recorded {
        Some(id) => id,
        None => {
            let candidate = self.hash_key(key);
            let owned_by_other = {
                let id_to_key = self.id_to_key.read().unwrap_or_else(|e| e.into_inner());
                id_to_key.get(&candidate).is_some_and(|owner| owner != key)
            };
            if owned_by_other {
                // Removing `candidate` would delete another key's vector.
                return Ok(());
            }
            candidate
        }
    };

    {
        let index = self.index.read().unwrap_or_else(|e| e.into_inner());
        index.remove(id).map_err(|e| {
            MemoryError::HnswError(format!("usearch::Index::remove failed: {e:?}"))
        })?;
    }

    {
        let mut key_to_id = self.key_to_id.write().unwrap_or_else(|e| e.into_inner());
        key_to_id.remove(key);
    }
    {
        let mut id_to_key = self.id_to_key.write().unwrap_or_else(|e| e.into_inner());
        // Only drop the reverse mapping if it really points at this key.
        if id_to_key.get(&id).is_some_and(|owner| owner == key) {
            id_to_key.remove(&id);
        }
    }

    self.dirty.store(true, std::sync::atomic::Ordering::SeqCst);
    Ok(())
}
fn update(&self, key: String, vector: &[f32]) -> Result<(), MemoryError> {
self.delete(&key)?;
self.insert(key, vector)
}
/// Returns up to `top_k` hits for `query`, in the order usearch reports them.
///
/// Result ids without an entry in the id/key map are left out, so fewer than
/// `top_k` hits may come back. An empty index or `top_k == 0` yields an empty
/// list.
///
/// # Errors
/// `MemoryError::HnswError` when `query` has the wrong number of dimensions
/// or `usearch::Index::search` fails.
fn search(&self, query: &[f32], top_k: usize) -> Result<Vec<VectorHit>, MemoryError> {
    validate_dimensions_vs_config(query.len(), self.config.dimensions)?;
    if top_k == 0 {
        return Ok(Vec::new());
    }

    // Lock order: id/key map first, then the index.
    let names = self.id_to_key.read().unwrap_or_else(|e| e.into_inner());
    let graph = self.index.read().unwrap_or_else(|e| e.into_inner());
    if graph.size() == 0 {
        return Ok(Vec::new());
    }

    // Never ask for more neighbours than the index holds.
    let wanted = top_k.min(graph.size());
    let found = match graph.search(query, wanted) {
        Ok(found) => found,
        Err(e) => {
            return Err(MemoryError::HnswError(format!(
                "usearch::Index::search failed: {e:?}"
            )))
        }
    };

    let mut hits: Vec<VectorHit> = Vec::new();
    for (id, dist) in found.keys.iter().zip(found.distances.iter()) {
        if let Some(name) = names.get(id) {
            hits.push(VectorHit {
                key: name.clone(),
                distance: *dist,
            });
        }
    }
    hits.truncate(top_k);
    Ok(hits)
}
/// Number of vectors currently reported by the usearch index.
fn len(&self) -> usize {
    let guard = match self.index.read() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    };
    guard.size()
}
/// `true` when the index holds no vectors.
fn is_empty(&self) -> bool {
    matches!(self.len(), 0)
}
fn save(&self, dir: &Path, basename: &str) -> Result<(), MemoryError> {
self.save_to_disk(dir, basename)
}
/// Human-readable description of this backend.
fn backend_name(&self) -> &'static str {
    const NAME: &str = "usearch 2.25 (single-file vector search, C++ via cxx bridge)";
    NAME
}
}
/// Rejects a zero dimensionality with `MemoryError::HnswError`; any other
/// value is accepted.
fn validate_dimensions(d: usize) -> Result<(), MemoryError> {
    match d {
        0 => Err(MemoryError::HnswError(
            "usearch dimensions must be > 0".to_string(),
        )),
        _ => Ok(()),
    }
}
/// Checks that a vector's length (`actual`) equals the configured
/// dimensionality (`expected`), returning `MemoryError::HnswError` otherwise.
fn validate_dimensions_vs_config(actual: usize, expected: usize) -> Result<(), MemoryError> {
    if actual == expected {
        return Ok(());
    }
    Err(MemoryError::HnswError(format!(
        "vector has {actual} dimensions, index expects {expected}"
    )))
}
/// Full path of the manifest file for `basename` inside `dir`.
fn manifest_path(dir: &Path, basename: &str) -> std::path::PathBuf {
    let mut path = dir.to_path_buf();
    path.push(manifest_file_name(basename));
    path
}
/// File name of the manifest: `<basename>.hnsw.manifest.json`.
fn manifest_file_name(basename: &str) -> String {
    [basename, ".hnsw.manifest.json"].concat()
}
/// File name of the serialized index: `<basename>.hnsw.data`.
fn data_file_name(basename: &str) -> String {
    [basename, ".hnsw.data"].concat()
}
/// File name of the keymap: `<basename>.hnsw.keys`.
fn keys_file_name(basename: &str) -> String {
    [basename, ".hnsw.keys"].concat()
}
/// Current wall-clock time as whole seconds since the Unix epoch, or `0` when
/// the system clock reads earlier than the epoch.
fn current_epoch_secs() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs(),
        Err(_) => 0,
    }
}
/// Builds a generation identifier of the form `gen-<hex>`, where `<hex>` is
/// the current time in nanoseconds since the Unix epoch (`0` when the system
/// clock reads earlier than the epoch).
fn generate_generation_id() -> String {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    let nanos: u128 = match since_epoch {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    format!("gen-{nanos:x}")
}
fn blake3_digest_hex(bytes: &[u8]) -> String {
use blake3::Hasher;
let mut h = Hasher::new();
h.update(bytes);
let out = h.finalize();
out.to_hex().to_string()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use tempfile::TempDir;

    /// Small 4-dimensional configuration shared by every test.
    fn test_config() -> VectorIndexConfig {
        VectorIndexConfig {
            m: 8,
            ef_construction: 64,
            ef_search: 32,
            dimensions: 4,
            max_elements: 100,
            compaction_threshold: 0.3,
            flush_interval_secs: None,
        }
    }

    /// Empty backend built from `test_config`.
    fn fresh_backend() -> UsearchBackend {
        UsearchBackend::new(test_config()).unwrap()
    }

    /// 4-dimensional unit vector pointing along `axis`.
    fn unit(axis: usize) -> [f32; 4] {
        let mut v = [0.0_f32; 4];
        v[axis] = 1.0;
        v
    }

    #[test]
    fn new_creates_empty_index() {
        let backend = fresh_backend();
        assert!(backend.is_empty());
        assert_eq!(backend.len(), 0);
    }

    #[test]
    fn insert_then_search_returns_match() {
        let backend = fresh_backend();
        for (axis, key) in ["fact:1", "fact:2", "fact:3"].into_iter().enumerate() {
            backend.insert(key.to_string(), &unit(axis)).unwrap();
        }
        assert_eq!(backend.len(), 3);

        let hits = backend.search(&unit(0), 2).unwrap();
        assert!(!hits.is_empty());
        assert_eq!(hits[0].key, "fact:1");
    }

    #[test]
    fn delete_removes_from_search() {
        let backend = fresh_backend();
        backend.insert("a".to_string(), &unit(0)).unwrap();
        backend.insert("b".to_string(), &unit(1)).unwrap();

        backend.delete("a").unwrap();
        assert_eq!(backend.len(), 1);

        let hits = backend.search(&unit(0), 5).unwrap();
        let keys: HashSet<_> = hits.iter().map(|h| h.key.clone()).collect();
        assert!(!keys.contains("a"));
        assert!(keys.contains("b"));
    }

    #[test]
    fn update_replaces_existing() {
        let backend = fresh_backend();
        backend.insert("k".to_string(), &unit(0)).unwrap();
        backend.update("k".to_string(), &unit(3)).unwrap();
        assert_eq!(backend.len(), 1);

        let hits = backend.search(&unit(3), 1).unwrap();
        assert_eq!(hits[0].key, "k");
    }

    #[test]
    fn search_with_wrong_dimensions_errors() {
        let backend = fresh_backend();
        let outcome = backend.search(&[1.0, 0.0], 5);
        assert!(outcome.is_err());
    }

    #[test]
    fn insert_with_wrong_dimensions_errors() {
        let backend = fresh_backend();
        let outcome = backend.insert("k".to_string(), &[1.0, 0.0]);
        assert!(outcome.is_err());
    }

    #[test]
    fn save_then_load_round_trips() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();

        let original = fresh_backend();
        for (axis, key) in ["fact:a", "fact:b", "fact:c"].into_iter().enumerate() {
            original.insert(key.to_string(), &unit(axis)).unwrap();
        }
        original.save(dir, "test").unwrap();

        // All three sidecar files must exist after a save.
        for file in ["test.hnsw.manifest.json", "test.hnsw.data", "test.hnsw.keys"] {
            assert!(dir.join(file).exists());
        }

        let restored = UsearchBackend::load(dir, "test", test_config()).unwrap();
        assert_eq!(restored.len(), 3);
        let hits = restored.search(&unit(0), 1).unwrap();
        assert_eq!(hits[0].key, "fact:a");
    }

    #[test]
    fn load_rejects_hnsw_rs_backend_kind() {
        let tmp = TempDir::new().unwrap();
        let dir = tmp.path();

        // Manifest claiming to come from a different backend.
        let foreign_manifest = serde_json::json!({
            "schema_version": 1,
            "generation_id": "fake",
            "basename": "test",
            "manifest_file_name": "test.hnsw.manifest.json",
            "data_file_name": "test.hnsw.data",
            "keys_file_name": "test.hnsw.keys",
            "graph_digest": "n/a",
            "data_digest": "n/a",
            "keys_digest": "n/a",
            "dimensions": 4,
            "vector_count": 0,
            "hnsw_sidecar_format_version": 1,
            "backend_kind": "hnsw_rs",
            "backend_version": "0.3.4",
            "source_sqlite_epoch": null,
            "created_at": "2026-06-02T00:00:00Z"
        });
        let manifest_bytes = serde_json::to_vec_pretty(&foreign_manifest).unwrap();
        fs::write(dir.join("test.hnsw.manifest.json"), manifest_bytes).unwrap();
        fs::write(dir.join("test.hnsw.data"), b"").unwrap();
        fs::write(dir.join("test.hnsw.keys"), "").unwrap();

        let outcome = UsearchBackend::load(dir, "test", test_config());
        assert!(outcome.is_err());
        assert!(outcome.unwrap_err().to_string().contains("hnsw_rs"));
    }

    #[test]
    fn backend_name_includes_usearch() {
        let backend = fresh_backend();
        assert!(backend.backend_name().contains("usearch"));
    }
}