pub mod types;
pub mod distance;
pub mod hnsw;
pub mod quantize;
pub mod engine;
pub mod persist;
pub use types::{
DistanceMetric, HNSW_MAX_NEIGHBORS, HnswNode, IndexParams, IndexType, QuantizationType,
SearchFilter, VectorEmbedding, VectorEmbeddingHeader, VectorError, VectorIndexMeta,
VectorSearchResult,
};
pub use distance::{
compute_distance, compute_similarity, cosine_distance, cosine_similarity, dot_product,
euclidean_distance, euclidean_distance_squared, hamming_distance, l1_norm, l2_norm,
manhattan_distance, normalize, normalize_inplace,
};
pub use hnsw::{HnswIndex, HnswStats};
pub use quantize::{
Int8QuantParams, PqParams, binary_cosine_approx, bytes_to_f16, dequantize_f16_to_f32,
dequantize_int8_signed_to_f32, dequantize_int8_to_f32, f16_to_bytes, f16_to_f32, f32_to_f16,
quantize_f32_to_binary, quantize_f32_to_f16, quantize_f32_to_int8, quantize_f32_to_int8_signed,
};
pub use engine::{
VectorEngine, batch_store, delete_embedding, find_similar, get_embedding, get_index_meta,
get_index_stats, list_datasets, rebuild_index, search, search_filtered, set_ef_search,
store_embedding, total_vectors,
};
pub use persist::{
SerializedIndex, SerializedLayer, VectorIndexHeader, VectorRecord, deserialize_index,
serialize_index,
};
pub mod models {
    //! Well-known embedding-model identifiers.
    //!
    //! Each constant packs a short tag into a `u32` so a model ID can be
    //! stored compactly (e.g. in index headers). Most tags are four ASCII
    //! bytes read big-endian.

    /// OpenAI text-embedding-ada-002 (bytes spell "ada2").
    pub const OPENAI_ADA_002: u32 = 0x61646132;
    /// OpenAI text-embedding-3-small (bytes spell "e3ms").
    pub const OPENAI_EMBED_3_SMALL: u32 = 0x65336d73;
    /// OpenAI text-embedding-3-large (bytes spell "e3lg").
    pub const OPENAI_EMBED_3_LARGE: u32 = 0x65336c67;
    /// CLIP ViT-B/32 (bytes spell "clb2").
    pub const CLIP_VIT_B32: u32 = 0x636c6232;
    /// CLIP ViT-L/14.
    /// NOTE(review): trailing byte 0x14 is not printable ASCII, unlike the
    /// sibling tags ("cll" + 0x14) — confirm this value is intentional.
    pub const CLIP_VIT_L14: u32 = 0x636c6c14;
    /// Sentence-BERT base (bytes spell "sber").
    pub const SBERT_BASE: u32 = 0x73626572;
    /// Cohere embed v3 (bytes spell "coh3").
    pub const COHERE_EMBED_V3: u32 = 0x636f6833;
    /// BGE base (bytes spell "bgeb").
    pub const BGE_BASE: u32 = 0x62676562;

    /// Hashes an arbitrary model name into a `u32` identifier.
    ///
    /// Classic polynomial rolling hash with multiplier 31 over the UTF-8
    /// bytes (the same scheme as Java's `String::hashCode`), wrapping on
    /// overflow. Deterministic, but not collision-free.
    pub fn hash_model_name(name: &str) -> u32 {
        name.bytes()
            .fold(0u32, |acc, b| acc.wrapping_mul(31).wrapping_add(u32::from(b)))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    /// Builds a deterministic pseudo-random embedding of length `dim`.
    ///
    /// An LCG (MMIX multiplier/increment) seeded with `seed` produces values
    /// in [-1, 1); the vector is then L2-normalized so cosine scores are
    /// directly comparable across tests.
    fn random_embedding(dim: usize, seed: u64) -> Vec<f32> {
        let mut rng = seed;
        let mut v: Vec<f32> = (0..dim)
            .map(|_| {
                rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
                // Top 31 bits of the state, scaled from [0, 1) to [-1, 1).
                ((rng >> 33) as f32 / (1u64 << 31) as f32) * 2.0 - 1.0
            })
            .collect();
        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        for x in &mut v {
            *x /= norm;
        }
        v
    }

    /// End-to-end engine test: store, search, find-similar, delete.
    #[test]
    fn test_full_workflow() {
        let mut engine = VectorEngine::new();
        let dim = 128;
        for i in 0..20 {
            let embedding = random_embedding(dim, i as u64);
            engine
                .store_embedding("test/dataset", i as u64, "test-model", &embedding)
                .unwrap();
        }
        assert_eq!(engine.total_vectors(), 20);
        assert!(engine.has_index("test/dataset"));
        // Querying with a stored vector should return a near-perfect top score.
        let query = random_embedding(dim, 5);
        let results = engine.search("test/dataset", &query, 5).unwrap();
        assert!(!results.is_empty());
        assert!(results[0].score > 0.9);
        let similar = engine.find_similar("test/dataset", 10, 3).unwrap();
        assert_eq!(similar.len(), 3);
        // The query object itself must be excluded from its own neighbors.
        assert!(similar.iter().all(|r| r.object_id != 10));
        engine.delete("test/dataset", 5).unwrap();
        assert!(!engine.contains("test/dataset", 5));
        assert!(engine.contains("test/dataset", 6));
    }

    /// Round-trips one embedding through f16, int8, and binary quantization.
    #[test]
    fn test_quantization_workflow() {
        let embedding = vec![0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8];
        // f16 round-trip should recover values in [-1, 1] to ~1e-2.
        let f16 = quantize_f32_to_f16(&embedding);
        let back_f16 = dequantize_f16_to_f32(&f16);
        for (orig, back) in embedding.iter().zip(back_f16.iter()) {
            assert!((orig - back).abs() < 0.01);
        }
        // int8 round-trip is coarser; allow 0.1 of absolute error.
        let (int8, params) = quantize_f32_to_int8(&embedding);
        let back_int8 = dequantize_int8_to_f32(&int8, &params);
        for (orig, back) in embedding.iter().zip(back_int8.iter()) {
            assert!((orig - back).abs() < 0.1);
        }
        // Eight components pack into a single byte of sign bits.
        let binary = quantize_f32_to_binary(&embedding);
        assert_eq!(binary.len(), 1);
    }

    /// Spot-checks the distance/similarity primitives on orthogonal and
    /// identical unit vectors.
    #[test]
    fn test_distance_functions() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let c = vec![1.0, 0.0, 0.0];
        assert!((cosine_similarity(&a, &b) - 0.0).abs() < 1e-6);
        assert!((cosine_similarity(&a, &c) - 1.0).abs() < 1e-6);
        // Orthogonal unit vectors are sqrt(2) apart.
        assert!((euclidean_distance(&a, &b) - 1.414).abs() < 0.01);
        assert!((euclidean_distance(&a, &c) - 0.0).abs() < 1e-6);
        assert!((dot_product(&a, &b) - 0.0).abs() < 1e-6);
        assert!((dot_product(&a, &c) - 1.0).abs() < 1e-6);
    }

    /// Measures HNSW recall@10 against exact brute-force cosine ranking;
    /// average recall over 5 queries must exceed 0.8.
    #[test]
    fn test_hnsw_recall() {
        let mut index = HnswIndex::new(16, 200);
        index.set_ef_search(100);
        let dim = 64;
        let n = 100;
        let mut vectors: Vec<Vec<f32>> = Vec::new();
        for i in 0..n {
            let v = random_embedding(dim, i as u64);
            vectors.push(v.clone());
            index.insert(i as u64, &v).unwrap();
        }
        let mut total_recall = 0.0;
        let num_queries = 5;
        let k = 10;
        for q in 0..num_queries {
            let query = &vectors[q * 10];
            let hnsw_results = index.search(query, k);
            let hnsw_ids: hashbrown::HashSet<_> =
                hnsw_results.iter().map(|r| r.object_id).collect();
            // Exact ground truth: rank everything by cosine distance.
            let mut exact: Vec<_> = (0..n as u64)
                .map(|i| (i, cosine_distance(query, &vectors[i as usize])))
                .collect();
            exact.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            let hits = exact
                .iter()
                .take(k)
                .filter(|(id, _)| hnsw_ids.contains(id))
                .count();
            total_recall += hits as f32 / k as f32;
        }
        let avg_recall = total_recall / num_queries as f32;
        assert!(avg_recall > 0.8, "Recall {} should be > 0.8", avg_recall);
    }

    /// Serializes an index, deserializes it, and verifies the restored index
    /// preserves size, dimensionality, and searchability.
    #[test]
    fn test_serialization_roundtrip() {
        let mut index = HnswIndex::new(16, 200);
        for i in 0..5 {
            let v = random_embedding(32, i as u64);
            index.insert(i as u64, &v).unwrap();
        }
        let bytes = serialize_index(&index, 42).unwrap();
        let restored = deserialize_index(&bytes).unwrap();
        assert_eq!(restored.len(), 5);
        assert_eq!(restored.dimensions(), 32);
        let query = random_embedding(32, 0);
        let results = restored.search(&query, 3);
        assert!(!results.is_empty());
    }

    /// `hash_model_name` is deterministic and separates these two names.
    #[test]
    fn test_model_ids() {
        assert_eq!(
            models::hash_model_name("clip-vit-b32"),
            models::hash_model_name("clip-vit-b32")
        );
        assert_ne!(
            models::hash_model_name("clip-vit-b32"),
            models::hash_model_name("clip-vit-l14")
        );
    }
}