use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::path::Path;
use thiserror::Error;
pub use veclite_index::{
hnsw::{HnswConfig, HnswIndex},
CosineMetric, DotMetric, EuclideanMetric, ManhattanMetric, Metric, SimilarityMetric,
};
pub use veclite_storage::{Record, Storage, StorageError};
/// Errors returned by the VecLite API.
#[derive(Error, Debug)]
pub enum VecLiteError {
/// Failure reported by the storage layer (converted via `From<StorageError>`).
#[error("Storage error: {0}")]
Storage(#[from] StorageError),
/// Failure reported by `serde_json` (converted via `From<serde_json::Error>`).
#[error("Serialization error: {0}")]
Serde(#[from] serde_json::Error),
}
/// Result alias whose error type is [`VecLiteError`].
pub type Result<T> = std::result::Result<T, VecLiteError>;
/// One hit returned by a search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
/// Identifier of the matched record.
pub id: String,
/// Score of the match as produced by the database's metric; whether a larger or a
/// smaller value is better depends on that metric.
pub score: f32,
/// Copy of the matched record's metadata, if it has any.
pub metadata: Option<serde_json::Value>,
}
pub struct SearchBuilder<'a> {
db: &'a VecLite,
query: Vec<f32>,
k: usize,
filters: Vec<(&'a str, serde_json::Value)>,
decay_factor: Option<f32>,
current_time: Option<u64>,
}
impl<'a> SearchBuilder<'a> {
pub fn new(db: &'a VecLite, query: Vec<f32>) -> Self {
Self {
db,
query,
k: 5,
filters: Vec::new(),
decay_factor: None,
current_time: None,
}
}
pub fn top_k(mut self, k: usize) -> Self {
self.k = k;
self
}
pub fn filter<V: Serialize>(mut self, key: &'a str, value: V) -> Self {
if let Ok(v) = serde_json::to_value(value) {
self.filters.push((key, v));
}
self
}
pub fn time_decay(mut self, factor: f32, current_time: u64) -> Self {
self.decay_factor = Some(factor);
self.current_time = Some(current_time);
self
}
pub fn execute(self) -> Result<Vec<SearchResult>> {
let use_hnsw =
self.filters.is_empty() && self.decay_factor.is_none() && self.db.index.is_some();
if use_hnsw {
let hnsw = self.db.index.as_ref().unwrap();
let get_vector = |idx: usize| self.db.storage.records[idx].vector.as_slice();
let distance_fn = |a: &[f32], b: &[f32]| match self.db.metric {
Metric::Cosine => CosineMetric::distance(a, b),
Metric::DotProduct => DotMetric::distance(a, b),
Metric::Euclidean => EuclideanMetric::distance(a, b),
Metric::Manhattan => ManhattanMetric::distance(a, b),
};
let results_indices = hnsw.search(
&self.query,
self.k,
hnsw.config.ef_search,
&get_vector,
&distance_fn,
);
let mut results = Vec::with_capacity(results_indices.len());
for (idx, score) in results_indices {
let r = &self.db.storage.records[idx];
results.push(SearchResult {
id: r.id.clone(),
score,
metadata: r.metadata.clone(),
});
}
return Ok(results);
}
let mut results: Vec<SearchResult> = self
.db
.storage
.records
.par_iter()
.filter(|r| {
if self.filters.is_empty() {
return true;
}
if let Some(ref meta) = r.metadata {
for (k, v) in &self.filters {
if meta.get(*k) != Some(v) {
return false;
}
}
true
} else {
false
}
})
.map(|r| {
let mut score = match self.db.metric {
Metric::Cosine => CosineMetric::distance(&self.query, &r.vector),
Metric::DotProduct => DotMetric::distance(&self.query, &r.vector),
Metric::Euclidean => EuclideanMetric::distance(&self.query, &r.vector),
Metric::Manhattan => ManhattanMetric::distance(&self.query, &r.vector),
};
if let (Some(factor), Some(current), Some(ts)) =
(self.decay_factor, self.current_time, r.timestamp)
{
if current > ts {
let age = (current - ts) as f32;
score *= factor.powf(age / 86400.0);
}
}
SearchResult {
id: r.id.clone(),
score,
metadata: r.metadata.clone(),
}
})
.collect();
let higher_better = match self.db.metric {
Metric::Cosine | Metric::DotProduct => true,
Metric::Euclidean | Metric::Manhattan => false,
};
if higher_better {
results.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
} else {
results.sort_by(|a, b| {
a.score
.partial_cmp(&b.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
}
results.truncate(self.k);
Ok(results)
}
}
/// A vector database: a record store, the metric used to compare vectors, and an
/// optional HNSW index over the stored records.
pub struct VecLite {
/// Backing record store; `storage.records` holds the in-memory records.
pub storage: Storage,
/// Metric used to score a query against stored vectors.
pub metric: Metric,
/// HNSW index over positions in `storage.records`; `None` until `enable_hnsw` is called.
pub index: Option<HnswIndex>,
}
impl VecLite {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
Self::open_with_metric(path, Metric::Cosine)
}
pub fn open_with_metric<P: AsRef<Path>>(path: P, metric: Metric) -> Result<Self> {
let storage = Storage::open(path)?;
let db = Self {
storage,
metric,
index: None,
};
Ok(db)
}
pub fn enable_hnsw(&mut self, config: HnswConfig) {
let mut index = HnswIndex::new(config);
let get_vector = |idx: usize| self.storage.records[idx].vector.as_slice();
let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
Metric::Cosine => CosineMetric::distance(a, b),
Metric::DotProduct => DotMetric::distance(a, b),
Metric::Euclidean => EuclideanMetric::distance(a, b),
Metric::Manhattan => ManhattanMetric::distance(a, b),
};
for i in 0..self.storage.records.len() {
index.insert(
i,
&self.storage.records[i].vector,
&get_vector,
&distance_fn,
);
}
self.index = Some(index);
}
pub fn insert(
&mut self,
id: &str,
vector: Vec<f32>,
metadata: Option<serde_json::Value>,
) -> Result<()> {
self.insert_with_time(id, vector, metadata, None)
}
pub fn insert_with_time(
&mut self,
id: &str,
vector: Vec<f32>,
metadata: Option<serde_json::Value>,
timestamp: Option<u64>,
) -> Result<()> {
let record = Record {
id: id.to_string(),
vector: vector.clone(),
metadata,
timestamp,
};
let idx = self.storage.records.len();
self.storage.append(record)?;
if let Some(index) = &mut self.index {
let get_vector = |i: usize| self.storage.records[i].vector.as_slice();
let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
Metric::Cosine => CosineMetric::distance(a, b),
Metric::DotProduct => DotMetric::distance(a, b),
Metric::Euclidean => EuclideanMetric::distance(a, b),
Metric::Manhattan => ManhattanMetric::distance(a, b),
};
index.insert(idx, &vector, &get_vector, &distance_fn);
}
Ok(())
}
pub fn insert_batch(
&mut self,
records: Vec<(&str, Vec<f32>, Option<serde_json::Value>)>,
) -> Result<()> {
let start_idx = self.storage.records.len();
let mut recs = Vec::new();
for (id, vector, metadata) in records {
recs.push(Record {
id: id.to_string(),
vector,
metadata,
timestamp: None,
});
}
self.storage.append_batch(recs.clone())?;
if let Some(index) = &mut self.index {
let get_vector = |i: usize| self.storage.records[i].vector.as_slice();
let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
Metric::Cosine => CosineMetric::distance(a, b),
Metric::DotProduct => DotMetric::distance(a, b),
Metric::Euclidean => EuclideanMetric::distance(a, b),
Metric::Manhattan => ManhattanMetric::distance(a, b),
};
for (offset, record) in recs.iter().enumerate() {
index.insert(
start_idx + offset,
&record.vector,
&get_vector,
&distance_fn,
);
}
}
Ok(())
}
pub fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
SearchBuilder::new(self, query.to_vec()).top_k(k).execute()
}
pub fn build_search(&self, query: &[f32]) -> SearchBuilder<'_> {
SearchBuilder::new(self, query.to_vec())
}
pub fn stats(&self) -> Result<(usize, usize)> {
let stats = self.storage.stats()?;
Ok(stats)
}
}
impl VecLite {
    /// Deletes every record whose id equals `id`.
    ///
    /// A tombstone record (same id, empty vector, no metadata, no timestamp) is
    /// appended through `Storage::append`, then all in-memory records with that id are
    /// removed. The tombstone is appended even when no record matches. How the storage
    /// layer treats the tombstone on reopen is not visible here.
    ///
    /// Removing records shifts the positions of the ones that follow, and the HNSW
    /// index addresses records by position. When an index is enabled and something was
    /// actually removed, the index is therefore rebuilt from scratch with its previous
    /// configuration; a stale index would return the wrong records or index out of
    /// bounds.
    ///
    /// # Errors
    /// Propagates any error from `Storage::append`; nothing is removed in that case.
    pub fn delete(&mut self, id: &str) -> Result<()> {
        let len_before = self.storage.records.len();
        let tombstone = Record {
            id: id.to_string(),
            vector: Vec::new(),
            metadata: None,
            timestamp: None,
        };
        self.storage.append(tombstone)?;
        self.storage.records.retain(|record| record.id != id);

        // Equal lengths mean no live record matched, so every position is unchanged
        // and the existing index is still valid.
        if self.storage.records.len() != len_before {
            if let Some(stale) = self.index.take() {
                self.enable_hnsw(stale.config);
            }
        }
        Ok(())
    }

    /// Replaces the record `id` by deleting it and inserting the new vector and
    /// metadata (without a timestamp).
    ///
    /// # Errors
    /// Propagates any error from [`VecLite::delete`] or [`VecLite::insert`].
    pub fn update(
        &mut self,
        id: &str,
        vector: Vec<f32>,
        metadata: Option<serde_json::Value>,
    ) -> Result<()> {
        self.delete(id)?;
        self.insert(id, vector, metadata)
    }

    /// Currently a no-op that always returns `Ok(())`.
    pub fn compact(&mut self) -> Result<()> {
        Ok(())
    }
}