use rayon::prelude::*;
use serde::Serialize;
use std::path::Path;
use thiserror::Error;
pub use veclite_index::{
CosineMetric, DotMetric, EuclideanMetric, ManhattanMetric, Metric, SimilarityMetric,
};
pub use veclite_storage::{Record, Storage, StorageError};
/// Error type returned by the fallible operations in this module.
///
/// Both variants carry `#[from]`, so the wrapped error types convert into
/// `VecLiteError` automatically when propagated with `?`.
#[derive(Error, Debug)]
pub enum VecLiteError {
/// A failure reported by the storage layer ([`StorageError`]).
#[error("Storage error: {0}")]
Storage(#[from] StorageError),
/// A JSON (de)serialization failure reported by `serde_json`.
#[error("Serialization error: {0}")]
Serde(#[from] serde_json::Error),
}
/// Convenience alias: a `std::result::Result` whose error type is [`VecLiteError`].
pub type Result<T> = std::result::Result<T, VecLiteError>;
/// One hit produced by a search.
#[derive(Debug, Clone)]
pub struct SearchResult {
/// Identifier of the matching record (a copy of the record's `id`).
pub id: String,
/// Score of the record against the query under the database's metric,
/// after optional time decay. Whether a higher or a lower value ranks
/// first depends on the metric.
pub score: f32,
/// Copy of the record's metadata, if it has any.
pub metadata: Option<serde_json::Value>,
}
/// Builder that accumulates the options of a single search against a
/// [`VecLite`] database; the search runs when `execute` is called.
pub struct SearchBuilder<'a> {
// Database whose records are searched.
db: &'a VecLite,
// Query vector every record is scored against.
query: Vec<f32>,
// Maximum number of results to return (set to 5 by `new`).
k: usize,
// Metadata constraints as (key, expected JSON value) pairs; a record must
// satisfy all of them to be considered.
filters: Vec<(&'a str, serde_json::Value)>,
// Time-decay base; `None` unless `time_decay` was called.
decay_factor: Option<f32>,
// Reference time that record ages are measured from; set together with
// `decay_factor` by `time_decay`.
current_time: Option<u64>,
}
impl<'a> SearchBuilder<'a> {
    /// Number of timestamp units in one decay period. With timestamps in
    /// seconds this is one day (presumably Unix seconds — confirm with callers).
    const DECAY_PERIOD: f32 = 86400.0;

    /// Creates a builder for `query` against `db` with the defaults:
    /// at most 5 results, no metadata filters, no time decay.
    pub fn new(db: &'a VecLite, query: Vec<f32>) -> Self {
        Self {
            db,
            query,
            k: 5,
            filters: Vec::new(),
            decay_factor: None,
            current_time: None,
        }
    }

    /// Sets the maximum number of results returned by [`execute`](Self::execute).
    pub fn top_k(mut self, k: usize) -> Self {
        self.k = k;
        self
    }

    /// Requires the record's metadata field `key` to equal `value`
    /// (compared as JSON values). Multiple filters are combined with AND.
    ///
    /// NOTE(review): if `value` cannot be serialized to JSON the filter is
    /// silently dropped, which widens the result set instead of failing.
    /// Surfacing that error would need a new builder field or a signature
    /// change, so the behaviour is kept as-is here.
    pub fn filter<V: Serialize>(mut self, key: &'a str, value: V) -> Self {
        if let Ok(v) = serde_json::to_value(value) {
            self.filters.push((key, v));
        }
        self
    }

    /// Enables time decay: a record with a timestamp older than
    /// `current_time` is penalised by `factor^(age / 86400)`, where
    /// `age = current_time - timestamp`. Records without a timestamp, or not
    /// older than `current_time`, keep their raw score.
    pub fn time_decay(mut self, factor: f32, current_time: u64) -> Self {
        self.decay_factor = Some(factor);
        self.current_time = Some(current_time);
        self
    }

    /// Runs the search and returns at most `k` results, best first.
    ///
    /// Every record that passes the metadata filters is scored in parallel
    /// with the database's metric, optionally time-decayed, then ranked:
    /// descending for Cosine / DotProduct, ascending for Euclidean /
    /// Manhattan. Records whose score is NaN are ranked last.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; the `Result` return
    /// type is part of the public interface.
    pub fn execute(self) -> Result<Vec<SearchResult>> {
        let Self {
            db,
            query,
            k,
            filters,
            decay_factor,
            current_time,
        } = self;
        let metric = &db.metric;
        let higher_better = match metric {
            Metric::Cosine | Metric::DotProduct => true,
            Metric::Euclidean | Metric::Manhattan => false,
        };

        // Keep only (score, &record) while ranking so that ids and metadata
        // are cloned for the surviving top-k, not for every matching record.
        let mut ranked: Vec<(f32, _)> = db
            .storage
            .records
            .par_iter()
            .filter(|r| Self::metadata_matches(&filters, r.metadata.as_ref()))
            .map(|r| {
                let mut score = match metric {
                    Metric::Cosine => CosineMetric::distance(&query, &r.vector),
                    Metric::DotProduct => DotMetric::distance(&query, &r.vector),
                    Metric::Euclidean => EuclideanMetric::distance(&query, &r.vector),
                    Metric::Manhattan => ManhattanMetric::distance(&query, &r.vector),
                };
                if let Some(decay) =
                    Self::decay_multiplier(decay_factor, current_time, r.timestamp)
                {
                    // A decay multiplier below 1 must make the record rank
                    // worse: shrink a similarity, but grow a distance.
                    score = if higher_better {
                        score * decay
                    } else {
                        score / decay
                    };
                }
                (score, r)
            })
            .collect();

        // Stable sort: ties keep storage order.
        ranked.sort_by(|a, b| Self::compare_scores(a.0, b.0, higher_better));
        ranked.truncate(k);

        Ok(ranked
            .into_iter()
            .map(|(score, r)| SearchResult {
                id: r.id.clone(),
                score,
                metadata: r.metadata.clone(),
            })
            .collect())
    }

    /// Returns `true` when `metadata` satisfies every filter. With no
    /// filters every record matches; with filters, a record without
    /// metadata never matches.
    fn metadata_matches(
        filters: &[(&str, serde_json::Value)],
        metadata: Option<&serde_json::Value>,
    ) -> bool {
        match metadata {
            Some(meta) => filters
                .iter()
                .all(|(key, expected)| meta.get(*key) == Some(expected)),
            None => filters.is_empty(),
        }
    }

    /// Returns the decay multiplier `factor^(age / DECAY_PERIOD)` for a
    /// record, or `None` when decay is not configured, the record has no
    /// timestamp, or the record is not older than `current_time`.
    fn decay_multiplier(
        decay_factor: Option<f32>,
        current_time: Option<u64>,
        timestamp: Option<u64>,
    ) -> Option<f32> {
        let (factor, now, ts) = (decay_factor?, current_time?, timestamp?);
        if now > ts {
            let age = (now - ts) as f32;
            Some(factor.powf(age / Self::DECAY_PERIOD))
        } else {
            None
        }
    }

    /// Total ordering used to rank scores: best first according to
    /// `higher_better`, with NaN always last.
    ///
    /// `partial_cmp` with an "equal" fallback is not a total order once a
    /// NaN is present, which `sort_by` requires.
    fn compare_scores(a: f32, b: f32, higher_better: bool) -> std::cmp::Ordering {
        use std::cmp::Ordering;

        match (a.is_nan(), b.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            (false, false) => {
                // Neither side is NaN, so the fallback is never taken.
                let ascending = a.partial_cmp(&b).unwrap_or(Ordering::Equal);
                if higher_better {
                    ascending.reverse()
                } else {
                    ascending
                }
            }
        }
    }
}
/// A vector database handle: a record store plus the metric used to score
/// records against queries.
pub struct VecLite {
/// Backing record store.
pub storage: Storage,
/// Metric used to score and rank records during search.
pub metric: Metric,
}
impl VecLite {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
Self::open_with_metric(path, Metric::Cosine)
}
pub fn open_with_metric<P: AsRef<Path>>(path: P, metric: Metric) -> Result<Self> {
let storage = Storage::open(path)?;
Ok(Self { storage, metric })
}
pub fn insert(
&mut self,
id: &str,
vector: Vec<f32>,
metadata: Option<serde_json::Value>,
) -> Result<()> {
self.insert_with_time(id, vector, metadata, None)
}
pub fn insert_with_time(
&mut self,
id: &str,
vector: Vec<f32>,
metadata: Option<serde_json::Value>,
timestamp: Option<u64>,
) -> Result<()> {
let record = Record {
id: id.to_string(),
vector,
metadata,
timestamp,
};
self.storage.append(record)?;
Ok(())
}
pub fn insert_batch(
&mut self,
records: Vec<(&str, Vec<f32>, Option<serde_json::Value>)>,
) -> Result<()> {
let mut recs = Vec::new();
for (id, vector, metadata) in records {
recs.push(Record {
id: id.to_string(),
vector,
metadata,
timestamp: None,
});
}
self.storage.append_batch(recs)?;
Ok(())
}
pub fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
SearchBuilder::new(self, query.to_vec()).top_k(k).execute()
}
pub fn build_search(&self, query: &[f32]) -> SearchBuilder<'_> {
SearchBuilder::new(self, query.to_vec())
}
pub fn stats(&self) -> Result<(usize, usize)> {
let stats = self.storage.stats()?;
Ok(stats)
}
}