//! veclite-core 1.0.9
//!
//! Core query and execution engine for VecLite.
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::path::Path;
use thiserror::Error;

pub use veclite_index::{
    hnsw::{HnswConfig, HnswIndex},
    CosineMetric, DotMetric, EuclideanMetric, ManhattanMetric, Metric, SimilarityMetric,
};
pub use veclite_storage::{Record, Storage, StorageError};

/// Errors returned by VecLite operations.
///
/// Each variant wraps an underlying error; the `#[from]` attributes generate the
/// `From` conversions, so `?` can be applied directly to storage and `serde_json`
/// results inside functions returning this crate's `Result`.
#[derive(Error, Debug)]
pub enum VecLiteError {
    /// An error reported by the storage layer.
    #[error("Storage error: {0}")]
    Storage(#[from] StorageError),
    /// An error reported by `serde_json`.
    #[error("Serialization error: {0}")]
    Serde(#[from] serde_json::Error),
}

/// Result alias whose error type is [`VecLiteError`].
pub type Result<T> = std::result::Result<T, VecLiteError>;

/// A single hit produced by a search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Identifier of the matching record (cloned from the stored record).
    pub id: String,
    /// Score of the record: the metric value computed during a linear scan, or
    /// the value reported by the HNSW index when the index is used.
    pub score: f32,
    /// Copy of the record's metadata, if it has any.
    pub metadata: Option<serde_json::Value>,
}

/// Builder that collects the options of one search before it is executed.
///
/// Borrows the database for `'a`; filter keys are borrowed for the same lifetime.
pub struct SearchBuilder<'a> {
    /// Database the search runs against.
    db: &'a VecLite,
    /// Query vector.
    query: Vec<f32>,
    /// Maximum number of results to return.
    k: usize,
    /// `(metadata key, expected value)` pairs a record's metadata must match.
    filters: Vec<(&'a str, serde_json::Value)>,
    /// Time-decay base; `None` disables time decay.
    decay_factor: Option<f32>,
    /// Reference time used to compute a record's age for time decay.
    current_time: Option<u64>,
}

impl<'a> SearchBuilder<'a> {
    pub fn new(db: &'a VecLite, query: Vec<f32>) -> Self {
        Self {
            db,
            query,
            k: 5,
            filters: Vec::new(),
            decay_factor: None,
            current_time: None,
        }
    }

    pub fn top_k(mut self, k: usize) -> Self {
        self.k = k;
        self
    }

    pub fn filter<V: Serialize>(mut self, key: &'a str, value: V) -> Self {
        if let Ok(v) = serde_json::to_value(value) {
            self.filters.push((key, v));
        }
        self
    }

    pub fn time_decay(mut self, factor: f32, current_time: u64) -> Self {
        self.decay_factor = Some(factor);
        self.current_time = Some(current_time);
        self
    }

    pub fn execute(self) -> Result<Vec<SearchResult>> {
        // If we have an HNSW index and NO filters and NO time decay, we can use it!
        // (HNSW doesn't inherently support post-filtering easily without more work,
        //  but for simple searches it's perfect.)
        let use_hnsw =
            self.filters.is_empty() && self.decay_factor.is_none() && self.db.index.is_some();

        if use_hnsw {
            let hnsw = self.db.index.as_ref().unwrap();

            let get_vector = |idx: usize| self.db.storage.records[idx].vector.as_slice();

            let distance_fn = |a: &[f32], b: &[f32]| match self.db.metric {
                Metric::Cosine => CosineMetric::distance(a, b),
                Metric::DotProduct => DotMetric::distance(a, b),
                Metric::Euclidean => EuclideanMetric::distance(a, b),
                Metric::Manhattan => ManhattanMetric::distance(a, b),
            };

            let results_indices = hnsw.search(
                &self.query,
                self.k,
                hnsw.config.ef_search,
                &get_vector,
                &distance_fn,
            );

            let mut results = Vec::with_capacity(results_indices.len());
            for (idx, score) in results_indices {
                let r = &self.db.storage.records[idx];
                results.push(SearchResult {
                    id: r.id.clone(),
                    score,
                    metadata: r.metadata.clone(),
                });
            }

            return Ok(results);
        }

        let mut results: Vec<SearchResult> = self
            .db
            .storage
            .records
            .par_iter()
            .filter(|r| {
                if self.filters.is_empty() {
                    return true;
                }
                if let Some(ref meta) = r.metadata {
                    for (k, v) in &self.filters {
                        if meta.get(*k) != Some(v) {
                            return false;
                        }
                    }
                    true
                } else {
                    false
                }
            })
            .map(|r| {
                let mut score = match self.db.metric {
                    Metric::Cosine => CosineMetric::distance(&self.query, &r.vector),
                    Metric::DotProduct => DotMetric::distance(&self.query, &r.vector),
                    Metric::Euclidean => EuclideanMetric::distance(&self.query, &r.vector),
                    Metric::Manhattan => ManhattanMetric::distance(&self.query, &r.vector),
                };

                if let (Some(factor), Some(current), Some(ts)) =
                    (self.decay_factor, self.current_time, r.timestamp)
                {
                    if current > ts {
                        let age = (current - ts) as f32;
                        score *= factor.powf(age / 86400.0);
                    }
                }

                SearchResult {
                    id: r.id.clone(),
                    score,
                    metadata: r.metadata.clone(),
                }
            })
            .collect();

        let higher_better = match self.db.metric {
            Metric::Cosine | Metric::DotProduct => true,
            Metric::Euclidean | Metric::Manhattan => false,
        };

        if higher_better {
            results.sort_by(|a, b| {
                b.score
                    .partial_cmp(&a.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        } else {
            results.sort_by(|a, b| {
                a.score
                    .partial_cmp(&b.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
        }

        results.truncate(self.k);
        Ok(results)
    }
}

/// A vector database handle: record storage, the metric used to score vectors,
/// and an optional HNSW index.
pub struct VecLite {
    /// Backing storage; `storage.records` is the in-memory record list that
    /// searches and the index read from.
    pub storage: Storage,
    /// Metric used to score a query against stored vectors.
    pub metric: Metric,
    /// HNSW index, present only after `enable_hnsw` has been called. It refers
    /// to records by their position in `storage.records`.
    pub index: Option<HnswIndex>,
}

impl VecLite {
    /// Open database
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        Self::open_with_metric(path, Metric::Cosine)
    }

    /// Open database with specific metric
    pub fn open_with_metric<P: AsRef<Path>>(path: P, metric: Metric) -> Result<Self> {
        let storage = Storage::open(path)?;
        let db = Self {
            storage,
            metric,
            index: None,
        };
        // By default, let's say we don't enable HNSW unless they call a build method,
        // but wait! The prompt said "When inserting a vector, add it to the HnswIndex if HNSW is enabled."
        // Let's create an enable_hnsw() function.
        Ok(db)
    }

    pub fn enable_hnsw(&mut self, config: HnswConfig) {
        let mut index = HnswIndex::new(config);

        let get_vector = |idx: usize| self.storage.records[idx].vector.as_slice();
        let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
            Metric::Cosine => CosineMetric::distance(a, b),
            Metric::DotProduct => DotMetric::distance(a, b),
            Metric::Euclidean => EuclideanMetric::distance(a, b),
            Metric::Manhattan => ManhattanMetric::distance(a, b),
        };

        // Rebuild index from existing records
        for i in 0..self.storage.records.len() {
            index.insert(
                i,
                &self.storage.records[i].vector,
                &get_vector,
                &distance_fn,
            );
        }

        self.index = Some(index);
    }

    /// Insert single vector
    pub fn insert(
        &mut self,
        id: &str,
        vector: Vec<f32>,
        metadata: Option<serde_json::Value>,
    ) -> Result<()> {
        self.insert_with_time(id, vector, metadata, None)
    }

    /// Insert vector with time
    pub fn insert_with_time(
        &mut self,
        id: &str,
        vector: Vec<f32>,
        metadata: Option<serde_json::Value>,
        timestamp: Option<u64>,
    ) -> Result<()> {
        let record = Record {
            id: id.to_string(),
            vector: vector.clone(),
            metadata,
            timestamp,
        };

        let idx = self.storage.records.len();
        self.storage.append(record)?;

        if let Some(index) = &mut self.index {
            let get_vector = |i: usize| self.storage.records[i].vector.as_slice();
            let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
                Metric::Cosine => CosineMetric::distance(a, b),
                Metric::DotProduct => DotMetric::distance(a, b),
                Metric::Euclidean => EuclideanMetric::distance(a, b),
                Metric::Manhattan => ManhattanMetric::distance(a, b),
            };
            index.insert(idx, &vector, &get_vector, &distance_fn);
        }

        Ok(())
    }

    /// Insert multiple vectors
    pub fn insert_batch(
        &mut self,
        records: Vec<(&str, Vec<f32>, Option<serde_json::Value>)>,
    ) -> Result<()> {
        let start_idx = self.storage.records.len();
        let mut recs = Vec::new();
        for (id, vector, metadata) in records {
            recs.push(Record {
                id: id.to_string(),
                vector,
                metadata,
                timestamp: None,
            });
        }
        self.storage.append_batch(recs.clone())?;

        if let Some(index) = &mut self.index {
            let get_vector = |i: usize| self.storage.records[i].vector.as_slice();
            let distance_fn = |a: &[f32], b: &[f32]| match self.metric {
                Metric::Cosine => CosineMetric::distance(a, b),
                Metric::DotProduct => DotMetric::distance(a, b),
                Metric::Euclidean => EuclideanMetric::distance(a, b),
                Metric::Manhattan => ManhattanMetric::distance(a, b),
            };

            for (offset, record) in recs.iter().enumerate() {
                index.insert(
                    start_idx + offset,
                    &record.vector,
                    &get_vector,
                    &distance_fn,
                );
            }
        }

        Ok(())
    }

    /// Simple search
    pub fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>> {
        SearchBuilder::new(self, query.to_vec()).top_k(k).execute()
    }

    /// Search with filters and decay
    pub fn build_search(&self, query: &[f32]) -> SearchBuilder<'_> {
        SearchBuilder::new(self, query.to_vec())
    }

    /// Database statistics
    pub fn stats(&self) -> Result<(usize, usize)> {
        let stats = self.storage.stats()?;
        Ok(stats)
    }
}

impl VecLite {
    /// Deletes the record(s) with the given `id`.
    ///
    /// A tombstone record (same id, empty vector) is appended to storage, then
    /// every record with this id is dropped from the in-memory record list.
    ///
    /// The HNSW index refers to records by their position in that list, and
    /// dropping a record shifts the position of every later one. Since the
    /// index has no removal operation yet, it is rebuilt from the remaining
    /// records (with its existing configuration) whenever a record was actually
    /// removed. This costs a full rebuild per delete, but keeps index searches
    /// from returning the wrong records or indexing out of bounds.
    ///
    /// # Errors
    /// Returns an error if the tombstone cannot be appended to storage; nothing
    /// is removed in that case.
    pub fn delete(&mut self, id: &str) -> Result<()> {
        let live_before = self.storage.records.len();

        // An empty vector marks the record as deleted in the append-only file.
        // A dedicated `deleted` flag on `Record` is planned for Phase 2.
        self.storage.append(Record {
            id: id.to_string(),
            vector: Vec::new(),
            metadata: None,
            timestamp: None,
        })?;

        self.storage.records.retain(|r| r.id != id);

        // If the count is unchanged, only the tombstone itself (if it was stored
        // in memory at all) was dropped, so no position the index knows moved.
        if self.storage.records.len() != live_before {
            if let Some(stale) = self.index.take() {
                self.enable_hnsw(stale.config);
            }
        }

        Ok(())
    }

    /// Replaces the record with the given `id`: deletes it, then inserts the new
    /// vector and metadata (the new record has no timestamp).
    ///
    /// # Errors
    /// Returns the first error from the delete or the insert.
    pub fn update(
        &mut self,
        id: &str,
        vector: Vec<f32>,
        metadata: Option<serde_json::Value>,
    ) -> Result<()> {
        self.delete(id)?;
        self.insert(id, vector, metadata)
    }

    /// Compacts the database (remove tombstones and rewrite the file).
    ///
    /// Currently a no-op scaffold that always succeeds; rewriting the `.vec`
    /// file without deleted records is planned for Phase 2.
    pub fn compact(&mut self) -> Result<()> {
        Ok(())
    }
}