// lcpfs 2026.1.102
//
// LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Vector/Semantic Search Module for LCPFS.
//!
//! This module provides native vector embedding storage and similarity search
//! at the filesystem level. Store embeddings alongside files, search by
//! semantic meaning.
//!
//! # Overview
//!
//! The vector search subsystem enables AI-powered semantic search directly
//! within LCPFS. Instead of relying on external vector databases, embeddings
//! are stored alongside file metadata and indexed using HNSW (Hierarchical
//! Navigable Small World) graphs for fast approximate nearest neighbor search.
//!
//! # Use Cases
//!
//! - `lcpfs search "photos from beach vacation"` → finds semantically similar images
//! - `lcpfs search "error handling code"` → finds relevant source files
//! - AI applications storing/querying embeddings without external vector DB
//!
//! # Architecture
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                     VectorEngine (Global)                    │
//! │  - Manages indexes per dataset                              │
//! │  - Provides store/search/delete API                         │
//! └─────────────────────────────────────────────────────────────┘
//!                              │
//!              ┌───────────────┼───────────────┐
//!              ▼               ▼               ▼
//!       ┌──────────┐    ┌──────────┐    ┌──────────┐
//!       │ Dataset  │    │ Dataset  │    │ Dataset  │
//!       │ "photos" │    │ "docs"   │    │ "code"   │
//!       └────┬─────┘    └────┬─────┘    └────┬─────┘
//!            │               │               │
//!            ▼               ▼               ▼
//!       ┌──────────┐    ┌──────────┐    ┌──────────┐
//!       │   HNSW   │    │   HNSW   │    │   HNSW   │
//!       │  Index   │    │  Index   │    │  Index   │
//!       └──────────┘    └──────────┘    └──────────┘
//! ```
//!
//! # Quick Start
//!
//! ```ignore
//! use lcpfs::vector::{store_embedding, search, find_similar};
//!
//! // Store an embedding for a file
//! let embedding = get_clip_embedding(&image_data); // From your ML model
//! store_embedding("pool/photos", file_id, "clip-vit-b32", &embedding)?;
//!
//! // Search by similarity
//! let query_embedding = get_clip_embedding(&query_image);
//! let results = search("pool/photos", &query_embedding, 10)?;
//!
//! for result in results {
//!     println!("Object {} - Score: {}", result.object_id, result.score);
//! }
//!
//! // Find similar objects
//! let similar = find_similar("pool/photos", some_object_id, 5)?;
//! ```
//!
//! # HNSW Algorithm
//!
//! HNSW builds a multi-layer graph for efficient approximate nearest neighbor search:
//!
//! ```text
//! Layer 2:    [A] -------- [B]
//!              |
//! Layer 1:    [A] -- [C] -- [B] -- [D]
//!              |      |      |      |
//! Layer 0:    [A]-[E]-[C]-[F]-[B]-[G]-[D]-[H]
//!               Query enters here, navigates to nearest
//!
//! Insert: O(log N)
//! Search: O(log N)
//! Memory: O(N * M) where M = max neighbors
//! ```
//!
//! # Distance Metrics
//!
//! - **Cosine**: Best for normalized embeddings (most common)
//! - **Euclidean**: L2 distance for unnormalized vectors
//! - **Dot Product**: For maximum inner product search
//!
//! # Quantization
//!
//! Reduce storage requirements while maintaining search quality:
//!
//! | Format | Storage | Quality | Use Case |
//! |--------|---------|---------|----------|
//! | F32    | 100%    | 100%    | Default, full precision |
//! | F16    | 50%     | ~99%    | Good balance |
//! | Int8   | 25%     | ~95%    | Large-scale, memory constrained |
//! | Binary | 3%      | ~85%    | Hamming distance, very fast |
//!
//! # Modules
//!
//! - `types`: Core data structures (embeddings, nodes, results)
//! - `distance`: Distance/similarity functions
//! - `hnsw`: HNSW index implementation
//! - `quantize`: Vector quantization (f32→f16/int8/binary)
//! - `engine`: Global VectorEngine API
//! - `persist`: Serialization for on-disk storage

/// Core data types for vector search.
pub mod types;

/// Distance and similarity functions.
pub mod distance;

/// HNSW (Hierarchical Navigable Small World) index.
pub mod hnsw;

/// Vector quantization (compression).
pub mod quantize;

/// Global vector engine and API.
pub mod engine;

/// Persistence layer for index serialization.
pub mod persist;

// ═══════════════════════════════════════════════════════════════════════════════
// RE-EXPORTS
// ═══════════════════════════════════════════════════════════════════════════════

// Types
pub use types::{
    DistanceMetric, HNSW_MAX_NEIGHBORS, HnswNode, IndexParams, IndexType, QuantizationType,
    SearchFilter, VectorEmbedding, VectorEmbeddingHeader, VectorError, VectorIndexMeta,
    VectorSearchResult,
};

// Distance functions
pub use distance::{
    compute_distance, compute_similarity, cosine_distance, cosine_similarity, dot_product,
    euclidean_distance, euclidean_distance_squared, hamming_distance, l1_norm, l2_norm,
    manhattan_distance, normalize, normalize_inplace,
};

// HNSW index
pub use hnsw::{HnswIndex, HnswStats};

// Quantization
pub use quantize::{
    Int8QuantParams, PqParams, binary_cosine_approx, bytes_to_f16, dequantize_f16_to_f32,
    dequantize_int8_signed_to_f32, dequantize_int8_to_f32, f16_to_bytes, f16_to_f32, f32_to_f16,
    quantize_f32_to_binary, quantize_f32_to_f16, quantize_f32_to_int8, quantize_f32_to_int8_signed,
};

// Engine (global API)
pub use engine::{
    VectorEngine, batch_store, delete_embedding, find_similar, get_embedding, get_index_meta,
    get_index_stats, list_datasets, rebuild_index, search, search_filtered, set_ef_search,
    store_embedding, total_vectors,
};

// Persistence
pub use persist::{
    SerializedIndex, SerializedLayer, VectorIndexHeader, VectorRecord, deserialize_index,
    serialize_index,
};

// ═══════════════════════════════════════════════════════════════════════════════
// CONVENIENCE FUNCTIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Model ID hash for common embedding models.
///
/// These are commonly used model identifiers for quick reference. Each
/// constant packs a 4-character ASCII tag into a big-endian `u32`
/// (i.e. `u32::from_be_bytes(*b"tag_")`), so the hex literal spells the
/// tag shown in the trailing comment.
pub mod models {
    /// OpenAI text-embedding-ada-002 (1536 dimensions)
    pub const OPENAI_ADA_002: u32 = 0x61646132; // "ada2"

    /// OpenAI text-embedding-3-small (1536 dimensions)
    pub const OPENAI_EMBED_3_SMALL: u32 = 0x65336d73; // "e3ms"

    /// OpenAI text-embedding-3-large (3072 dimensions)
    pub const OPENAI_EMBED_3_LARGE: u32 = 0x65336c67; // "e3lg"

    /// CLIP ViT-B/32 (512 dimensions)
    pub const CLIP_VIT_B32: u32 = 0x636c6232; // "clb2"

    /// CLIP ViT-L/14 (768 dimensions)
    ///
    /// NOTE(review): previous value was `0x636c6c14`, whose low byte (0x14,
    /// a control character) did not match the documented "cll4" tag; all
    /// sibling constants encode their full ASCII tag, so this was a typo
    /// ('4' is 0x34).
    pub const CLIP_VIT_L14: u32 = 0x636c6c34; // "cll4"

    /// Sentence-BERT (768 dimensions)
    pub const SBERT_BASE: u32 = 0x73626572; // "sber"

    /// Cohere embed-english-v3 (1024 dimensions)
    pub const COHERE_EMBED_V3: u32 = 0x636f6833; // "coh3"

    /// BGE-base-en (768 dimensions)
    pub const BGE_BASE: u32 = 0x62676562; // "bgeb"

    /// Hash a model name to a u32 ID.
    ///
    /// Uses the classic polynomial rolling hash (multiplier 31, as in
    /// Java's `String.hashCode`), with wrapping arithmetic so overflow is
    /// well-defined. Deterministic across runs; NOT cryptographic and not
    /// collision-free — intended only as a compact model identifier.
    pub fn hash_model_name(name: &str) -> u32 {
        let mut hash = 0u32;
        for byte in name.bytes() {
            hash = hash.wrapping_mul(31).wrapping_add(byte as u32);
        }
        hash
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// TESTS
// ═══════════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    /// Build a deterministic pseudo-random embedding of length `dim`,
    /// scaled to unit length. An LCG seeded by `seed` guarantees the same
    /// seed always yields the same vector, so tests are reproducible.
    fn random_embedding(dim: usize, seed: u64) -> Vec<f32> {
        let mut state = seed;
        let mut out: Vec<f32> = Vec::with_capacity(dim);
        for _ in 0..dim {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
            out.push(((state >> 33) as f32 / (1u64 << 31) as f32) * 2.0 - 1.0);
        }

        // Normalize to unit length so cosine scores behave predictably.
        let magnitude: f32 = out.iter().map(|c| c * c).sum::<f32>().sqrt();
        for component in out.iter_mut() {
            *component /= magnitude;
        }
        out
    }

    #[test]
    fn test_full_workflow() {
        let dim = 128;
        let mut engine = VectorEngine::new();

        // Populate one dataset with 20 deterministic embeddings.
        for id in 0..20u64 {
            let emb = random_embedding(dim, id);
            engine
                .store_embedding("test/dataset", id, "test-model", &emb)
                .unwrap();
        }

        // Engine should account for everything we stored.
        assert_eq!(engine.total_vectors(), 20);
        assert!(engine.has_index("test/dataset"));

        // A query built from seed 5 equals object 5's embedding, so the
        // top hit must score as a near-perfect match.
        let probe = random_embedding(dim, 5);
        let hits = engine.search("test/dataset", &probe, 5).unwrap();
        assert!(!hits.is_empty());
        assert!(hits[0].score > 0.9);

        // Similarity lookup by object id must exclude the object itself.
        let neighbors = engine.find_similar("test/dataset", 10, 3).unwrap();
        assert_eq!(neighbors.len(), 3);
        assert!(neighbors.iter().all(|r| r.object_id != 10));

        // Deletion removes exactly the requested object and nothing else.
        engine.delete("test/dataset", 5).unwrap();
        assert!(!engine.contains("test/dataset", 5));
        assert!(engine.contains("test/dataset", 6));
    }

    #[test]
    fn test_quantization_workflow() {
        let source = vec![0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8];

        // F16 round-trip: half precision stays within ~1e-2.
        let halves = quantize_f32_to_f16(&source);
        let restored = dequantize_f16_to_f32(&halves);
        assert!(source
            .iter()
            .zip(restored.iter())
            .all(|(orig, back)| (orig - back).abs() < 0.01));

        // Int8 round-trip: coarser, but stays within ~1e-1.
        let (codes, qparams) = quantize_f32_to_int8(&source);
        let recovered = dequantize_int8_to_f32(&codes, &qparams);
        assert!(source
            .iter()
            .zip(recovered.iter())
            .all(|(orig, back)| (orig - back).abs() < 0.1));

        // Binary: one bit per dimension, so 8 dims pack into one byte.
        let bits = quantize_f32_to_binary(&source);
        assert_eq!(bits.len(), 1);
    }

    #[test]
    fn test_distance_functions() {
        let x_axis = vec![1.0, 0.0, 0.0];
        let y_axis = vec![0.0, 1.0, 0.0];
        let x_copy = vec![1.0, 0.0, 0.0];

        // Cosine: orthogonal unit vectors -> 0, identical -> 1.
        assert!((cosine_similarity(&x_axis, &y_axis) - 0.0).abs() < 1e-6);
        assert!((cosine_similarity(&x_axis, &x_copy) - 1.0).abs() < 1e-6);

        // Euclidean: sqrt(2) between unit axes, 0 for identical vectors.
        assert!((euclidean_distance(&x_axis, &y_axis) - 1.414).abs() < 0.01);
        assert!((euclidean_distance(&x_axis, &x_copy) - 0.0).abs() < 1e-6);

        // Dot product mirrors cosine for unit-length inputs.
        assert!((dot_product(&x_axis, &y_axis) - 0.0).abs() < 1e-6);
        assert!((dot_product(&x_axis, &x_copy) - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_hnsw_recall() {
        let mut index = HnswIndex::new(16, 200);
        index.set_ef_search(100);

        let dim = 64;
        let n = 100usize;

        // Insert n deterministic vectors, keeping copies for brute force.
        let vectors: Vec<Vec<f32>> =
            (0..n).map(|i| random_embedding(dim, i as u64)).collect();
        for (id, v) in vectors.iter().enumerate() {
            index.insert(id as u64, v).unwrap();
        }

        let k = 10;
        let num_queries = 5;
        let mut recall_sum = 0.0f32;

        for q in 0..num_queries {
            let query = &vectors[q * 10];

            // Approximate answer from the HNSW graph.
            let approx: hashbrown::HashSet<_> =
                index.search(query, k).iter().map(|r| r.object_id).collect();

            // Exact answer via exhaustive scan, sorted by distance.
            let mut ranked: Vec<_> = (0..n as u64)
                .map(|i| (i, cosine_distance(query, &vectors[i as usize])))
                .collect();
            ranked.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());

            // Fraction of the exact top-k recovered by HNSW.
            let overlap = ranked
                .iter()
                .take(k)
                .filter(|(id, _)| approx.contains(id))
                .count();
            recall_sum += overlap as f32 / k as f32;
        }

        let avg_recall = recall_sum / num_queries as f32;
        assert!(avg_recall > 0.8, "Recall {} should be > 0.8", avg_recall);
    }

    #[test]
    fn test_serialization_roundtrip() {
        let mut index = HnswIndex::new(16, 200);
        for seed in 0..5u64 {
            index.insert(seed, &random_embedding(32, seed)).unwrap();
        }

        // Round-trip through bytes.
        let blob = serialize_index(&index, 42).unwrap();
        let revived = deserialize_index(&blob).unwrap();

        assert_eq!(revived.len(), 5);
        assert_eq!(revived.dimensions(), 32);

        // The restored index must still answer queries.
        let probe = random_embedding(32, 0);
        let hits = revived.search(&probe, 3);
        assert!(!hits.is_empty());
    }

    #[test]
    fn test_model_ids() {
        // Hashing the same name twice yields the same id.
        let first = models::hash_model_name("clip-vit-b32");
        let second = models::hash_model_name("clip-vit-b32");
        assert_eq!(first, second);

        // Distinct model names should not collide here.
        assert_ne!(first, models::hash_model_name("clip-vit-l14"));
    }
}