Skip to main content

claw_vector/
types.rs

// types.rs — core domain types: DistanceMetric, IndexType, VectorRecord, Collection,
//             SearchResult, SearchMetrics, SearchResponse, MetadataFilter, RerankerConfig,
//             SearchQuery, HybridQuery, CollectionStats, and EngineStats.
3use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use uuid::Uuid;
6
7use crate::error::{VectorError, VectorResult};
8
9// ─── DistanceMetric ───────────────────────────────────────────────────────────
10
/// Distance metric used to compare vectors in a collection.
///
/// Every variant is oriented so that a *smaller* value means the two vectors
/// are closer. Variant names are (de)serialized in snake_case
/// (`cosine`, `euclidean`, `dot_product`).
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DistanceMetric {
    /// Cosine distance, i.e. `1 − cosine similarity` (lower is closer).
    Cosine,
    /// Euclidean (L2) distance.
    Euclidean,
    /// Negative dot product (lower is closer).
    DotProduct,
}
22
23impl DistanceMetric {
24    /// Compute the distance between two vectors under this metric.
25    pub fn compute(&self, a: &[f32], b: &[f32]) -> f32 {
26        match self {
27            DistanceMetric::Cosine => {
28                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
29                let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
30                let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
31                if na == 0.0 || nb == 0.0 {
32                    1.0
33                } else {
34                    1.0 - dot / (na * nb)
35                }
36            }
37            DistanceMetric::Euclidean => a
38                .iter()
39                .zip(b.iter())
40                .map(|(x, y)| (x - y) * (x - y))
41                .sum::<f32>()
42                .sqrt(),
43            DistanceMetric::DotProduct => -a.iter().zip(b.iter()).map(|(x, y)| x * y).sum::<f32>(),
44        }
45    }
46}
47
48// ─── IndexType ───────────────────────────────────────────────────────────────
49
50/// The backing index algorithm for a collection.
51#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
52#[serde(rename_all = "snake_case")]
53pub enum IndexType {
54    /// Approximate nearest-neighbour search via HNSW.
55    HNSW,
56    /// Brute-force flat scan (best for small collections).
57    Flat,
58}
59
60impl IndexType {
61    /// Choose the appropriate index type based on the collection size.
62    ///
63    /// Returns `Flat` when `vector_count < 1 000`, `HNSW` otherwise.
64    pub fn auto_select(vector_count: usize) -> Self {
65        if vector_count < 1_000 {
66            IndexType::Flat
67        } else {
68            IndexType::HNSW
69        }
70    }
71}
72
73// ─── VectorRecord ────────────────────────────────────────────────────────────
74
/// A single vector stored in a collection, with optional text and metadata.
///
/// Construct one with [`VectorRecord::new`] and refine it with the
/// `with_text` / `with_metadata` builder methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VectorRecord {
    /// Globally unique record identifier.
    pub id: Uuid,
    /// Name of the collection this record belongs to.
    pub collection: String,
    /// The raw embedding vector; its length is the record's dimensionality.
    pub vector: Vec<f32>,
    /// Arbitrary JSON metadata attached to this record
    /// (an empty JSON object when created via `VectorRecord::new`).
    pub metadata: serde_json::Value,
    /// Original text from which the vector was generated (if stored).
    pub text: Option<String>,
    /// UTC timestamp of record creation.
    pub created_at: DateTime<Utc>,
}
91
92impl VectorRecord {
93    /// Create a new record with a fresh UUID and no text or metadata.
94    pub fn new(collection: impl Into<String>, vector: Vec<f32>) -> Self {
95        VectorRecord {
96            id: Uuid::new_v4(),
97            collection: collection.into(),
98            vector,
99            metadata: serde_json::json!({}),
100            text: None,
101            created_at: Utc::now(),
102        }
103    }
104
105    /// Builder: attach the original text to this record.
106    pub fn with_text(mut self, text: impl Into<String>) -> Self {
107        self.text = Some(text.into());
108        self
109    }
110
111    /// Builder: attach arbitrary JSON metadata to this record.
112    pub fn with_metadata(mut self, meta: serde_json::Value) -> Self {
113        self.metadata = meta;
114        self
115    }
116
117    /// Return the dimensionality of the stored vector.
118    pub fn dimensions(&self) -> usize {
119        self.vector.len()
120    }
121}
122
123// ─── Collection ──────────────────────────────────────────────────────────────
124
/// Describes a named collection of vectors with a shared dimension and distance metric.
///
/// This is a plain data description; it carries no behaviour of its own in
/// this file.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Collection {
    /// Workspace identifier used for tenant isolation.
    pub workspace_id: String,
    /// Unique collection name.
    pub name: String,
    /// Expected vector dimensionality for all records in this collection.
    pub dimensions: usize,
    /// Distance metric used for similarity search.
    pub distance: DistanceMetric,
    /// Active index algorithm.
    pub index_type: IndexType,
    /// UTC timestamp of collection creation.
    pub created_at: DateTime<Utc>,
    /// Number of vectors currently stored in this collection.
    pub vector_count: u64,
    /// Arbitrary JSON metadata for the collection.
    pub metadata: serde_json::Value,
    /// HNSW `ef_construction` build parameter.
    // NOTE(review): presumably only meaningful when `index_type` is HNSW — confirm.
    pub ef_construction: usize,
    /// HNSW `M` connections parameter.
    // NOTE(review): presumably only meaningful when `index_type` is HNSW — confirm.
    pub m_connections: usize,
}
149
150// ─── SearchResult ────────────────────────────────────────────────────────────
151
/// A single result returned by a nearest-neighbour search.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchResult {
    /// Record identifier.
    pub id: Uuid,
    /// Normalized similarity score in the range `[0.0, 1.0]` (higher is better).
    // NOTE(review): the normalization from a raw distance happens outside this
    // file; the stated range is not enforced by this type.
    pub score: f32,
    /// The raw vector (only set when the query requests it).
    pub vector: Option<Vec<f32>>,
    /// Record metadata (only set when the query requests it).
    // NOTE(review): unlike `vector`, this field is not an `Option`; presumably a
    // placeholder value (e.g. JSON `null`) is stored when metadata was not
    // requested — confirm against the search implementation.
    pub metadata: serde_json::Value,
    /// Original text (if stored with the record).
    pub text: Option<String>,
    /// UTC timestamp when the source record was created.
    pub created_at: DateTime<Utc>,
}
168
/// Additional metrics captured for a search operation.
///
/// `Default` yields all counters set to zero.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct SearchMetrics {
    /// Dimensionality of the input query vector.
    pub query_vector_dims: usize,
    /// Number of raw ANN or hybrid candidates examined before post-processing.
    pub candidates_evaluated: usize,
    /// Number of candidates that survived post-filtering and reranking.
    pub post_filter_count: usize,
    /// End-to-end latency for the search in microseconds.
    pub latency_us: u64,
}
181
/// Full search response, including user-visible results and execution metrics.
///
/// `Default` yields an empty result list with zeroed metrics.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct SearchResponse {
    /// Ordered nearest-neighbour results.
    pub results: Vec<SearchResult>,
    /// Metrics captured while serving the request.
    pub metrics: SearchMetrics,
}
190
191// ─── MetadataFilter ──────────────────────────────────────────────────────────
192
/// A composable DSL for filtering search results by their JSON metadata.
///
/// Leaf variants (`Eq`, `Gt`, `Lt`, `Contains`, `In`, `Exists`) test a single
/// metadata entry; `And`, `Or` and `Not` combine other filters. Evaluation is
/// implemented in `crate::search::filters` (see [`MetadataFilter::matches`]).
///
/// On the wire the variant is selected by an `"op"` field whose value is the
/// snake_case variant name (e.g. `{"op": "eq", "key": …, "value": …}`).
//
// NOTE(review): this enum uses serde's internally tagged representation
// (`tag = "op"`). Serde documents that representation as supporting struct
// variants, unit variants, and newtype variants that contain structs or maps.
// `And`/`Or` wrap a `Vec` and `Not` wraps another `"op"`-tagged filter, so
// these three variants may fail to (de)serialize at runtime — verify with a
// JSON round-trip test before relying on them over the wire.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "op", rename_all = "snake_case")]
pub enum MetadataFilter {
    /// Equality check: `metadata[key] == value`.
    Eq {
        /// JSON object key to compare.
        key: String,
        /// Expected value.
        value: serde_json::Value,
    },
    /// Numeric greater-than check.
    Gt {
        /// Dot-notation JSON path to compare.
        key: String,
        /// Threshold value.
        value: f64,
    },
    /// Numeric less-than check.
    Lt {
        /// Dot-notation JSON path to compare.
        key: String,
        /// Threshold value.
        value: f64,
    },
    /// Case-insensitive substring match for string values.
    Contains {
        /// Dot-notation JSON path to compare.
        key: String,
        /// Substring to search for.
        value: String,
    },
    /// Membership check for scalar JSON values.
    In {
        /// Dot-notation JSON path to compare.
        key: String,
        /// Candidate values.
        values: Vec<serde_json::Value>,
    },
    /// Presence check for a key or nested path.
    Exists {
        /// Dot-notation JSON path whose presence is required.
        key: String,
    },
    /// Logical AND of multiple sub-filters.
    And(Vec<MetadataFilter>),
    /// Logical OR of multiple sub-filters.
    Or(Vec<MetadataFilter>),
    /// Logical NOT of a sub-filter.
    Not(Box<MetadataFilter>),
}
244
245impl MetadataFilter {
246    /// Evaluate this filter against a JSON metadata object.
247    pub fn matches(&self, metadata: &serde_json::Value) -> bool {
248        crate::search::filters::apply_filter(self, metadata)
249    }
250}
251
/// Post-retrieval reranking configuration.
///
/// On the wire the variant is selected by a `"type"` field whose value is the
/// snake_case variant name (e.g. `{"type": "none"}`).
//
// NOTE(review): this enum uses serde's internally tagged representation
// (`tag = "type"`), which serde documents as supporting struct variants, unit
// variants, and newtype variants that contain structs or maps. `Composite`
// wraps a `Vec`, so it may fail to (de)serialize at runtime — verify with a
// JSON round-trip test.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum RerankerConfig {
    /// Disable reranking.
    None,
    /// Promote diversity using maximal marginal relevance.
    Diversity {
        /// Relevance-vs-diversity balance in the range `[0.0, 1.0]`.
        // NOTE(review): the range is not enforced by this type or by the
        // `validate` methods in this file.
        lambda: f32,
        /// Stage weight used by the composite reranker.
        weight: f32,
    },
    /// Boost recently created records.
    Recency {
        /// Strength of the recency boost.
        boost: f32,
        /// Exponential half-life in days.
        half_life_days: f32,
        /// Stage weight used by the composite reranker.
        weight: f32,
    },
    /// Apply multiple rerankers in sequence.
    Composite(Vec<RerankerConfig>),
}
277
278// ─── SearchQuery ─────────────────────────────────────────────────────────────
279
/// A nearest-neighbour search query.
///
/// Call [`SearchQuery::validate`] to check the fields before executing the query.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchQuery {
    /// Target collection name (must be non-empty to pass validation).
    pub collection: String,
    /// Query vector (must be non-empty to pass validation).
    pub vector: Vec<f32>,
    /// Maximum number of results to return (must be greater than zero to pass validation).
    pub top_k: usize,
    /// Optional metadata filter applied after ANN retrieval.
    pub filter: Option<MetadataFilter>,
    /// If `true`, each `SearchResult` will include the raw vector.
    pub include_vectors: bool,
    /// If `true`, each `SearchResult` will include the JSON metadata.
    pub include_metadata: bool,
    /// Override the HNSW `ef_search` parameter for this query.
    pub ef_search: Option<usize>,
    /// Optional post-retrieval reranking strategy.
    pub reranker: Option<RerankerConfig>,
}
300
301impl SearchQuery {
302    /// Validate the query fields, returning an error for invalid configurations.
303    pub fn validate(&self) -> VectorResult<()> {
304        if self.collection.is_empty() {
305            return Err(VectorError::SearchError(
306                "collection name must not be empty".into(),
307            ));
308        }
309        if self.vector.is_empty() {
310            return Err(VectorError::SearchError(
311                "query vector must not be empty".into(),
312            ));
313        }
314        if self.top_k == 0 {
315            return Err(VectorError::SearchError("top_k must be > 0".into()));
316        }
317        if let Some(filter) = &self.filter {
318            crate::search::filters::validate_filter(filter)?;
319        }
320        Ok(())
321    }
322}
323
/// Hybrid vector + keyword search query.
///
/// Call [`HybridQuery::validate`] to check the fields before executing the query.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct HybridQuery {
    /// Target collection name (must be non-empty to pass validation).
    pub collection: String,
    /// Query vector used for ANN retrieval (must be non-empty to pass validation).
    pub vector: Vec<f32>,
    /// Optional keyword query used for FTS5 retrieval.
    pub text: Option<String>,
    /// Maximum number of results to return (must be greater than zero to pass validation).
    pub top_k: usize,
    /// Blend factor where `1.0` is vector-only and `0.0` is keyword-only
    /// (validation requires a value within `0.0..=1.0`).
    pub alpha: f32,
    /// Optional metadata filter applied after fusion.
    pub filter: Option<MetadataFilter>,
    /// If `true`, each `SearchResult` will include the raw vector.
    pub include_vectors: bool,
    /// Optional post-retrieval reranking strategy.
    pub reranker: Option<RerankerConfig>,
}
344
345impl HybridQuery {
346    /// Validate the query fields, returning an error for invalid configurations.
347    pub fn validate(&self) -> VectorResult<()> {
348        if self.collection.is_empty() {
349            return Err(VectorError::SearchError(
350                "collection name must not be empty".into(),
351            ));
352        }
353        if self.vector.is_empty() {
354            return Err(VectorError::SearchError(
355                "query vector must not be empty".into(),
356            ));
357        }
358        if self.top_k == 0 {
359            return Err(VectorError::SearchError("top_k must be > 0".into()));
360        }
361        if !(0.0..=1.0).contains(&self.alpha) {
362            return Err(VectorError::SearchError(
363                "hybrid alpha must be between 0.0 and 1.0".into(),
364            ));
365        }
366        if let Some(filter) = &self.filter {
367            crate::search::filters::validate_filter(filter)?;
368        }
369        Ok(())
370    }
371}
372
/// Persisted storage statistics for a collection.
///
/// `Default` yields zero for both fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct CollectionStats {
    /// Number of vectors stored in the collection.
    pub vector_count: u64,
    /// Estimated on-disk size of the collection in bytes.
    pub size_bytes: u64,
}
381
/// Top-level runtime statistics for the vector engine.
///
/// `Default` yields zero for every counter.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct EngineStats {
    /// Number of known collections.
    pub collection_count: usize,
    /// Total vectors stored across all collections.
    pub total_vectors: u64,
    /// Number of indexes currently loaded in memory.
    pub loaded_indexes: usize,
    /// Number of mmap vector files currently opened.
    pub loaded_mmap_files: usize,
    /// Embedding cache hit counter.
    pub embedding_cache_hits: u64,
    /// Embedding cache miss counter.
    pub embedding_cache_misses: u64,
}