claw_vector/types.rs
// types.rs — core domain types: DistanceMetric, IndexType, VectorRecord, Collection,
// SearchResult, SearchMetrics, SearchResponse, MetadataFilter, RerankerConfig,
// SearchQuery, HybridQuery, CollectionStats, and EngineStats.
3use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use uuid::Uuid;
6
7use crate::error::{VectorError, VectorResult};
8
9// ─── DistanceMetric ───────────────────────────────────────────────────────────
10
11/// Distance metric used to compare vectors in a collection.
12#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
13#[serde(rename_all = "snake_case")]
14pub enum DistanceMetric {
15 /// Cosine similarity (1 − cosine → lower is closer).
16 Cosine,
17 /// Euclidean (L2) distance.
18 Euclidean,
19 /// Negative dot product (lower is closer).
20 DotProduct,
21}
22
23impl DistanceMetric {
24 /// Compute the distance between two vectors under this metric.
25 pub fn compute(&self, a: &[f32], b: &[f32]) -> f32 {
26 match self {
27 DistanceMetric::Cosine => {
28 let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
29 let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
30 let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
31 if na == 0.0 || nb == 0.0 {
32 1.0
33 } else {
34 1.0 - dot / (na * nb)
35 }
36 }
37 DistanceMetric::Euclidean => a
38 .iter()
39 .zip(b.iter())
40 .map(|(x, y)| (x - y) * (x - y))
41 .sum::<f32>()
42 .sqrt(),
43 DistanceMetric::DotProduct => -a.iter().zip(b.iter()).map(|(x, y)| x * y).sum::<f32>(),
44 }
45 }
46}
47
48// ─── IndexType ───────────────────────────────────────────────────────────────
49
50/// The backing index algorithm for a collection.
51#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
52#[serde(rename_all = "snake_case")]
53pub enum IndexType {
54 /// Approximate nearest-neighbour search via HNSW.
55 HNSW,
56 /// Brute-force flat scan (best for small collections).
57 Flat,
58}
59
60impl IndexType {
61 /// Choose the appropriate index type based on the collection size.
62 ///
63 /// Returns `Flat` when `vector_count < 1 000`, `HNSW` otherwise.
64 pub fn auto_select(vector_count: usize) -> Self {
65 if vector_count < 1_000 {
66 IndexType::Flat
67 } else {
68 IndexType::HNSW
69 }
70 }
71}
72
73// ─── VectorRecord ────────────────────────────────────────────────────────────
74
75/// A single vector stored in a collection, with optional text and metadata.
76#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
77pub struct VectorRecord {
78 /// Globally unique record identifier.
79 pub id: Uuid,
80 /// Name of the collection this record belongs to.
81 pub collection: String,
82 /// The raw embedding vector.
83 pub vector: Vec<f32>,
84 /// Arbitrary JSON metadata attached to this record.
85 pub metadata: serde_json::Value,
86 /// Original text from which the vector was generated (if stored).
87 pub text: Option<String>,
88 /// UTC timestamp of record creation.
89 pub created_at: DateTime<Utc>,
90}
91
92impl VectorRecord {
93 /// Create a new record with a fresh UUID and no text or metadata.
94 pub fn new(collection: impl Into<String>, vector: Vec<f32>) -> Self {
95 VectorRecord {
96 id: Uuid::new_v4(),
97 collection: collection.into(),
98 vector,
99 metadata: serde_json::json!({}),
100 text: None,
101 created_at: Utc::now(),
102 }
103 }
104
105 /// Builder: attach the original text to this record.
106 pub fn with_text(mut self, text: impl Into<String>) -> Self {
107 self.text = Some(text.into());
108 self
109 }
110
111 /// Builder: attach arbitrary JSON metadata to this record.
112 pub fn with_metadata(mut self, meta: serde_json::Value) -> Self {
113 self.metadata = meta;
114 self
115 }
116
117 /// Return the dimensionality of the stored vector.
118 pub fn dimensions(&self) -> usize {
119 self.vector.len()
120 }
121}
122
123// ─── Collection ──────────────────────────────────────────────────────────────
124
125/// Describes a named collection of vectors with a shared dimension and distance metric.
126#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
127pub struct Collection {
128 /// Workspace identifier used for tenant isolation.
129 pub workspace_id: String,
130 /// Unique collection name.
131 pub name: String,
132 /// Expected vector dimensionality for all records in this collection.
133 pub dimensions: usize,
134 /// Distance metric used for similarity search.
135 pub distance: DistanceMetric,
136 /// Active index algorithm.
137 pub index_type: IndexType,
138 /// UTC timestamp of collection creation.
139 pub created_at: DateTime<Utc>,
140 /// Number of vectors currently stored in this collection.
141 pub vector_count: u64,
142 /// Arbitrary JSON metadata for the collection.
143 pub metadata: serde_json::Value,
144 /// HNSW `ef_construction` build parameter.
145 pub ef_construction: usize,
146 /// HNSW `M` connections parameter.
147 pub m_connections: usize,
148}
149
150// ─── SearchResult ────────────────────────────────────────────────────────────
151
152/// A single result returned by a nearest-neighbour search.
153#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
154pub struct SearchResult {
155 /// Record identifier.
156 pub id: Uuid,
157 /// Normalized similarity score in the range `[0.0, 1.0]` (higher is better).
158 pub score: f32,
159 /// The raw vector (only set when the query requests it).
160 pub vector: Option<Vec<f32>>,
161 /// Record metadata (only set when the query requests it).
162 pub metadata: serde_json::Value,
163 /// Original text (if stored with the record).
164 pub text: Option<String>,
165 /// UTC timestamp when the source record was created.
166 pub created_at: DateTime<Utc>,
167}
168
169/// Additional metrics captured for a search operation.
170#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
171pub struct SearchMetrics {
172 /// Dimensionality of the input query vector.
173 pub query_vector_dims: usize,
174 /// Number of raw ANN or hybrid candidates examined before post-processing.
175 pub candidates_evaluated: usize,
176 /// Number of candidates that survived post-filtering and reranking.
177 pub post_filter_count: usize,
178 /// End-to-end latency for the search in microseconds.
179 pub latency_us: u64,
180}
181
182/// Full search response, including user-visible results and execution metrics.
183#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
184pub struct SearchResponse {
185 /// Ordered nearest-neighbour results.
186 pub results: Vec<SearchResult>,
187 /// Metrics captured while serving the request.
188 pub metrics: SearchMetrics,
189}
190
191// ─── MetadataFilter ──────────────────────────────────────────────────────────
192
193/// A composable DSL for filtering search results by their JSON metadata.
194#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
195#[serde(tag = "op", rename_all = "snake_case")]
196pub enum MetadataFilter {
197 /// Equality check: `metadata[key] == value`.
198 Eq {
199 /// JSON object key to compare.
200 key: String,
201 /// Expected value.
202 value: serde_json::Value,
203 },
204 /// Numeric greater-than check.
205 Gt {
206 /// Dot-notation JSON path to compare.
207 key: String,
208 /// Threshold value.
209 value: f64,
210 },
211 /// Numeric less-than check.
212 Lt {
213 /// Dot-notation JSON path to compare.
214 key: String,
215 /// Threshold value.
216 value: f64,
217 },
218 /// Case-insensitive substring match for string values.
219 Contains {
220 /// Dot-notation JSON path to compare.
221 key: String,
222 /// Substring to search for.
223 value: String,
224 },
225 /// Membership check for scalar JSON values.
226 In {
227 /// Dot-notation JSON path to compare.
228 key: String,
229 /// Candidate values.
230 values: Vec<serde_json::Value>,
231 },
232 /// Presence check for a key or nested path.
233 Exists {
234 /// Dot-notation JSON path whose presence is required.
235 key: String,
236 },
237 /// Logical AND of multiple sub-filters.
238 And(Vec<MetadataFilter>),
239 /// Logical OR of multiple sub-filters.
240 Or(Vec<MetadataFilter>),
241 /// Logical NOT of a sub-filter.
242 Not(Box<MetadataFilter>),
243}
244
245impl MetadataFilter {
246 /// Evaluate this filter against a JSON metadata object.
247 pub fn matches(&self, metadata: &serde_json::Value) -> bool {
248 crate::search::filters::apply_filter(self, metadata)
249 }
250}
251
252/// Post-retrieval reranking configuration.
253#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
254#[serde(tag = "type", rename_all = "snake_case")]
255pub enum RerankerConfig {
256 /// Disable reranking.
257 None,
258 /// Promote diversity using maximal marginal relevance.
259 Diversity {
260 /// Relevance-vs-diversity balance in the range `[0.0, 1.0]`.
261 lambda: f32,
262 /// Stage weight used by the composite reranker.
263 weight: f32,
264 },
265 /// Boost recently created records.
266 Recency {
267 /// Strength of the recency boost.
268 boost: f32,
269 /// Exponential half-life in days.
270 half_life_days: f32,
271 /// Stage weight used by the composite reranker.
272 weight: f32,
273 },
274 /// Apply multiple rerankers in sequence.
275 Composite(Vec<RerankerConfig>),
276}
277
278// ─── SearchQuery ─────────────────────────────────────────────────────────────
279
280/// A nearest-neighbour search query.
281#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
282pub struct SearchQuery {
283 /// Target collection name.
284 pub collection: String,
285 /// Query vector.
286 pub vector: Vec<f32>,
287 /// Maximum number of results to return.
288 pub top_k: usize,
289 /// Optional metadata filter applied after ANN retrieval.
290 pub filter: Option<MetadataFilter>,
291 /// If `true`, each `SearchResult` will include the raw vector.
292 pub include_vectors: bool,
293 /// If `true`, each `SearchResult` will include the JSON metadata.
294 pub include_metadata: bool,
295 /// Override the HNSW `ef_search` parameter for this query.
296 pub ef_search: Option<usize>,
297 /// Optional post-retrieval reranking strategy.
298 pub reranker: Option<RerankerConfig>,
299}
300
301impl SearchQuery {
302 /// Validate the query fields, returning an error for invalid configurations.
303 pub fn validate(&self) -> VectorResult<()> {
304 if self.collection.is_empty() {
305 return Err(VectorError::SearchError(
306 "collection name must not be empty".into(),
307 ));
308 }
309 if self.vector.is_empty() {
310 return Err(VectorError::SearchError(
311 "query vector must not be empty".into(),
312 ));
313 }
314 if self.top_k == 0 {
315 return Err(VectorError::SearchError("top_k must be > 0".into()));
316 }
317 if let Some(filter) = &self.filter {
318 crate::search::filters::validate_filter(filter)?;
319 }
320 Ok(())
321 }
322}
323
324/// Hybrid vector + keyword search query.
325#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
326pub struct HybridQuery {
327 /// Target collection name.
328 pub collection: String,
329 /// Query vector used for ANN retrieval.
330 pub vector: Vec<f32>,
331 /// Optional keyword query used for FTS5 retrieval.
332 pub text: Option<String>,
333 /// Maximum number of results to return.
334 pub top_k: usize,
335 /// Blend factor where `1.0` is vector-only and `0.0` is keyword-only.
336 pub alpha: f32,
337 /// Optional metadata filter applied after fusion.
338 pub filter: Option<MetadataFilter>,
339 /// If `true`, each `SearchResult` will include the raw vector.
340 pub include_vectors: bool,
341 /// Optional post-retrieval reranking strategy.
342 pub reranker: Option<RerankerConfig>,
343}
344
345impl HybridQuery {
346 /// Validate the query fields, returning an error for invalid configurations.
347 pub fn validate(&self) -> VectorResult<()> {
348 if self.collection.is_empty() {
349 return Err(VectorError::SearchError(
350 "collection name must not be empty".into(),
351 ));
352 }
353 if self.vector.is_empty() {
354 return Err(VectorError::SearchError(
355 "query vector must not be empty".into(),
356 ));
357 }
358 if self.top_k == 0 {
359 return Err(VectorError::SearchError("top_k must be > 0".into()));
360 }
361 if !(0.0..=1.0).contains(&self.alpha) {
362 return Err(VectorError::SearchError(
363 "hybrid alpha must be between 0.0 and 1.0".into(),
364 ));
365 }
366 if let Some(filter) = &self.filter {
367 crate::search::filters::validate_filter(filter)?;
368 }
369 Ok(())
370 }
371}
372
373/// Persisted storage statistics for a collection.
374#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
375pub struct CollectionStats {
376 /// Number of vectors stored in the collection.
377 pub vector_count: u64,
378 /// Estimated on-disk size of the collection in bytes.
379 pub size_bytes: u64,
380}
381
382/// Top-level runtime statistics for the vector engine.
383#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
384pub struct EngineStats {
385 /// Number of known collections.
386 pub collection_count: usize,
387 /// Total vectors stored across all collections.
388 pub total_vectors: u64,
389 /// Number of indexes currently loaded in memory.
390 pub loaded_indexes: usize,
391 /// Number of mmap vector files currently opened.
392 pub loaded_mmap_files: usize,
393 /// Embedding cache hit counter.
394 pub embedding_cache_hits: u64,
395 /// Embedding cache miss counter.
396 pub embedding_cache_misses: u64,
397}