// vector/model.rs

//! Public API types for the vector database (RFC 0002).
//!
//! This module provides the user-facing types for writing vectors to the database
//! and for describing searches over them. The API is designed to be simple and
//! ergonomic while enforcing necessary constraints like dimension matching and
//! metadata schema validation.

use std::collections::HashMap;
use std::time::Duration;

use common::StorageConfig;
use serde::{Deserialize, Serialize};

// Re-export types from serde layer
pub use crate::serde::FieldType;
pub use crate::serde::collection_meta::DistanceMetric;

/// Reserved attribute name under which a vector's embedding is stored
/// (as an `AttributeValue::Vector`).
pub const VECTOR_FIELD_NAME: &str = "vector";

20/// A vector with its identifying ID, embedding values, and metadata.
21///
22/// # Identity
23///
24/// A vector is uniquely identified by its `id` within a namespace. The ID is
25/// a user-provided string (max 64 bytes UTF-8) that serves as an external
26/// identifier. The system internally maps external IDs to compact u64 internal
27/// IDs for efficient storage and indexing.
28///
29/// # Upsert Semantics
30///
31/// Writing a vector with an existing ID replaces the previous vector. The old
32/// vector is marked as deleted and a new internal ID is allocated. This ensures
33/// posting lists and metadata indexes are updated correctly without expensive
34/// read-modify-write cycles.
35///
36/// # Embedding Values
37///
38/// The embedding vector must be provided as an attribute with name "vector"
39/// and type `AttributeValue::Vector`. The length must match the `dimensions`
40/// specified in the `Config` when the database was created.
41#[derive(Debug, Clone)]
42pub struct Vector {
43    /// User-provided unique identifier (max 64 bytes UTF-8).
44    pub id: String,
45
46    /// Attributes including the embedding vector (under "vector" field) and metadata.
47    pub attributes: Vec<Attribute>,
48}
49
50impl Vector {
51    /// Creates a new vector with just the embedding values (no other attributes).
52    ///
53    /// The vector values are stored as an attribute with name [`VECTOR_FIELD_NAME`].
54    pub fn new(id: impl Into<String>, values: Vec<f32>) -> Self {
55        Self {
56            id: id.into(),
57            attributes: vec![Attribute::new(
58                VECTOR_FIELD_NAME,
59                AttributeValue::Vector(values),
60            )],
61        }
62    }
63
64    /// Builder-style construction for vectors with attributes.
65    ///
66    /// The vector values are stored as an attribute with name [`VECTOR_FIELD_NAME`].
67    pub fn builder(id: impl Into<String>, values: Vec<f32>) -> VectorBuilder {
68        VectorBuilder {
69            id: id.into(),
70            attributes: vec![Attribute::new(
71                VECTOR_FIELD_NAME,
72                AttributeValue::Vector(values),
73            )],
74        }
75    }
76
77    pub fn attribute(&self, name: &str) -> Option<&AttributeValue> {
78        self.attributes
79            .iter()
80            .filter(|a| a.name == name)
81            .map(|a| &a.value)
82            .next()
83    }
84}
85
86/// Builder for constructing `Vector` instances with attributes.
87#[derive(Debug)]
88pub struct VectorBuilder {
89    id: String,
90    attributes: Vec<Attribute>,
91}
92
93impl VectorBuilder {
94    /// Adds a metadata attribute to the vector.
95    pub fn attribute(mut self, name: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
96        self.attributes
97            .push(Attribute::new(name.into(), value.into()));
98        self
99    }
100
101    /// Builds the final `Vector`.
102    pub fn build(self) -> Vector {
103        Vector {
104            id: self.id,
105            attributes: self.attributes,
106        }
107    }
108}
109
110/// A metadata attribute attached to a vector.
111///
112/// Attributes enable filtered vector search by allowing queries like
113/// `{category="shoes", price < 100}`. Each attribute has a name and a
114/// typed value.
115#[derive(Debug, Clone, PartialEq)]
116pub struct Attribute {
117    pub name: String,
118    pub value: AttributeValue,
119}
120
121impl Attribute {
122    pub fn new(name: impl Into<String>, value: AttributeValue) -> Self {
123        Self {
124            name: name.into(),
125            value,
126        }
127    }
128}
129
/// Supported attribute value types.
///
/// These types align with the metadata field types defined in the storage
/// layer (CollectionMeta). Type mismatches at write time will return an error.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeValue {
    /// Owned UTF-8 string.
    String(String),
    /// Signed 64-bit integer.
    Int64(i64),
    /// 64-bit floating point number.
    Float64(f64),
    /// Boolean flag.
    Bool(bool),
    /// Sequence of `f32` values; this is how an embedding is carried.
    Vector(Vec<f32>),
}

// Convenience conversions so callers can pass plain Rust values wherever an
// `impl Into<AttributeValue>` is accepted.

/// Wraps an owned string without copying it.
impl From<String> for AttributeValue {
    fn from(s: String) -> Self {
        Self::String(s)
    }
}

/// Copies a borrowed string into an owned `AttributeValue::String`.
impl From<&str> for AttributeValue {
    fn from(s: &str) -> Self {
        Self::String(s.to_owned())
    }
}

/// Wraps an `i64` as `AttributeValue::Int64`.
impl From<i64> for AttributeValue {
    fn from(v: i64) -> Self {
        Self::Int64(v)
    }
}

/// Wraps an `f64` as `AttributeValue::Float64`.
impl From<f64> for AttributeValue {
    fn from(v: f64) -> Self {
        Self::Float64(v)
    }
}

/// Wraps a `bool` as `AttributeValue::Bool`.
impl From<bool> for AttributeValue {
    fn from(v: bool) -> Self {
        Self::Bool(v)
    }
}

174impl From<crate::serde::FieldValue> for AttributeValue {
175    fn from(field: crate::serde::FieldValue) -> Self {
176        match field {
177            crate::serde::FieldValue::String(s) => AttributeValue::String(s),
178            crate::serde::FieldValue::Int64(v) => AttributeValue::Int64(v),
179            crate::serde::FieldValue::Float64(v) => AttributeValue::Float64(v),
180            crate::serde::FieldValue::Bool(v) => AttributeValue::Bool(v),
181            crate::serde::FieldValue::Vector(v) => AttributeValue::Vector(v),
182        }
183    }
184}
185
186/// Configuration for a VectorDb instance.
187#[derive(Debug, Clone, Serialize, Deserialize)]
188pub struct Config {
189    /// Storage backend configuration.
190    ///
191    /// Determines where and how vector data is persisted. See [`StorageConfig`]
192    /// for available options including in-memory and SlateDB backends.
193    pub storage: StorageConfig,
194
195    /// Vector dimensionality (immutable after creation).
196    ///
197    /// All vectors written to this database must have exactly this many
198    /// f32 values. Common dimensions: 384 (MiniLM), 768 (BERT), 1536 (OpenAI).
199    pub dimensions: u16,
200
201    /// Distance metric for similarity computation (immutable after creation).
202    pub distance_metric: DistanceMetric,
203
204    /// How often to flush data to durable storage (in seconds).
205    #[serde(with = "duration_secs")]
206    pub flush_interval: Duration,
207
208    /// Number of vectors in a centroid's posting list that triggers a split.
209    pub split_threshold_vectors: usize,
210
211    /// Number of vectors below which a centroid's posting list triggers a merge.
212    pub merge_threshold_vectors: usize,
213
214    /// Number of neighboring centroids to scan for reassignment candidates after a split.
215    pub split_search_neighbourhood: usize,
216
217    /// The maximum number of centroids that require rebalancing before which backpressure
218    /// is applied by pausing ingestion of new vector writes.
219    pub max_pending_and_running_rebalance_tasks: usize,
220
221    /// After backpressure is applied, ingestion resumes after the total number of centroids
222    /// requiring rebalance drops below this value.
223    pub rebalance_backpressure_resume_threshold: usize,
224
225    /// The maximum number of rebalance tasks that the rebalancer will run concurrently.
226    pub max_rebalance_tasks: usize,
227
228    /// Target number of centroids per chunk.
229    pub chunk_target: u16,
230
231    /// Query-aware dynamic pruning epsilon (ε₂ from SPANN paper).
232    ///
233    /// When set, a posting list is searched only if its centroid's distance
234    /// to the query satisfies `dist(q, c) <= (1 + epsilon) * dist(q, closest)`.
235    /// This reduces query latency by skipping distant posting lists while
236    /// preserving recall.
237    ///
238    /// Typical values: 0.1 to 0.5. `None` disables pruning (all nprobe
239    /// posting lists are searched).
240    pub query_pruning_factor: Option<f32>,
241
242    /// Metadata field schema.
243    ///
244    /// Defines the expected attribute names and types. Writes with unknown
245    /// attribute names or type mismatches will fail. If empty, any attribute
246    /// names are accepted with types inferred from the first write.
247    pub metadata_fields: Vec<MetadataFieldSpec>,
248}
249
250impl Default for Config {
251    fn default() -> Self {
252        Self {
253            storage: StorageConfig::InMemory,
254            dimensions: 0, // Must be set explicitly
255            distance_metric: DistanceMetric::L2,
256            flush_interval: Duration::from_secs(60),
257            split_threshold_vectors: 2_000,
258            merge_threshold_vectors: 500,
259            split_search_neighbourhood: 16,
260            max_pending_and_running_rebalance_tasks: 16,
261            rebalance_backpressure_resume_threshold: 8,
262            max_rebalance_tasks: 8,
263            chunk_target: 4096,
264            query_pruning_factor: None,
265            metadata_fields: Vec::new(),
266        }
267    }
268}
269
270/// Configuration for a read-only vector database client.
271#[derive(Debug, Clone, Serialize, Deserialize)]
272pub struct ReaderConfig {
273    /// Storage backend configuration.
274    pub storage: StorageConfig,
275
276    /// Vector dimensionality.
277    pub dimensions: u16,
278
279    /// Distance metric for similarity computation.
280    pub distance_metric: DistanceMetric,
281
282    /// Query-aware dynamic pruning epsilon (SPANN §3.2).
283    ///
284    /// See [`Config::query_pruning_factor`] for details.
285    pub query_pruning_factor: Option<f32>,
286
287    /// Metadata field schema.
288    pub metadata_fields: Vec<MetadataFieldSpec>,
289}
290
291/// Metadata field specification for schema definition.
292#[derive(Debug, Clone, Serialize, Deserialize)]
293pub struct MetadataFieldSpec {
294    /// Field name.
295    pub name: String,
296
297    /// Expected value type.
298    pub field_type: FieldType,
299
300    /// Whether this field should be indexed for filtering.
301    /// Indexed fields can be used in query predicates.
302    pub indexed: bool,
303}
304
305impl MetadataFieldSpec {
306    pub fn new(name: impl Into<String>, field_type: FieldType, indexed: bool) -> Self {
307        Self {
308            name: name.into(),
309            field_type,
310            indexed,
311        }
312    }
313}
314
315/// A search result with vector, score, and metadata.
316#[derive(Debug, Clone)]
317pub struct SearchResult {
318    /// Similarity score (interpretation depends on distance metric)
319    ///
320    /// - L2: Lower scores = more similar
321    /// - DotProduct: Higher scores = more similar
322    pub score: f32,
323    /// The vector found by search
324    pub vector: Vector,
325}
326
327/// Query specification for vector search.
328///
329/// Constructed using the builder pattern:
330///
331/// ```ignore
332/// let query = Query::new(embedding)
333///     .with_limit(10)
334///     .with_filter(Filter::eq("category", "shoes"));
335/// ```
336#[derive(Debug, Clone)]
337pub struct Query {
338    /// Query vector (required).
339    pub vector: Vec<f32>,
340    /// Maximum number of results to return (default: 10).
341    pub limit: usize,
342    /// Optional metadata filter.
343    pub filter: Option<Filter>,
344}
345
346impl Query {
347    /// Creates a new query with the given vector.
348    pub fn new(vector: Vec<f32>) -> Self {
349        Self {
350            vector,
351            limit: 10,
352            filter: None,
353        }
354    }
355
356    /// Sets the maximum number of results to return.
357    pub fn with_limit(mut self, limit: usize) -> Self {
358        self.limit = limit;
359        self
360    }
361
362    /// Sets the metadata filter.
363    pub fn with_filter(mut self, filter: Filter) -> Self {
364        self.filter = Some(filter);
365        self
366    }
367}
368
369/// Metadata filter for search queries.
370///
371/// Filters are composed using simple predicates and logical operators.
372/// All filters are evaluated against the metadata inverted indexes.
373#[derive(Debug, Clone, PartialEq)]
374pub enum Filter {
375    /// Field equals value.
376    Eq(String, AttributeValue),
377    /// Field not equals value.
378    Neq(String, AttributeValue),
379    /// Field is in set of values.
380    In(String, Vec<AttributeValue>),
381    /// All filters must match (logical AND).
382    And(Vec<Filter>),
383    /// Any filter must match (logical OR).
384    Or(Vec<Filter>),
385}
386
387impl Filter {
388    /// Creates an equality filter.
389    pub fn eq(field: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
390        Filter::Eq(field.into(), value.into())
391    }
392
393    /// Creates a not-equals filter.
394    pub fn neq(field: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
395        Filter::Neq(field.into(), value.into())
396    }
397
398    /// Creates an in-set filter.
399    pub fn in_set(field: impl Into<String>, values: Vec<AttributeValue>) -> Self {
400        Filter::In(field.into(), values)
401    }
402
403    /// Combines filters with logical AND.
404    pub fn and(filters: Vec<Filter>) -> Self {
405        Filter::And(filters)
406    }
407
408    /// Combines filters with logical OR.
409    pub fn or(filters: Vec<Filter>) -> Self {
410        Filter::Or(filters)
411    }
412}
413
414/// Helper to build a metadata map from attributes.
415pub(crate) fn attributes_to_map(attributes: &[Attribute]) -> HashMap<String, AttributeValue> {
416    attributes
417        .iter()
418        .map(|attr| (attr.name.clone(), attr.value.clone()))
419        .collect()
420}
421
422mod duration_secs {
423    use std::time::Duration;
424
425    use serde::{Deserialize, Deserializer, Serializer};
426
427    pub fn serialize<S: Serializer>(d: &Duration, s: S) -> Result<S::Ok, S::Error> {
428        s.serialize_u64(d.as_secs())
429    }
430
431    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Duration, D::Error> {
432        let secs = u64::deserialize(d)?;
433        Ok(Duration::from_secs(secs))
434    }
435}
436
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn should_create_vector_with_builder() {
        // given/when
        let vector = Vector::builder("test-id", vec![1.0, 2.0, 3.0])
            .attribute("category", "test")
            .attribute("count", 42i64)
            .attribute("score", 0.95)
            .attribute("enabled", true)
            .build();

        // then
        assert_eq!(vector.id, "test-id");
        // 5 attributes in insertion order: vector + category + count + score + enabled
        let expected = vec![
            Attribute::new("vector", AttributeValue::Vector(vec![1.0, 2.0, 3.0])),
            Attribute::new("category", AttributeValue::String("test".to_string())),
            Attribute::new("count", AttributeValue::Int64(42)),
            Attribute::new("score", AttributeValue::Float64(0.95)),
            Attribute::new("enabled", AttributeValue::Bool(true)),
        ];
        assert_eq!(vector.attributes, expected);
    }

    #[test]
    fn should_create_vector_without_extra_attributes() {
        // given/when
        let vector = Vector::new("test-id", vec![1.0, 2.0, 3.0]);

        // then
        assert_eq!(vector.id, "test-id");
        // Only the "vector" attribute
        assert_eq!(
            vector.attributes,
            vec![Attribute::new(
                VECTOR_FIELD_NAME,
                AttributeValue::Vector(vec![1.0, 2.0, 3.0])
            )]
        );
    }

    #[test]
    fn should_look_up_attribute_by_name() {
        // given
        let vector = Vector::builder("test-id", vec![1.0])
            .attribute("count", 42i64)
            .build();

        // then
        assert_eq!(vector.attribute("count"), Some(&AttributeValue::Int64(42)));
        assert_eq!(
            vector.attribute(VECTOR_FIELD_NAME),
            Some(&AttributeValue::Vector(vec![1.0]))
        );
        assert_eq!(vector.attribute("missing"), None);
    }

    #[test]
    fn should_convert_str_to_attribute_value() {
        // given
        let value: AttributeValue = "test".into();

        // then
        assert_eq!(value, AttributeValue::String("test".to_string()));
    }

    #[test]
    fn should_convert_int_to_attribute_value() {
        // given
        let value: AttributeValue = 42i64.into();

        // then
        assert_eq!(value, AttributeValue::Int64(42));
    }

    #[test]
    fn should_convert_float_and_bool_to_attribute_value() {
        // given
        let float: AttributeValue = 0.5f64.into();
        let flag: AttributeValue = true.into();

        // then
        assert_eq!(float, AttributeValue::Float64(0.5));
        assert_eq!(flag, AttributeValue::Bool(true));
    }

    #[test]
    fn should_convert_attributes_to_map() {
        // given
        let attributes = vec![
            Attribute::new("name", AttributeValue::String("test".to_string())),
            Attribute::new("count", AttributeValue::Int64(42)),
        ];

        // when
        let map = attributes_to_map(&attributes);

        // then
        assert_eq!(map.len(), 2);
        assert_eq!(
            map.get("name"),
            Some(&AttributeValue::String("test".to_string()))
        );
        assert_eq!(map.get("count"), Some(&AttributeValue::Int64(42)));
    }
}