Skip to main content

vector/
model.rs

1//! Public API types for the vector database (RFC 0002).
2//!
3//! This module provides the user-facing types for writing vectors to the database.
4//! The API is designed to be simple and ergonomic while enforcing necessary constraints
5//! like dimension matching and metadata schema validation.
6
7use std::collections::HashMap;
8use std::time::Duration;
9
10use common::StorageConfig;
11use serde::{Deserialize, Serialize};
12
13// Re-export types from serde layer
14pub use crate::serde::FieldType;
15pub use crate::serde::collection_meta::DistanceMetric;
16
/// Attribute name under which a vector's embedding is stored.
///
/// The name is reserved: the attribute carrying it holds the embedding as an
/// `AttributeValue::Vector`.
pub const VECTOR_FIELD_NAME: &str = "vector";
19
20/// A vector with its identifying ID, embedding values, and metadata.
21///
22/// # Identity
23///
24/// A vector is uniquely identified by its `id` within a namespace. The ID is
25/// a user-provided string (max 64 bytes UTF-8) that serves as an external
26/// identifier. The system internally maps external IDs to compact u64 internal
27/// IDs for efficient storage and indexing.
28///
29/// # Upsert Semantics
30///
31/// Writing a vector with an existing ID replaces the previous vector. The old
32/// vector is marked as deleted and a new internal ID is allocated. This ensures
33/// posting lists and metadata indexes are updated correctly without expensive
34/// read-modify-write cycles.
35///
36/// # Embedding Values
37///
38/// The embedding vector must be provided as an attribute with name "vector"
39/// and type `AttributeValue::Vector`. The length must match the `dimensions`
40/// specified in the `Config` when the database was created.
41#[derive(Debug, Clone)]
42pub struct Vector {
43    /// User-provided unique identifier (max 64 bytes UTF-8).
44    pub id: String,
45
46    /// Attributes including the embedding vector (under "vector" field) and metadata.
47    pub attributes: Vec<Attribute>,
48}
49
50impl Vector {
51    /// Creates a new vector with just the embedding values (no other attributes).
52    ///
53    /// The vector values are stored as an attribute with name [`VECTOR_FIELD_NAME`].
54    pub fn new(id: impl Into<String>, values: Vec<f32>) -> Self {
55        Self {
56            id: id.into(),
57            attributes: vec![Attribute::new(
58                VECTOR_FIELD_NAME,
59                AttributeValue::Vector(values),
60            )],
61        }
62    }
63
64    /// Builder-style construction for vectors with attributes.
65    ///
66    /// The vector values are stored as an attribute with name [`VECTOR_FIELD_NAME`].
67    pub fn builder(id: impl Into<String>, values: Vec<f32>) -> VectorBuilder {
68        VectorBuilder {
69            id: id.into(),
70            attributes: vec![Attribute::new(
71                VECTOR_FIELD_NAME,
72                AttributeValue::Vector(values),
73            )],
74        }
75    }
76
77    pub fn attribute(&self, name: &str) -> Option<&AttributeValue> {
78        self.attributes
79            .iter()
80            .filter(|a| a.name == name)
81            .map(|a| &a.value)
82            .next()
83    }
84}
85
86/// Builder for constructing `Vector` instances with attributes.
87#[derive(Debug)]
88pub struct VectorBuilder {
89    id: String,
90    attributes: Vec<Attribute>,
91}
92
93impl VectorBuilder {
94    /// Adds a metadata attribute to the vector.
95    pub fn attribute(mut self, name: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
96        self.attributes
97            .push(Attribute::new(name.into(), value.into()));
98        self
99    }
100
101    /// Builds the final `Vector`.
102    pub fn build(self) -> Vector {
103        Vector {
104            id: self.id,
105            attributes: self.attributes,
106        }
107    }
108}
109
110/// A metadata attribute attached to a vector.
111///
112/// Attributes enable filtered vector search by allowing queries like
113/// `{category="shoes", price < 100}`. Each attribute has a name and a
114/// typed value.
115#[derive(Debug, Clone, PartialEq)]
116pub struct Attribute {
117    pub name: String,
118    pub value: AttributeValue,
119}
120
121impl Attribute {
122    pub fn new(name: impl Into<String>, value: AttributeValue) -> Self {
123        Self {
124            name: name.into(),
125            value,
126        }
127    }
128}
129
/// The value types an attribute can hold.
///
/// The variants line up with the metadata field types defined in the storage
/// layer (CollectionMeta). A type mismatch at write time is reported as an
/// error.
#[derive(Debug, Clone, PartialEq)]
pub enum AttributeValue {
    String(String),
    Int64(i64),
    Float64(f64),
    Bool(bool),
    Vector(Vec<f32>),
}

// Convenience conversions from plain Rust values into AttributeValue.

/// Wraps an owned string without copying it.
impl From<String> for AttributeValue {
    fn from(text: String) -> Self {
        Self::String(text)
    }
}

/// Copies a borrowed string into an owned `String` variant.
impl From<&str> for AttributeValue {
    fn from(text: &str) -> Self {
        Self::String(text.to_owned())
    }
}

/// Wraps a signed 64-bit integer.
impl From<i64> for AttributeValue {
    fn from(number: i64) -> Self {
        Self::Int64(number)
    }
}

/// Wraps a 64-bit float.
impl From<f64> for AttributeValue {
    fn from(number: f64) -> Self {
        Self::Float64(number)
    }
}

/// Wraps a boolean.
impl From<bool> for AttributeValue {
    fn from(flag: bool) -> Self {
        Self::Bool(flag)
    }
}
173
174impl From<crate::serde::FieldValue> for AttributeValue {
175    fn from(field: crate::serde::FieldValue) -> Self {
176        match field {
177            crate::serde::FieldValue::String(s) => AttributeValue::String(s),
178            crate::serde::FieldValue::Int64(v) => AttributeValue::Int64(v),
179            crate::serde::FieldValue::Float64(v) => AttributeValue::Float64(v),
180            crate::serde::FieldValue::Bool(v) => AttributeValue::Bool(v),
181            crate::serde::FieldValue::Vector(v) => AttributeValue::Vector(v),
182        }
183    }
184}
185
186/// Configuration for a VectorDb instance.
187#[derive(Debug, Clone, Serialize, Deserialize)]
188pub struct Config {
189    /// Storage backend configuration.
190    ///
191    /// Determines where and how vector data is persisted. See [`StorageConfig`]
192    /// for available options including in-memory and SlateDB backends.
193    pub storage: StorageConfig,
194
195    /// Vector dimensionality (immutable after creation).
196    ///
197    /// All vectors written to this database must have exactly this many
198    /// f32 values. Common dimensions: 384 (MiniLM), 768 (BERT), 1536 (OpenAI).
199    pub dimensions: u16,
200
201    /// Distance metric for similarity computation (immutable after creation).
202    pub distance_metric: DistanceMetric,
203
204    /// How often to flush data to durable storage (in seconds).
205    #[serde(with = "duration_secs")]
206    pub flush_interval: Duration,
207
208    /// Number of vectors in a centroid's posting list that triggers a split.
209    pub split_threshold_vectors: usize,
210
211    /// Number of vectors below which a centroid's posting list triggers a merge.
212    pub merge_threshold_vectors: usize,
213
214    /// Number of neighboring centroids to scan for reassignment candidates after a split.
215    pub split_search_neighbourhood: usize,
216
217    /// The maximum number of centroids that require rebalancing before which backpressure
218    /// is applied by pausing ingestion of new vector writes.
219    pub max_pending_and_running_rebalance_tasks: usize,
220
221    /// After backpressure is applied, ingestion resumes after the total number of centroids
222    /// requiring rebalance drops below this value.
223    pub rebalance_backpressure_resume_threshold: usize,
224
225    /// The maximum number of rebalance tasks that the rebalancer will run concurrently.
226    pub max_rebalance_tasks: usize,
227
228    /// Target number of centroids per chunk.
229    pub chunk_target: u16,
230
231    /// Query-aware dynamic pruning epsilon (ε₂ from SPANN paper).
232    ///
233    /// When set, a posting list is searched only if its centroid's distance
234    /// to the query satisfies `dist(q, c) <= (1 + epsilon) * dist(q, closest)`.
235    /// This reduces query latency by skipping distant posting lists while
236    /// preserving recall.
237    ///
238    /// Typical values: 0.1 to 0.5. `None` disables pruning (all nprobe
239    /// posting lists are searched).
240    pub query_pruning_factor: Option<f32>,
241
242    /// Metadata field schema.
243    ///
244    /// Defines the expected attribute names and types. Writes with unknown
245    /// attribute names or type mismatches will fail. If empty, any attribute
246    /// names are accepted with types inferred from the first write.
247    pub metadata_fields: Vec<MetadataFieldSpec>,
248}
249
250impl Default for Config {
251    fn default() -> Self {
252        Self {
253            storage: StorageConfig::InMemory,
254            dimensions: 0, // Must be set explicitly
255            distance_metric: DistanceMetric::L2,
256            flush_interval: Duration::from_secs(60),
257            split_threshold_vectors: 2_000,
258            merge_threshold_vectors: 500,
259            split_search_neighbourhood: 16,
260            max_pending_and_running_rebalance_tasks: 16,
261            rebalance_backpressure_resume_threshold: 8,
262            max_rebalance_tasks: 8,
263            chunk_target: 4096,
264            query_pruning_factor: None,
265            metadata_fields: Vec::new(),
266        }
267    }
268}
269
270/// Configuration for a read-only vector database client.
271#[derive(Debug, Clone, Serialize, Deserialize)]
272pub struct ReaderConfig {
273    /// Storage backend configuration.
274    pub storage: StorageConfig,
275
276    /// Vector dimensionality.
277    pub dimensions: u16,
278
279    /// Distance metric for similarity computation.
280    pub distance_metric: DistanceMetric,
281
282    /// Query-aware dynamic pruning epsilon (SPANN §3.2).
283    ///
284    /// See [`Config::query_pruning_factor`] for details.
285    pub query_pruning_factor: Option<f32>,
286
287    /// Metadata field schema.
288    pub metadata_fields: Vec<MetadataFieldSpec>,
289}
290
291/// Metadata field specification for schema definition.
292#[derive(Debug, Clone, Serialize, Deserialize)]
293pub struct MetadataFieldSpec {
294    /// Field name.
295    pub name: String,
296
297    /// Expected value type.
298    pub field_type: FieldType,
299
300    /// Whether this field should be indexed for filtering.
301    /// Indexed fields can be used in query predicates.
302    pub indexed: bool,
303}
304
305impl MetadataFieldSpec {
306    pub fn new(name: impl Into<String>, field_type: FieldType, indexed: bool) -> Self {
307        Self {
308            name: name.into(),
309            field_type,
310            indexed,
311        }
312    }
313}
314
315/// A search result with vector, score, and metadata.
316#[derive(Debug, Clone)]
317pub struct SearchResult {
318    /// Similarity score (interpretation depends on distance metric)
319    ///
320    /// - L2: Lower scores = more similar
321    /// - DotProduct: Higher scores = more similar
322    pub score: f32,
323    /// The vector found by search
324    pub vector: Vector,
325}
326
/// Chooses which fields are returned with each query result.
///
/// Restricting the returned attributes reduces data transfer when only
/// specific fields are needed.
#[derive(Debug, Clone, PartialEq)]
pub enum FieldSelection {
    /// Return every field (the vector and all metadata).
    All,
    /// Return no fields (only IDs and scores).
    None,
    /// Return only the named fields (e.g., `["category", "price"]`).
    /// Name `"vector"` to include the embedding vector.
    Fields(Vec<String>),
}

/// `true` selects all fields, `false` selects none.
impl From<bool> for FieldSelection {
    fn from(include: bool) -> Self {
        match include {
            true => Self::All,
            false => Self::None,
        }
    }
}

/// Selects the named fields, copying each borrowed name into a `String`.
impl From<Vec<&str>> for FieldSelection {
    fn from(fields: Vec<&str>) -> Self {
        let names: Vec<String> = fields.into_iter().map(str::to_owned).collect();
        Self::Fields(names)
    }
}

/// Selects the named fields, taking ownership of the names as given.
impl From<Vec<String>> for FieldSelection {
    fn from(fields: Vec<String>) -> Self {
        Self::Fields(fields)
    }
}
363
364/// Query specification for vector search.
365///
366/// Constructed using the builder pattern:
367///
368/// ```ignore
369/// let query = Query::new(embedding)
370///     .with_limit(10)
371///     .with_filter(Filter::eq("category", "shoes"))
372///     .with_fields(vec!["category", "price"]);
373/// ```
374#[derive(Debug, Clone)]
375pub struct Query {
376    /// Query vector (required).
377    pub vector: Vec<f32>,
378    /// Maximum number of results to return (default: 10).
379    pub limit: usize,
380    /// Optional metadata filter.
381    pub filter: Option<Filter>,
382    /// Which fields to include in results (default: All).
383    pub include_fields: FieldSelection,
384}
385
386impl Query {
387    /// Creates a new query with the given vector.
388    pub fn new(vector: Vec<f32>) -> Self {
389        Self {
390            vector,
391            limit: 10,
392            filter: None,
393            include_fields: FieldSelection::All,
394        }
395    }
396
397    /// Sets the maximum number of results to return.
398    pub fn with_limit(mut self, limit: usize) -> Self {
399        self.limit = limit;
400        self
401    }
402
403    /// Sets the metadata filter.
404    pub fn with_filter(mut self, filter: Filter) -> Self {
405        self.filter = Some(filter);
406        self
407    }
408
409    /// Controls which fields are included in results.
410    ///
411    /// Accepts `true`/`false` for all/none, or `Vec<&str>`/`Vec<String>` for specific fields.
412    pub fn with_fields(mut self, fields: impl Into<FieldSelection>) -> Self {
413        self.include_fields = fields.into();
414        self
415    }
416}
417
418/// Metadata filter for search queries.
419///
420/// Filters are composed using simple predicates and logical operators.
421/// All filters are evaluated against the metadata inverted indexes.
422#[derive(Debug, Clone, PartialEq)]
423pub enum Filter {
424    /// Field equals value.
425    Eq(String, AttributeValue),
426    /// Field not equals value.
427    Neq(String, AttributeValue),
428    /// Field is in set of values.
429    In(String, Vec<AttributeValue>),
430    /// All filters must match (logical AND).
431    And(Vec<Filter>),
432    /// Any filter must match (logical OR).
433    Or(Vec<Filter>),
434}
435
436impl Filter {
437    /// Creates an equality filter.
438    pub fn eq(field: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
439        Filter::Eq(field.into(), value.into())
440    }
441
442    /// Creates a not-equals filter.
443    pub fn neq(field: impl Into<String>, value: impl Into<AttributeValue>) -> Self {
444        Filter::Neq(field.into(), value.into())
445    }
446
447    /// Creates an in-set filter.
448    pub fn in_set(field: impl Into<String>, values: Vec<AttributeValue>) -> Self {
449        Filter::In(field.into(), values)
450    }
451
452    /// Combines filters with logical AND.
453    pub fn and(filters: Vec<Filter>) -> Self {
454        Filter::And(filters)
455    }
456
457    /// Combines filters with logical OR.
458    pub fn or(filters: Vec<Filter>) -> Self {
459        Filter::Or(filters)
460    }
461}
462
463/// Helper to build a metadata map from attributes.
464pub(crate) fn attributes_to_map(attributes: &[Attribute]) -> HashMap<String, AttributeValue> {
465    attributes
466        .iter()
467        .map(|attr| (attr.name.clone(), attr.value.clone()))
468        .collect()
469}
470
471mod duration_secs {
472    use std::time::Duration;
473
474    use serde::{Deserialize, Deserializer, Serializer};
475
476    pub fn serialize<S: Serializer>(d: &Duration, s: S) -> Result<S::Ok, S::Error> {
477        s.serialize_u64(d.as_secs())
478    }
479
480    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Duration, D::Error> {
481        let secs = u64::deserialize(d)?;
482        Ok(Duration::from_secs(secs))
483    }
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the string-valued expectations used below.
    fn text(value: &str) -> AttributeValue {
        AttributeValue::String(value.to_string())
    }

    #[test]
    fn should_create_vector_with_builder() {
        // given/when
        let built = Vector::builder("test-id", vec![1.0, 2.0, 3.0])
            .attribute("category", "test")
            .attribute("count", 42i64)
            .attribute("score", 0.95)
            .attribute("enabled", true)
            .build();

        // then
        assert_eq!(built.id, "test-id");
        // the embedding plus the four metadata attributes added above
        assert_eq!(built.attributes.len(), 5);

        // the embedding always comes first
        let embedding = &built.attributes[0];
        assert_eq!(embedding.name, "vector");
        assert_eq!(embedding.value, AttributeValue::Vector(vec![1.0, 2.0, 3.0]));

        // metadata follows in insertion order
        let category = &built.attributes[1];
        assert_eq!(category.name, "category");
        assert_eq!(category.value, text("test"));
    }

    #[test]
    fn should_create_vector_without_extra_attributes() {
        // given/when
        let plain = Vector::new("test-id", vec![1.0, 2.0, 3.0]);

        // then
        assert_eq!(plain.id, "test-id");
        // nothing but the embedding
        assert_eq!(plain.attributes.len(), 1);
        let embedding = &plain.attributes[0];
        assert_eq!(embedding.name, "vector");
        assert_eq!(embedding.value, AttributeValue::Vector(vec![1.0, 2.0, 3.0]));
    }

    #[test]
    fn should_convert_str_to_attribute_value() {
        // given
        let converted: AttributeValue = "test".into();

        // then
        assert_eq!(converted, text("test"));
    }

    #[test]
    fn should_convert_int_to_attribute_value() {
        // given
        let converted: AttributeValue = 42i64.into();

        // then
        assert_eq!(converted, AttributeValue::Int64(42));
    }

    #[test]
    fn should_convert_attributes_to_map() {
        // given
        let attributes = vec![
            Attribute::new("name", text("test")),
            Attribute::new("count", AttributeValue::Int64(42)),
        ];

        // when
        let by_name = attributes_to_map(&attributes);

        // then
        assert_eq!(by_name.len(), 2);
        assert_eq!(by_name.get("name"), Some(&text("test")));
        assert_eq!(by_name.get("count"), Some(&AttributeValue::Int64(42)));
    }
}