Skip to main content

vector/serde/
collection_meta.rs

//! CollectionMeta value encoding/decoding.
//!
//! Stores the global schema and configuration for a vector collection.
//!
//! ## Overview
//!
//! `CollectionMeta` is a **singleton record** that defines the structure all vectors
//! in the namespace must conform to. It stores:
//!
//! - **Vector configuration**: dimensionality and distance metric
//! - **Index configuration**: centroids per chunk (CHUNK_TARGET)
//! - **Metadata schema**: field names, types, and indexing flags
//!
//! This record is read on startup to configure the vector index and validate
//! incoming vectors.
//!
//! ## Immutability
//!
//! Several fields are immutable after collection creation:
//!
//! - `dimensions` - All vectors must have the same dimensionality
//! - `distance_metric` - Changing would invalidate all similarity computations
//! - `chunk_target` - Changing would require reorganizing all centroid chunks
//!
//! ## Schema Evolution
//!
//! The `schema_version` field tracks metadata schema changes. Supported evolutions:
//!
//! - **Adding metadata fields**: New fields can be appended. Existing vectors
//!   without the field return null/missing.
//! - **Enabling indexing**: A field's `indexed` flag can change from false to true.
//!   A background job must rebuild `MetadataIndex` entries for existing vectors.
//!
//! Unsupported changes (require creating a new collection):
//!
//! - Changing `dimensions`, `distance_metric`, or `chunk_target`
//! - Removing metadata fields or changing their types
//! - Disabling indexing on a field (index entries would become stale)
39
40use super::{
41    Decode, Encode, EncodingError, FieldType, decode_array, decode_utf8, encode_array, encode_utf8,
42};
43use bytes::{Bytes, BytesMut};
44
45/// Distance metric for vector similarity computation.
46///
47/// The distance metric determines how vector similarity is computed during search.
48/// This is set at collection creation and cannot be changed afterward.
49///
50/// - **L2**: Euclidean distance. Lower values = more similar.
51/// - **DotProduct**: Dot product. Higher values = more similar. Requires normalized vectors.
52#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
53#[repr(u8)]
54pub enum DistanceMetric {
55    /// Euclidean (L2) distance: sqrt(sum((a\[i\] - b\[i\])²))
56    L2 = 0,
57    /// Dot product: sum(a\[i\] * b\[i\])
58    DotProduct = 2,
59}
60
61impl DistanceMetric {
62    pub fn from_byte(byte: u8) -> Result<Self, EncodingError> {
63        match byte {
64            0 => Ok(DistanceMetric::L2),
65            2 => Ok(DistanceMetric::DotProduct),
66            _ => Err(EncodingError {
67                message: format!("Invalid distance metric: {}", byte),
68            }),
69        }
70    }
71}
72
73/// Metadata field specification.
74///
75/// Defines the schema for a single metadata field that can be attached to vectors.
76/// Field names must be unique within a collection.
77///
78/// ## Indexing
79///
80/// When `indexed` is true, a `MetadataIndex` inverted index is maintained for this
81/// field, enabling efficient filtering during hybrid queries (e.g., "find similar
82/// vectors where category='shoes'").
83///
84/// Non-indexed fields are stored in `VectorMeta` but cannot be used in filter
85/// predicates efficiently—they require scanning all candidate vectors.
86#[derive(Debug, Clone, PartialEq, Eq)]
87pub struct MetadataFieldSpec {
88    /// Field name (must be unique within collection).
89    pub name: String,
90    /// Data type for this field's values.
91    pub field_type: FieldType,
92    /// Whether this field has a `MetadataIndex` for filtering.
93    pub indexed: bool,
94}
95
96impl MetadataFieldSpec {
97    pub fn new(name: impl Into<String>, field_type: FieldType, indexed: bool) -> Self {
98        Self {
99            name: name.into(),
100            field_type,
101            indexed,
102        }
103    }
104}
105
106impl Encode for MetadataFieldSpec {
107    fn encode(&self, buf: &mut BytesMut) {
108        encode_utf8(&self.name, buf);
109        buf.extend_from_slice(&[self.field_type as u8]);
110        buf.extend_from_slice(&[if self.indexed { 1 } else { 0 }]);
111    }
112}
113
114impl Decode for MetadataFieldSpec {
115    fn decode(buf: &mut &[u8]) -> Result<Self, EncodingError> {
116        let name = decode_utf8(buf)?;
117
118        if buf.len() < 2 {
119            return Err(EncodingError {
120                message: "Buffer too short for MetadataFieldSpec".to_string(),
121            });
122        }
123
124        let field_type = FieldType::from_byte(buf[0])?;
125        let indexed = buf[1] != 0;
126        *buf = &buf[2..];
127
128        Ok(MetadataFieldSpec {
129            name,
130            field_type,
131            indexed,
132        })
133    }
134}
135
136/// CollectionMeta value storing collection schema and configuration.
137///
138/// This is a singleton record (one per collection) that defines the structure
139/// all vectors must conform to. It is read on startup and cached in memory.
140///
141/// ## Value Layout (little-endian)
142///
143/// ```text
144/// ┌────────────────────────────────────────────────────────────────┐
145/// │  schema_version:    u32                                        │
146/// │  dimensions:        u16                                        │
147/// │  distance_metric:   u8   (0=L2, 2=dot_product)                  │
148/// │  chunk_target:      u16  (centroids per chunk, default 4096)   │
149/// │  metadata_fields:   Array<MetadataFieldSpec>                   │
150/// │                                                                │
151/// │  MetadataFieldSpec                                             │
152/// │  ┌──────────────────────────────────────────────────────────┐  │
153/// │  │  name:       Utf8                                        │  │
154/// │  │  field_type: u8  (0=string, 1=int64, 2=float64, 3=bool)  │  │
155/// │  │  indexed:    u8  (0=false, 1=true)                       │  │
156/// │  └──────────────────────────────────────────────────────────┘  │
157/// └────────────────────────────────────────────────────────────────┘
158/// ```
159#[derive(Debug, Clone, PartialEq, Eq)]
160pub struct CollectionMetaValue {
161    /// Monotonically increasing version for metadata schema changes.
162    ///
163    /// Incremented when metadata fields are added or indexing is enabled.
164    /// Does not change for vector insertions/deletions.
165    pub schema_version: u32,
166
167    /// Fixed dimensionality for all vectors (immutable after creation).
168    ///
169    /// Common values: 384 (MiniLM), 768 (BERT), 1536 (OpenAI ada-002).
170    pub dimensions: u16,
171
172    /// Distance function for similarity computation (immutable after creation).
173    pub distance_metric: DistanceMetric,
174
175    /// Target number of centroids per `CentroidChunk` (immutable after creation).
176    ///
177    /// Default is 4096, which yields ~25 MB per chunk at 1536 dimensions.
178    pub chunk_target: u16,
179
180    /// Schema for fields attached to vectors.
181    ///
182    /// Defines field names, types, and whether each field is indexed for filtering.
183    pub fields: Vec<MetadataFieldSpec>,
184}
185
186impl CollectionMetaValue {
187    pub fn new(
188        dimensions: u16,
189        distance_metric: DistanceMetric,
190        chunk_target: u16,
191        metadata_fields: Vec<MetadataFieldSpec>,
192    ) -> Self {
193        Self {
194            schema_version: 1,
195            dimensions,
196            distance_metric,
197            chunk_target,
198            fields: metadata_fields,
199        }
200    }
201
202    pub fn encode_to_bytes(&self) -> Bytes {
203        let mut buf = BytesMut::new();
204        buf.extend_from_slice(&self.schema_version.to_le_bytes());
205        buf.extend_from_slice(&self.dimensions.to_le_bytes());
206        buf.extend_from_slice(&[self.distance_metric as u8]);
207        buf.extend_from_slice(&self.chunk_target.to_le_bytes());
208        encode_array(&self.fields, &mut buf);
209        buf.freeze()
210    }
211
212    pub fn decode_from_bytes(buf: &[u8]) -> Result<Self, EncodingError> {
213        if buf.len() < 9 {
214            return Err(EncodingError {
215                message: format!(
216                    "Buffer too short for CollectionMetaValue: need at least 9 bytes, have {}",
217                    buf.len()
218                ),
219            });
220        }
221
222        let schema_version = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
223        let dimensions = u16::from_le_bytes([buf[4], buf[5]]);
224        let distance_metric = DistanceMetric::from_byte(buf[6])?;
225        let chunk_target = u16::from_le_bytes([buf[7], buf[8]]);
226
227        let mut slice = &buf[9..];
228        let metadata_fields = decode_array(&mut slice)?;
229
230        Ok(CollectionMetaValue {
231            schema_version,
232            dimensions,
233            distance_metric,
234            chunk_target,
235            fields: metadata_fields,
236        })
237    }
238
239    /// Find a metadata field spec by name.
240    pub fn get_field(&self, name: &str) -> Option<&MetadataFieldSpec> {
241        self.fields.iter().find(|f| f.name == name)
242    }
243
244    /// Returns the names of all indexed fields.
245    pub fn indexed_fields(&self) -> impl Iterator<Item = &str> {
246        self.fields
247            .iter()
248            .filter(|f| f.indexed)
249            .map(|f| f.name.as_str())
250    }
251}
252
/// Round-trip and lookup tests for `CollectionMetaValue`, plus coverage of the
/// decode error paths (short header, unassigned metric byte, truncated field
/// spec).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn should_encode_and_decode_collection_meta() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, true),
                MetadataFieldSpec::new("description", FieldType::String, false),
            ],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

        // then
        assert_eq!(decoded, value);
    }

    #[test]
    fn should_encode_and_decode_with_no_metadata_fields() {
        // given
        let value = CollectionMetaValue::new(384, DistanceMetric::L2, 1024, vec![]);

        // when
        let encoded = value.encode_to_bytes();
        let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

        // then
        assert_eq!(decoded, value);
        assert!(decoded.fields.is_empty());
    }

    #[test]
    fn should_find_field_by_name() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, false),
            ],
        );

        // when / then
        let category = value.get_field("category").unwrap();
        assert_eq!(category.field_type, FieldType::String);
        assert!(category.indexed);

        let price = value.get_field("price").unwrap();
        assert_eq!(price.field_type, FieldType::Float64);
        assert!(!price.indexed);

        assert!(value.get_field("unknown").is_none());
    }

    #[test]
    fn should_list_indexed_fields() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, true),
                MetadataFieldSpec::new("description", FieldType::String, false),
            ],
        );

        // when
        let indexed: Vec<&str> = value.indexed_fields().collect();

        // then
        assert_eq!(indexed, vec!["category", "price"]);
    }

    #[test]
    fn should_preserve_all_distance_metrics() {
        for metric in [DistanceMetric::L2, DistanceMetric::DotProduct] {
            // given
            let value = CollectionMetaValue::new(128, metric, 2048, vec![]);

            // when
            let encoded = value.encode_to_bytes();
            let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

            // then
            assert_eq!(decoded.distance_metric, metric);
        }
    }

    #[test]
    fn should_preserve_all_field_types() {
        for field_type in [
            FieldType::String,
            FieldType::Int64,
            FieldType::Float64,
            FieldType::Bool,
        ] {
            // given
            let value = CollectionMetaValue::new(
                128,
                DistanceMetric::L2,
                2048,
                vec![MetadataFieldSpec::new("test", field_type, true)],
            );

            // when
            let encoded = value.encode_to_bytes();
            let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

            // then
            assert_eq!(decoded.fields[0].field_type, field_type);
        }
    }

    #[test]
    fn should_reject_buffer_shorter_than_header() {
        // given: a valid encoding cut one byte short of the 9-byte header
        let encoded =
            CollectionMetaValue::new(128, DistanceMetric::L2, 2048, vec![]).encode_to_bytes();
        let truncated = &encoded[..8];

        // when / then
        assert!(CollectionMetaValue::decode_from_bytes(truncated).is_err());
        assert!(CollectionMetaValue::decode_from_bytes(&[]).is_err());
    }

    #[test]
    fn should_reject_unknown_distance_metric_byte() {
        // given: a valid encoding whose metric byte (offset 6) is overwritten
        // with 1, a value no variant uses
        let mut raw = CollectionMetaValue::new(128, DistanceMetric::L2, 2048, vec![])
            .encode_to_bytes()
            .to_vec();
        raw[6] = 1;

        // when / then
        assert!(CollectionMetaValue::decode_from_bytes(&raw).is_err());
        assert!(DistanceMetric::from_byte(1).is_err());
        assert!(DistanceMetric::from_byte(255).is_err());
    }

    #[test]
    fn should_reject_truncated_field_spec() {
        // given: an encoded spec with its trailing `indexed` byte removed
        let spec = MetadataFieldSpec::new("category", FieldType::String, true);
        let mut encoded = BytesMut::new();
        spec.encode(&mut encoded);
        let mut truncated: &[u8] = &encoded[..encoded.len() - 1];

        // when
        let result = MetadataFieldSpec::decode(&mut truncated);

        // then
        assert!(result.is_err());
    }
}