vector/serde/collection_meta.rs
1//! CollectionMeta value encoding/decoding.
2//!
3//! Stores the global schema and configuration for a vector collection.
4//!
5//! ## Overview
6//!
7//! `CollectionMeta` is a **singleton record** that defines the structure all vectors
8//! in the namespace must conform to. It stores:
9//!
10//! - **Vector configuration**: dimensionality and distance metric
11//! - **Index configuration**: centroids per chunk (CHUNK_TARGET)
12//! - **Metadata schema**: field names, types, and indexing flags
13//!
14//! This record is read on startup to configure the vector index and validate
15//! incoming vectors.
16//!
17//! ## Immutability
18//!
19//! Several fields are immutable after collection creation:
20//!
21//! - `dimensions` - All vectors must have the same dimensionality
22//! - `distance_metric` - Changing would invalidate all similarity computations
23//! - `chunk_target` - Changing would require reorganizing all centroid chunks
24//!
25//! ## Schema Evolution
26//!
27//! The `schema_version` field tracks metadata schema changes. Supported evolutions:
28//!
29//! - **Adding metadata fields**: New fields can be appended. Existing vectors
30//! without the field return null/missing.
31//! - **Enabling indexing**: A field's `indexed` flag can change from false to true.
32//! A background job must rebuild `MetadataIndex` entries for existing vectors.
33//!
34//! Unsupported changes (require creating a new collection):
35//!
36//! - Changing `dimensions`, `distance_metric`, or `chunk_target`
37//! - Removing metadata fields or changing their types
38//! - Disabling indexing on a field (index entries would become stale)
39
40use super::{
41 Decode, Encode, EncodingError, FieldType, decode_array, decode_utf8, encode_array, encode_utf8,
42};
43use bytes::{Bytes, BytesMut};
44
45/// Distance metric for vector similarity computation.
46///
47/// The distance metric determines how vector similarity is computed during search.
48/// This is set at collection creation and cannot be changed afterward.
49///
50/// - **L2**: Euclidean distance. Lower values = more similar.
51/// - **DotProduct**: Dot product. Higher values = more similar. Requires normalized vectors.
52#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
53#[repr(u8)]
54pub enum DistanceMetric {
55 /// Euclidean (L2) distance: sqrt(sum((a\[i\] - b\[i\])²))
56 L2 = 0,
57 /// Dot product: sum(a\[i\] * b\[i\])
58 DotProduct = 2,
59}
60
61impl DistanceMetric {
62 pub fn from_byte(byte: u8) -> Result<Self, EncodingError> {
63 match byte {
64 0 => Ok(DistanceMetric::L2),
65 2 => Ok(DistanceMetric::DotProduct),
66 _ => Err(EncodingError {
67 message: format!("Invalid distance metric: {}", byte),
68 }),
69 }
70 }
71}
72
73/// Metadata field specification.
74///
75/// Defines the schema for a single metadata field that can be attached to vectors.
76/// Field names must be unique within a collection.
77///
78/// ## Indexing
79///
80/// When `indexed` is true, a `MetadataIndex` inverted index is maintained for this
81/// field, enabling efficient filtering during hybrid queries (e.g., "find similar
82/// vectors where category='shoes'").
83///
84/// Non-indexed fields are stored in `VectorMeta` but cannot be used in filter
85/// predicates efficiently—they require scanning all candidate vectors.
86#[derive(Debug, Clone, PartialEq, Eq)]
87pub struct MetadataFieldSpec {
88 /// Field name (must be unique within collection).
89 pub name: String,
90 /// Data type for this field's values.
91 pub field_type: FieldType,
92 /// Whether this field has a `MetadataIndex` for filtering.
93 pub indexed: bool,
94}
95
96impl MetadataFieldSpec {
97 pub fn new(name: impl Into<String>, field_type: FieldType, indexed: bool) -> Self {
98 Self {
99 name: name.into(),
100 field_type,
101 indexed,
102 }
103 }
104}
105
106impl Encode for MetadataFieldSpec {
107 fn encode(&self, buf: &mut BytesMut) {
108 encode_utf8(&self.name, buf);
109 buf.extend_from_slice(&[self.field_type as u8]);
110 buf.extend_from_slice(&[if self.indexed { 1 } else { 0 }]);
111 }
112}
113
114impl Decode for MetadataFieldSpec {
115 fn decode(buf: &mut &[u8]) -> Result<Self, EncodingError> {
116 let name = decode_utf8(buf)?;
117
118 if buf.len() < 2 {
119 return Err(EncodingError {
120 message: "Buffer too short for MetadataFieldSpec".to_string(),
121 });
122 }
123
124 let field_type = FieldType::from_byte(buf[0])?;
125 let indexed = buf[1] != 0;
126 *buf = &buf[2..];
127
128 Ok(MetadataFieldSpec {
129 name,
130 field_type,
131 indexed,
132 })
133 }
134}
135
136/// CollectionMeta value storing collection schema and configuration.
137///
138/// This is a singleton record (one per collection) that defines the structure
139/// all vectors must conform to. It is read on startup and cached in memory.
140///
141/// ## Value Layout (little-endian)
142///
143/// ```text
144/// ┌────────────────────────────────────────────────────────────────┐
145/// │ schema_version: u32 │
146/// │ dimensions: u16 │
147/// │ distance_metric: u8 (0=L2, 2=dot_product) │
148/// │ chunk_target: u16 (centroids per chunk, default 4096) │
149/// │ metadata_fields: Array<MetadataFieldSpec> │
150/// │ │
151/// │ MetadataFieldSpec │
152/// │ ┌──────────────────────────────────────────────────────────┐ │
153/// │ │ name: Utf8 │ │
154/// │ │ field_type: u8 (0=string, 1=int64, 2=float64, 3=bool) │ │
155/// │ │ indexed: u8 (0=false, 1=true) │ │
156/// │ └──────────────────────────────────────────────────────────┘ │
157/// └────────────────────────────────────────────────────────────────┘
158/// ```
159#[derive(Debug, Clone, PartialEq, Eq)]
160pub struct CollectionMetaValue {
161 /// Monotonically increasing version for metadata schema changes.
162 ///
163 /// Incremented when metadata fields are added or indexing is enabled.
164 /// Does not change for vector insertions/deletions.
165 pub schema_version: u32,
166
167 /// Fixed dimensionality for all vectors (immutable after creation).
168 ///
169 /// Common values: 384 (MiniLM), 768 (BERT), 1536 (OpenAI ada-002).
170 pub dimensions: u16,
171
172 /// Distance function for similarity computation (immutable after creation).
173 pub distance_metric: DistanceMetric,
174
175 /// Target number of centroids per `CentroidChunk` (immutable after creation).
176 ///
177 /// Default is 4096, which yields ~25 MB per chunk at 1536 dimensions.
178 pub chunk_target: u16,
179
180 /// Schema for fields attached to vectors.
181 ///
182 /// Defines field names, types, and whether each field is indexed for filtering.
183 pub fields: Vec<MetadataFieldSpec>,
184}
185
186impl CollectionMetaValue {
187 pub fn new(
188 dimensions: u16,
189 distance_metric: DistanceMetric,
190 chunk_target: u16,
191 metadata_fields: Vec<MetadataFieldSpec>,
192 ) -> Self {
193 Self {
194 schema_version: 1,
195 dimensions,
196 distance_metric,
197 chunk_target,
198 fields: metadata_fields,
199 }
200 }
201
202 pub fn encode_to_bytes(&self) -> Bytes {
203 let mut buf = BytesMut::new();
204 buf.extend_from_slice(&self.schema_version.to_le_bytes());
205 buf.extend_from_slice(&self.dimensions.to_le_bytes());
206 buf.extend_from_slice(&[self.distance_metric as u8]);
207 buf.extend_from_slice(&self.chunk_target.to_le_bytes());
208 encode_array(&self.fields, &mut buf);
209 buf.freeze()
210 }
211
212 pub fn decode_from_bytes(buf: &[u8]) -> Result<Self, EncodingError> {
213 if buf.len() < 9 {
214 return Err(EncodingError {
215 message: format!(
216 "Buffer too short for CollectionMetaValue: need at least 9 bytes, have {}",
217 buf.len()
218 ),
219 });
220 }
221
222 let schema_version = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
223 let dimensions = u16::from_le_bytes([buf[4], buf[5]]);
224 let distance_metric = DistanceMetric::from_byte(buf[6])?;
225 let chunk_target = u16::from_le_bytes([buf[7], buf[8]]);
226
227 let mut slice = &buf[9..];
228 let metadata_fields = decode_array(&mut slice)?;
229
230 Ok(CollectionMetaValue {
231 schema_version,
232 dimensions,
233 distance_metric,
234 chunk_target,
235 fields: metadata_fields,
236 })
237 }
238
239 /// Find a metadata field spec by name.
240 pub fn get_field(&self, name: &str) -> Option<&MetadataFieldSpec> {
241 self.fields.iter().find(|f| f.name == name)
242 }
243
244 /// Returns the names of all indexed fields.
245 pub fn indexed_fields(&self) -> impl Iterator<Item = &str> {
246 self.fields
247 .iter()
248 .filter(|f| f.indexed)
249 .map(|f| f.name.as_str())
250 }
251}
252
#[cfg(test)]
mod tests {
    //! Round-trip tests for `CollectionMetaValue`, plus coverage of the
    //! decoder's error branches and the constructor's initial schema version.

    use super::*;

    #[test]
    fn should_encode_and_decode_collection_meta() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, true),
                MetadataFieldSpec::new("description", FieldType::String, false),
            ],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

        // then
        assert_eq!(decoded, value);
    }

    #[test]
    fn should_encode_and_decode_with_no_metadata_fields() {
        // given
        let value = CollectionMetaValue::new(384, DistanceMetric::L2, 1024, vec![]);

        // when
        let encoded = value.encode_to_bytes();
        let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

        // then
        assert_eq!(decoded, value);
        assert!(decoded.fields.is_empty());
    }

    #[test]
    fn should_find_field_by_name() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, false),
            ],
        );

        // when / then
        let category = value.get_field("category").unwrap();
        assert_eq!(category.field_type, FieldType::String);
        assert!(category.indexed);

        let price = value.get_field("price").unwrap();
        assert_eq!(price.field_type, FieldType::Float64);
        assert!(!price.indexed);

        assert!(value.get_field("unknown").is_none());
    }

    #[test]
    fn should_list_indexed_fields() {
        // given
        let value = CollectionMetaValue::new(
            1536,
            DistanceMetric::DotProduct,
            4096,
            vec![
                MetadataFieldSpec::new("category", FieldType::String, true),
                MetadataFieldSpec::new("price", FieldType::Float64, true),
                MetadataFieldSpec::new("description", FieldType::String, false),
            ],
        );

        // when
        let indexed: Vec<&str> = value.indexed_fields().collect();

        // then
        assert_eq!(indexed, vec!["category", "price"]);
    }

    #[test]
    fn should_preserve_all_distance_metrics() {
        for metric in [DistanceMetric::L2, DistanceMetric::DotProduct] {
            // given
            let value = CollectionMetaValue::new(128, metric, 2048, vec![]);

            // when
            let encoded = value.encode_to_bytes();
            let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

            // then
            assert_eq!(decoded.distance_metric, metric);
        }
    }

    #[test]
    fn should_preserve_all_field_types() {
        for field_type in [
            FieldType::String,
            FieldType::Int64,
            FieldType::Float64,
            FieldType::Bool,
        ] {
            // given
            let value = CollectionMetaValue::new(
                128,
                DistanceMetric::L2,
                2048,
                vec![MetadataFieldSpec::new("test", field_type, true)],
            );

            // when
            let encoded = value.encode_to_bytes();
            let decoded = CollectionMetaValue::decode_from_bytes(&encoded).unwrap();

            // then
            assert_eq!(decoded.fields[0].field_type, field_type);
        }
    }

    #[test]
    fn should_start_new_collections_at_schema_version_one() {
        // given / when
        let value = CollectionMetaValue::new(128, DistanceMetric::L2, 2048, vec![]);

        // then
        assert_eq!(value.schema_version, 1);
    }

    #[test]
    fn should_reject_buffer_shorter_than_header() {
        // given: a valid encoding cut to one byte less than the 9-byte header
        let encoded =
            CollectionMetaValue::new(128, DistanceMetric::L2, 2048, vec![]).encode_to_bytes();
        let truncated = &encoded[..8];

        // when / then
        assert!(CollectionMetaValue::decode_from_bytes(truncated).is_err());
        assert!(CollectionMetaValue::decode_from_bytes(&[]).is_err());
    }

    #[test]
    fn should_reject_unknown_distance_metric_byte() {
        // given: a valid encoding whose metric byte (offset 6) is patched to a
        // value that no variant uses
        let mut encoded = CollectionMetaValue::new(128, DistanceMetric::L2, 2048, vec![])
            .encode_to_bytes()
            .to_vec();
        encoded[6] = 1;

        // when
        let result = CollectionMetaValue::decode_from_bytes(&encoded);

        // then
        assert!(result.is_err());
        assert!(DistanceMetric::from_byte(1).is_err());
        assert!(DistanceMetric::from_byte(u8::MAX).is_err());
    }

    #[test]
    fn should_reject_truncated_field_spec() {
        // given
        let spec = MetadataFieldSpec::new("category", FieldType::String, true);
        let mut encoded = BytesMut::new();
        spec.encode(&mut encoded);

        // when: the trailing `indexed` byte is missing
        let mut truncated = &encoded[..encoded.len() - 1];
        let result = MetadataFieldSpec::decode(&mut truncated);

        // then
        assert!(result.is_err());
    }
}
378}