Skip to main content

vector/serde/
vector_data.rs

1//! VectorData value encoding/decoding.
2//!
3//! Stores all data for a single vector: external ID, embedding vector, and metadata.
4//!
5//! ## Storage Design
6//!
7//! Vectors are stored individually (one record per vector) rather than batched
8//! together. This enables:
9//!
10//! - **Efficient point lookups**: Load a single vector without reading neighbors
11//! - **Partial loading**: During filtered search, only load vectors that pass filters
12//! - **Independent updates**: Upsert/delete individual vectors without rewriting batches
13//!
14//! ## Unified Storage
15//!
16//! Vector data, metadata, and external ID are stored together in a single record.
17//! The vector is stored as a special field with tag 0xff (255) alongside metadata fields.
18//!
19//! ## Dimensionality
20//!
21//! The vector length is not stored in the value—it's obtained from `CollectionMeta`.
22//! All vectors in a collection must have the same dimensionality.
23
24use super::{Decode, Encode, EncodingError, FieldValue, decode_utf8, encode_utf8};
25use crate::FieldType;
26use crate::model::VECTOR_FIELD_NAME;
27use bytes::{Bytes, BytesMut};
28
29/// A metadata field with name and value.
30#[derive(Debug, Clone, PartialEq)]
31pub struct Field {
32    /// Field name.
33    pub field_name: String,
34    /// Field value.
35    pub value: FieldValue,
36}
37
38impl Field {
39    pub fn new(field_name: impl Into<String>, value: FieldValue) -> Self {
40        Self {
41            field_name: field_name.into(),
42            value,
43        }
44    }
45
46    pub fn string(field_name: impl Into<String>, value: impl Into<String>) -> Self {
47        Self::new(field_name, FieldValue::String(value.into()))
48    }
49
50    pub fn int64(field_name: impl Into<String>, value: i64) -> Self {
51        Self::new(field_name, FieldValue::Int64(value))
52    }
53
54    pub fn float64(field_name: impl Into<String>, value: f64) -> Self {
55        Self::new(field_name, FieldValue::Float64(value))
56    }
57
58    pub fn bool(field_name: impl Into<String>, value: bool) -> Self {
59        Self::new(field_name, FieldValue::Bool(value))
60    }
61
62    /// Create a vector field with the reserved name "vector".
63    pub fn vector(value: Vec<f32>) -> Self {
64        Self::new(VECTOR_FIELD_NAME, FieldValue::Vector(value))
65    }
66}
67
68impl Encode for Field {
69    fn encode(&self, buf: &mut BytesMut) {
70        encode_utf8(&self.field_name, buf);
71        self.value.encode(buf);
72    }
73}
74
75impl Decode for Field {
76    fn decode(buf: &mut &[u8]) -> Result<Self, EncodingError> {
77        let field_name = decode_utf8(buf)?;
78        let value = FieldValue::decode(buf)?;
79        if field_name == VECTOR_FIELD_NAME && value.field_type() != FieldType::Vector {
80            return Err(EncodingError {
81                message: "vector field must have type vector".to_string(),
82            });
83        }
84        Ok(Field { field_name, value })
85    }
86}
87
88impl Field {
89    /// Decode a MetadataField that may contain a Vector value.
90    ///
91    /// This is needed because Vector fields require knowing the dimensions.
92    fn decode_with_dimensions(buf: &mut &[u8], dimensions: usize) -> Result<Self, EncodingError> {
93        let field_name = decode_utf8(buf)?;
94        let value = FieldValue::decode_with_dimensions(buf, dimensions)?;
95        if field_name == VECTOR_FIELD_NAME && value.field_type() != FieldType::Vector {
96            return Err(EncodingError {
97                message: "vector field must have type vector".to_string(),
98            });
99        }
100        Ok(Field { field_name, value })
101    }
102}
103
104/// VectorData value storing the external ID, embedding vector, and metadata.
105///
106/// The key for this record is `VectorDataKey { vector_id }`, where `vector_id`
107/// is the internal u64 ID (not the user-provided external ID).
108///
109/// ## Value Layout (little-endian)
110///
111/// ```text
112/// ┌────────────────────────────────────────────────────────────────┐
113/// │  external_id: Utf8  (max 64 bytes, user-provided identifier)   │
114/// │  fields:      Array<Field>                                     │
115/// │                                                                │
116/// │  Field                                                         │
117/// │  ┌──────────────────────────────────────────────────────────┐  │
118/// │  │  field_name:  Utf8                                       │  │
119/// │  │  value:       FieldValue (tagged union)                  │  │
120/// │  └──────────────────────────────────────────────────────────┘  │
121/// │                                                                │
122/// │  The vector is stored as a special field with name "vector"    │
123/// │  and type FieldValue::Vector.                                  │
124/// └────────────────────────────────────────────────────────────────┘
125/// ```
126///
127/// ## Field Ordering
128///
129/// Fields are automatically sorted by `field_name` during construction to ensure
130/// consistent encoding.
131///
132/// ## Common Dimensionalities
133///
134/// | Model              | Dimensions | Vector Size |
135/// |--------------------|------------|-------------|
136/// | MiniLM-L6          | 384        | 1.5 KB      |
137/// | BERT base          | 768        | 3 KB        |
138/// | OpenAI ada-002     | 1536       | 6 KB        |
139/// | OpenAI text-3-large| 3072       | 12 KB       |
140#[derive(Debug, Clone, PartialEq)]
141pub struct VectorDataValue {
142    /// User-provided external identifier (max 64 bytes).
143    external_id: String,
144
145    /// All fields including metadata and the vector (stored as field "vector").
146    /// Sorted by field_name for consistent encoding.
147    fields: Vec<Field>,
148}
149
150impl VectorDataValue {
151    /// Create a new VectorDataValue with external ID and fields.
152    ///
153    /// The caller should include a field with name `VECTOR_FIELD_NAME` ("vector")
154    /// containing a `FieldValue::Vector`. Fields are sorted by name for consistent encoding.
155    pub fn new(external_id: impl Into<String>, fields: Vec<Field>) -> Self {
156        let mut fields = fields;
157        // Sort fields by name for consistent encoding
158        fields.sort_by(|a, b| a.field_name.cmp(&b.field_name));
159        Self {
160            external_id: external_id.into(),
161            fields,
162        }
163    }
164
165    /// Encode to bytes.
166    pub fn encode_to_bytes(&self) -> Bytes {
167        let mut buf = BytesMut::new();
168        encode_utf8(&self.external_id, &mut buf);
169        // Encode array manually: count + elements
170        let count = self.fields.len();
171        if count > u16::MAX as usize {
172            panic!("Too many fields: {}", count);
173        }
174        buf.extend_from_slice(&(count as u16).to_le_bytes());
175        for field in &self.fields {
176            field.encode(&mut buf);
177        }
178        buf.freeze()
179    }
180
181    /// Decode vector data from bytes.
182    ///
183    /// Requires dimensions to properly decode the vector field.
184    pub fn decode_from_bytes(buf: &[u8], dimensions: usize) -> Result<Self, EncodingError> {
185        let mut slice = buf;
186        let external_id = decode_utf8(&mut slice)?;
187
188        // Decode array count
189        if slice.len() < 2 {
190            return Err(EncodingError {
191                message: "Buffer too short for field count".to_string(),
192            });
193        }
194        let count = u16::from_le_bytes([slice[0], slice[1]]) as usize;
195        slice = &slice[2..];
196
197        // Decode fields
198        let mut fields = Vec::with_capacity(count);
199        for _ in 0..count {
200            fields.push(Field::decode_with_dimensions(&mut slice, dimensions)?);
201        }
202
203        Ok(VectorDataValue {
204            external_id,
205            fields,
206        })
207    }
208
209    /// Returns the external ID.
210    pub fn external_id(&self) -> &str {
211        &self.external_id
212    }
213
214    /// Returns an iterator over metadata fields (excludes the vector field).
215    pub fn fields(&self) -> impl Iterator<Item = &Field> {
216        self.fields.iter()
217    }
218
219    /// Get a metadata field by name.
220    pub fn field(&self, name: &str) -> Option<&FieldValue> {
221        self.fields
222            .iter()
223            .find(|f| f.field_name == name)
224            .map(|f| &f.value)
225    }
226
227    /// Get a string field value.
228    pub fn string_field(&self, name: &str) -> Option<&str> {
229        match self.field(name) {
230            Some(FieldValue::String(s)) => Some(s),
231            _ => None,
232        }
233    }
234
235    /// Get an i64 field value.
236    pub fn int64_field(&self, name: &str) -> Option<i64> {
237        match self.field(name) {
238            Some(FieldValue::Int64(v)) => Some(*v),
239            _ => None,
240        }
241    }
242
243    /// Get an f64 field value.
244    pub fn float64_field(&self, name: &str) -> Option<f64> {
245        match self.field(name) {
246            Some(FieldValue::Float64(v)) => Some(*v),
247            _ => None,
248        }
249    }
250
251    /// Get a bool field value.
252    pub fn bool_field(&self, name: &str) -> Option<bool> {
253        match self.field(name) {
254            Some(FieldValue::Bool(v)) => Some(*v),
255            _ => None,
256        }
257    }
258
259    /// Get vector field value
260    pub fn vector_field(&self) -> &[f32] {
261        let FieldValue::Vector(v) = self
262            .field(VECTOR_FIELD_NAME)
263            .expect("vector data must have vector field")
264        else {
265            panic!("vector field must have type vector")
266        };
267        v
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn should_encode_and_decode_vector_data_value() {
        // given
        let value = VectorDataValue::new(
            "my-vector-id",
            vec![
                Field::vector(vec![1.0, 2.0, 3.0, 4.0]),
                Field::string("category", "shoes"),
                Field::float64("price", 99.99),
                Field::bool("active", true),
            ],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 4).unwrap();

        // then
        assert_eq!(decoded.external_id(), "my-vector-id");
        assert_eq!(decoded.string_field("category"), Some("shoes"));
        assert_eq!(decoded.float64_field("price"), Some(99.99));
        assert_eq!(decoded.bool_field("active"), Some(true));
        // The embedding must survive the round trip alongside the metadata.
        assert_eq!(decoded.vector_field(), value.vector_field());
    }

    #[test]
    fn should_handle_vector_only() {
        // given
        let value = VectorDataValue::new("vector-only", vec![Field::vector(vec![1.0, 2.0, 3.0])]);

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 3).unwrap();

        // then
        assert_eq!(decoded, value);
    }

    #[test]
    fn should_sort_fields_by_name() {
        // given
        let value = VectorDataValue::new(
            "test",
            vec![
                Field::vector(vec![1.0]),
                Field::string("zebra", "last"),
                Field::string("apple", "first"),
                Field::string("mango", "middle"),
            ],
        );

        // when
        let all_fields: Vec<_> = value.fields().collect();

        // then - every field, the vector included, is sorted alphabetically
        assert_eq!(all_fields.len(), 4);
        assert_eq!(all_fields[0].field_name, "apple");
        assert_eq!(all_fields[1].field_name, "mango");
        assert_eq!(all_fields[2].field_name, "vector");
        assert_eq!(all_fields[3].field_name, "zebra");
    }

    #[test]
    fn should_handle_all_value_types() {
        // given
        let value = VectorDataValue::new(
            "test",
            vec![
                Field::vector(vec![1.0, 2.0]),
                Field::string("s", "hello"),
                Field::int64("i", -42),
                Field::float64("f", 1.23),
                Field::bool("b", false),
            ],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 2).unwrap();

        // then
        assert_eq!(decoded.string_field("s"), Some("hello"));
        assert_eq!(decoded.int64_field("i"), Some(-42));
        assert_eq!(decoded.float64_field("f"), Some(1.23));
        assert_eq!(decoded.bool_field("b"), Some(false));
        assert_eq!(decoded.vector_field(), value.vector_field());
    }

    #[test]
    fn should_handle_high_dimensional_vector() {
        // given
        let vector: Vec<f32> = (0..1536).map(|i| i as f32 * 0.001).collect();
        let value = VectorDataValue::new("high-dim", vec![Field::vector(vector)]);

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 1536).unwrap();

        // then
        assert_eq!(decoded, value);
    }

    #[test]
    fn should_handle_unicode_in_metadata() {
        // given
        let value = VectorDataValue::new(
            "unicode-test",
            vec![
                Field::vector(vec![1.0]),
                Field::string("greeting", "Hello, 世界!"),
            ],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 1).unwrap();

        // then
        assert_eq!(decoded.string_field("greeting"), Some("Hello, 世界!"));
    }

    #[test]
    fn should_return_none_for_missing_field() {
        // given
        let value = VectorDataValue::new(
            "test",
            vec![Field::vector(vec![1.0]), Field::string("exists", "yes")],
        );

        // when / then
        assert!(value.field("missing").is_none());
        assert!(value.string_field("missing").is_none());
    }

    #[test]
    fn should_return_none_for_wrong_type() {
        // given
        let value = VectorDataValue::new(
            "test",
            vec![Field::vector(vec![1.0]), Field::string("name", "value")],
        );

        // when / then
        assert!(value.int64_field("name").is_none());
        assert!(value.float64_field("name").is_none());
        assert!(value.bool_field("name").is_none());
    }

    #[test]
    fn should_handle_special_float_values_in_vector() {
        // given
        let value = VectorDataValue::new(
            "special-floats",
            vec![Field::vector(vec![
                0.0,
                -0.0,
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MIN,
                f32::MAX,
            ])],
        );

        // when
        let encoded = value.encode_to_bytes();
        let decoded = VectorDataValue::decode_from_bytes(&encoded, 6).unwrap();

        // then - the vector is the only stored field and round-trips unchanged
        assert_eq!(decoded.external_id(), "special-floats");
        assert_eq!(decoded.fields().count(), 1);
        assert_eq!(decoded.vector_field(), value.vector_field());
    }

    #[test]
    fn should_fail_when_field_count_is_truncated() {
        // given - a value without fields encodes as the external ID followed
        // by the two-byte field count; dropping the last byte truncates the count
        let value = VectorDataValue::new("truncated", vec![]);
        let encoded = value.encode_to_bytes();
        let truncated = &encoded[..encoded.len() - 1];

        // when
        let result = VectorDataValue::decode_from_bytes(truncated, 0);

        // then
        assert!(result.is_err());
    }
}