Skip to main content

quiver_core/
descriptor.rs

1// SPDX-License-Identifier: AGPL-3.0-only
2//! Collection descriptors: the schema fixed when a collection is created.
3
4use serde::{Deserialize, Serialize};
5
/// The element type of stored vectors. Phase 1 ships `f32`; lower-precision and
/// quantized dtypes arrive with the memory-frugality work in Phase 2.
///
/// This enum is serialized as part of [`Descriptor`]. postcard identifies an
/// enum variant by its index, so new variants must be appended after the
/// existing ones to keep already-written descriptors readable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Dtype {
    /// 32-bit IEEE-754 float.
    F32,
}
14
15impl Dtype {
16    /// Size in bytes of one vector element.
17    #[must_use]
18    pub const fn element_size(self) -> usize {
19        match self {
20            Dtype::F32 => 4,
21        }
22    }
23}
24
/// The distance / similarity function a collection is searched with.
///
/// Mind the direction: `Dot` and `Cosine` are similarities (higher is more
/// similar) while `L2` is a distance (lower is more similar).
///
/// Serialized as part of [`Descriptor`]; postcard identifies a variant by its
/// index, so the order of the variants below must not change.
// NOTE(review): unlike `IndexKind` and `FieldType`, this enum carries neither
// `#[non_exhaustive]` nor `#[serde(rename_all = "snake_case")]` — confirm that
// is intentional before adding a metric.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum DistanceMetric {
    /// Inner product — higher is more similar.
    Dot,
    /// Cosine similarity — higher is more similar.
    Cosine,
    /// Squared Euclidean distance — lower is more similar.
    L2,
}
35
/// The index structure a collection is served by (ADR-0007). The default is the
/// in-memory HNSW graph; the others are the Phase 2 memory-frugal options.
///
/// Serialized inside [`IndexSpec`]; postcard identifies a variant by its index,
/// so new kinds must be appended after `Colbert` rather than inserted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum IndexKind {
    /// In-memory HNSW graph: lowest latency, fits in RAM. The default.
    #[default]
    Hnsw,
    /// In-memory Vamana (DiskANN) graph.
    Vamana,
    /// Disk-resident Vamana: PQ codes in RAM, graph + full vectors on SSD.
    DiskVamana,
    /// Inverted-file index with coarse Voronoi partitioning.
    Ivf,
    /// ColBERTv2/PLAID compressed token-pool index for late-interaction
    /// (`multivector`) collections: centroid + residual-PQ codes in RAM with
    /// centroid-pruned candidate generation (ADR-0034). Valid only for a
    /// `multivector` collection.
    Colbert,
}
57
/// Which index a collection uses and how its vectors are compressed (ADR-0007,
/// ADR-0008). Defaults to in-memory HNSW with no quantization (exact search).
///
/// Embedded in [`Descriptor`] and therefore part of its postcard layout: the
/// field order below is the wire order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct IndexSpec {
    /// The index structure.
    pub kind: IndexKind,
    /// Product-quantization subspaces for quantized kinds (the disk graph,
    /// IVF+PQ). `None` selects a kind-appropriate default or no quantization.
    pub pq_subspaces: Option<u32>,
}
68
/// The type of a filterable payload field, which fixes how its values are keyed
/// in the secondary index (`.sec`) — and therefore which predicates it answers.
///
/// Serialized inside [`FilterableField`]; postcard identifies a variant by its
/// index, so new field types must be appended after `Numeric`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum FieldType {
    /// An exact-match string field (equality and lexical range).
    Keyword,
    /// A numeric field (equality and numeric range), keyed order-preserving.
    Numeric,
}
80
/// A payload field declared filterable at collection creation: its dot-path and
/// type. Declared fields are extracted into the per-segment secondary index at
/// flush time (ADR-0022), enabling pre-filtered (hybrid) search.
///
/// Build one with [`FilterableField::keyword`] or [`FilterableField::numeric`].
/// Stored in [`Descriptor::filterable`], so the field order below is part of
/// the descriptor's postcard layout.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FilterableField {
    /// Dot-path into the JSON payload (e.g. `"user.age"`).
    pub path: String,
    /// The field's value type.
    pub field_type: FieldType,
}
91
92impl FilterableField {
93    /// A keyword (exact-match string) field at `path`.
94    #[must_use]
95    pub fn keyword(path: impl Into<String>) -> Self {
96        Self {
97            path: path.into(),
98            field_type: FieldType::Keyword,
99        }
100    }
101
102    /// A numeric field at `path`.
103    #[must_use]
104    pub fn numeric(path: impl Into<String>) -> Self {
105        Self {
106            path: path.into(),
107            field_type: FieldType::Numeric,
108        }
109    }
110}
111
/// How a collection's vectors are encrypted (ADR-0031, ADR-0032). Encryption is
/// always **client-side** — the server never holds the key. Defaults to
/// [`VectorEncryption::None`]. The variants sit on Quiver's encrypted-search
/// spectrum, from fastest to most confidential:
///
/// - [`None`](VectorEncryption::None): plaintext vectors; the server ranks and
///   sees everything (the default).
/// - [`Dcpe`](VectorEncryption::Dcpe): experimental property-preserving
///   encryption; the server ranks ciphertexts but the approximate
///   distance-comparison relation leaks **by design**. `L2` only; not
///   semantically secure (ADR-0031).
/// - [`ClientSide`](VectorEncryption::ClientSide): semantically secure (IND-CPA)
///   opaque AEAD ciphertext; the server stores blobs it cannot read and does
///   **no** distance math, so the client fetches and ranks locally (ADR-0032).
///
/// The discriminants are chosen so a descriptor written when this flag was a
/// `bool encrypted_vectors` decodes unchanged: `false` (byte 0) is
/// [`None`](VectorEncryption::None) and `true` (byte 1) is
/// [`Dcpe`](VectorEncryption::Dcpe) — no data migration.
///
/// Consequently the variant order is load-bearing: `None` and `Dcpe` must stay
/// first and second, and any new variant must be appended after `ClientSide`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum VectorEncryption {
    /// Plaintext vectors; the server ranks (the default).
    #[default]
    None,
    /// Experimental DCPE ciphertext (ADR-0031): the server ranks, the approximate
    /// distance-comparison relation leaks by design, `L2` only, not semantically
    /// secure.
    Dcpe,
    /// Semantically secure opaque AEAD ciphertext (ADR-0032): the server stores
    /// blobs it cannot read and does no distance math; the client ranks locally.
    ClientSide,
}
145
/// The immutable schema of a collection, fixed at creation.
///
/// The field order below is the postcard wire layout. postcard is
/// non-self-describing, so a new field must be appended at the end, and each
/// addition needs a matching older-layout fallback in [`Descriptor::decode`]
/// (`#[serde(default)]` alone cannot default a missing trailing field).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Descriptor {
    /// Vector dimensionality.
    pub dim: u32,
    /// Vector element type.
    pub dtype: Dtype,
    /// Distance metric used for search.
    pub metric: DistanceMetric,
    /// Index & quantization configuration. Defaults to HNSW/exact and is absent
    /// in descriptors written before Phase 2 (filled by the default on read).
    #[serde(default)]
    pub index: IndexSpec,
    /// Payload fields indexed for filtering. Empty by default and absent in
    /// descriptors written before secondary indexes existed (defaulted on read).
    #[serde(default)]
    pub filterable: Vec<FilterableField>,
    /// Whether this is a multi-vector (late-interaction / ColBERT) collection:
    /// each document is stored as a group of token-vector rows and searched by
    /// MaxSim (ADR-0028). `false` for an ordinary single-vector collection, and
    /// absent in descriptors written before late interaction existed (defaulted to
    /// `false` on read).
    #[serde(default)]
    pub multivector: bool,
    /// How this collection's vectors are encrypted (ADR-0031, ADR-0032).
    /// [`VectorEncryption::None`] by default; [`Dcpe`](VectorEncryption::Dcpe)
    /// requires the `L2` metric. Absent in descriptors written before the flag
    /// existed (defaulted to `None` on read); a descriptor written while the flag
    /// was a `bool encrypted_vectors` decodes unchanged (`false`→`None`,
    /// `true`→`Dcpe`).
    #[serde(default)]
    pub vector_encryption: VectorEncryption,
}
179
180impl Descriptor {
181    /// A descriptor with the default index (in-memory HNSW, exact) and no
182    /// filterable fields.
183    #[must_use]
184    pub fn new(dim: u32, dtype: Dtype, metric: DistanceMetric) -> Self {
185        Self {
186            dim,
187            dtype,
188            metric,
189            index: IndexSpec::default(),
190            filterable: Vec::new(),
191            multivector: false,
192            vector_encryption: VectorEncryption::None,
193        }
194    }
195
196    /// Set the index specification (builder style).
197    #[must_use]
198    pub fn with_index(mut self, index: IndexSpec) -> Self {
199        self.index = index;
200        self
201    }
202
203    /// Set the filterable payload fields (builder style).
204    #[must_use]
205    pub fn with_filterable(mut self, filterable: Vec<FilterableField>) -> Self {
206        self.filterable = filterable;
207        self
208    }
209
210    /// Mark this collection as multi-vector (late-interaction / ColBERT), so each
211    /// document is stored as a group of token-vector rows scored by MaxSim
212    /// (builder style). The dimensionality is the per-token dimension.
213    #[must_use]
214    pub fn with_multivector(mut self, multivector: bool) -> Self {
215        self.multivector = multivector;
216        self
217    }
218
219    /// Set how this collection's vectors are encrypted (builder style). A
220    /// [`Dcpe`](VectorEncryption::Dcpe) collection must use the `L2` metric; a
221    /// [`ClientSide`](VectorEncryption::ClientSide) collection is searched by the
222    /// client, not the server (ADR-0031, ADR-0032).
223    #[must_use]
224    pub fn with_vector_encryption(mut self, vector_encryption: VectorEncryption) -> Self {
225        self.vector_encryption = vector_encryption;
226        self
227    }
228
229    /// Decode a descriptor from its postcard bytes, tolerating every earlier
230    /// layout.
231    ///
232    /// postcard is non-self-describing, so a missing *trailing* field cannot be
233    /// defaulted by `#[serde(default)]` alone (the reader hits end-of-input and
234    /// errors). We therefore try the layouts newest-to-oldest — current
235    /// (with `vector_encryption`) → the six-field `multivector` layout → the
236    /// five-field `filterable` layout → the four-field `index`-only layout → the
237    /// original three-field layout — defaulting the missing trailing fields. The
238    /// order matters: postcard ignores trailing bytes, so an older decoder would
239    /// silently mis-read a newer buffer if tried first.
240    ///
241    /// # Errors
242    /// Returns the postcard error if the bytes match no known layout.
243    pub fn decode(bytes: &[u8]) -> std::result::Result<Self, postcard::Error> {
244        postcard::from_bytes::<Self>(bytes)
245            .or_else(|_| postcard::from_bytes::<DescriptorV4>(bytes).map(Self::from))
246            .or_else(|_| postcard::from_bytes::<DescriptorV3>(bytes).map(Self::from))
247            .or_else(|_| postcard::from_bytes::<DescriptorV2>(bytes).map(Self::from))
248            .or_else(|_| postcard::from_bytes::<LegacyDescriptor>(bytes).map(Self::from))
249    }
250
251    /// Byte length of one stored vector (`dim × element_size`).
252    #[must_use]
253    pub fn stride(&self) -> usize {
254        self.dim as usize * self.dtype.element_size()
255    }
256}
257
// The six-field layout (through `multivector`, no `vector_encryption`), kept only
// to migrate descriptors written before client-side encryption existed, via
// [`Descriptor::decode`].
// It must be tried before the five-field layout, which would otherwise silently
// drop `multivector` (postcard ignores trailing bytes).
// The fields mirror the first six fields of `Descriptor`, in the same order;
// this struct is a frozen snapshot and must not be edited.
#[derive(Deserialize)]
struct DescriptorV4 {
    dim: u32,
    dtype: Dtype,
    metric: DistanceMetric,
    index: IndexSpec,
    filterable: Vec<FilterableField>,
    multivector: bool,
}
272
273impl From<DescriptorV4> for Descriptor {
274    fn from(v: DescriptorV4) -> Self {
275        Self {
276            dim: v.dim,
277            dtype: v.dtype,
278            metric: v.metric,
279            index: v.index,
280            filterable: v.filterable,
281            multivector: v.multivector,
282            vector_encryption: VectorEncryption::None,
283        }
284    }
285}
286
// The five-field layout (an `index` and `filterable` but no `multivector`), kept
// only to migrate descriptors written before late interaction existed, via
// [`Descriptor::decode`]. It must be tried before the four-field layout, which
// would otherwise silently drop `filterable` (postcard ignores trailing bytes).
// The fields mirror the first five fields of `Descriptor`, in the same order;
// this struct is a frozen snapshot and must not be edited.
#[derive(Deserialize)]
struct DescriptorV3 {
    dim: u32,
    dtype: Dtype,
    metric: DistanceMetric,
    index: IndexSpec,
    filterable: Vec<FilterableField>,
}
299
300impl From<DescriptorV3> for Descriptor {
301    fn from(v: DescriptorV3) -> Self {
302        Self {
303            dim: v.dim,
304            dtype: v.dtype,
305            metric: v.metric,
306            index: v.index,
307            filterable: v.filterable,
308            multivector: false,
309            vector_encryption: VectorEncryption::None,
310        }
311    }
312}
313
// The four-field layout (an `index` but no `filterable`), kept only to migrate
// descriptors written before secondary indexes existed, via [`Descriptor::decode`].
// It must be tried before the three-field layout, which would otherwise
// silently drop `index` (postcard ignores trailing bytes).
// The fields mirror the first four fields of `Descriptor`, in the same order;
// this struct is a frozen snapshot and must not be edited.
#[derive(Deserialize)]
struct DescriptorV2 {
    dim: u32,
    dtype: Dtype,
    metric: DistanceMetric,
    index: IndexSpec,
}
323
324impl From<DescriptorV2> for Descriptor {
325    fn from(v: DescriptorV2) -> Self {
326        Self {
327            dim: v.dim,
328            dtype: v.dtype,
329            metric: v.metric,
330            index: v.index,
331            filterable: Vec::new(),
332            multivector: false,
333            vector_encryption: VectorEncryption::None,
334        }
335    }
336}
337
// The original three-field layout (no `index`, no `filterable`), kept only to
// migrate the oldest databases on read via [`Descriptor::decode`].
// It is the last fallback tried. The fields mirror the first three fields of
// `Descriptor`, in the same order; this struct is a frozen snapshot and must
// not be edited.
#[derive(Deserialize)]
struct LegacyDescriptor {
    dim: u32,
    dtype: Dtype,
    metric: DistanceMetric,
}
346
347impl From<LegacyDescriptor> for Descriptor {
348    fn from(v: LegacyDescriptor) -> Self {
349        Self {
350            dim: v.dim,
351            dtype: v.dtype,
352            metric: v.metric,
353            index: IndexSpec::default(),
354            filterable: Vec::new(),
355            multivector: false,
356            vector_encryption: VectorEncryption::None,
357        }
358    }
359}
360
// Unit tests for descriptor construction, postcard round-trips, and decoding of
// every historical on-disk layout. Each "old layout" test re-declares the old
// struct locally with `Serialize` so it can produce bytes exactly as an older
// version would have written them.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn stride_matches_dim_and_dtype() {
        let d = Descriptor::new(128, Dtype::F32, DistanceMetric::L2);
        assert_eq!(d.stride(), 512);
        assert_eq!(Dtype::F32.element_size(), 4);
        // The default index is in-memory HNSW with no quantization.
        assert_eq!(d.index, IndexSpec::default());
        assert_eq!(d.index.kind, IndexKind::Hnsw);
    }

    #[test]
    fn descriptor_roundtrips_through_postcard() {
        let d = Descriptor::new(8, Dtype::F32, DistanceMetric::Cosine).with_index(IndexSpec {
            kind: IndexKind::DiskVamana,
            pq_subspaces: Some(16),
        });
        let bytes = postcard::to_allocvec(&d).unwrap();
        let back: Descriptor = postcard::from_bytes(&bytes).unwrap();
        assert_eq!(d, back);
    }

    // A descriptor serialized before the `index` field existed (only dim, dtype,
    // metric) must still deserialize, defaulting the index to HNSW.
    #[test]
    fn pre_phase2_descriptor_deserializes_with_default_index() {
        #[derive(serde::Serialize)]
        struct OldDescriptor {
            dim: u32,
            dtype: Dtype,
            metric: DistanceMetric,
        }
        let old = OldDescriptor {
            dim: 16,
            dtype: Dtype::F32,
            metric: DistanceMetric::L2,
        };
        let bytes = postcard::to_allocvec(&old).unwrap();
        // The raw new-layout decode fails on the shorter legacy bytes...
        assert!(postcard::from_bytes::<Descriptor>(&bytes).is_err());
        // ...but `decode` falls back to the legacy layout and defaults the index.
        let back = Descriptor::decode(&bytes).unwrap();
        assert_eq!(back.dim, 16);
        assert_eq!(back.metric, DistanceMetric::L2);
        assert_eq!(back.index, IndexSpec::default());
    }

    #[test]
    fn decode_reads_current_layout() {
        let d = Descriptor::new(8, Dtype::F32, DistanceMetric::Dot).with_index(IndexSpec {
            kind: IndexKind::Ivf,
            pq_subspaces: Some(8),
        });
        let bytes = postcard::to_allocvec(&d).unwrap();
        assert_eq!(Descriptor::decode(&bytes).unwrap(), d);
    }

    // A descriptor serialized before `filterable` existed (four fields, with an
    // `index`) must still deserialize — and the four-field fallback must run
    // before the three-field one, so the `index` is preserved, not defaulted.
    #[test]
    fn pre_filterable_descriptor_decodes_and_keeps_its_index() {
        #[derive(serde::Serialize)]
        struct DescriptorV2 {
            dim: u32,
            dtype: Dtype,
            metric: DistanceMetric,
            index: IndexSpec,
        }
        let old = DescriptorV2 {
            dim: 8,
            dtype: Dtype::F32,
            metric: DistanceMetric::L2,
            index: IndexSpec {
                kind: IndexKind::DiskVamana,
                pq_subspaces: Some(16),
            },
        };
        let bytes = postcard::to_allocvec(&old).unwrap();
        // The current seven-field decode fails on the shorter buffer...
        assert!(postcard::from_bytes::<Descriptor>(&bytes).is_err());
        // ...but `decode` falls back to the four-field layout, keeping the index
        // (not the three-field legacy layout, which would lose it).
        let back = Descriptor::decode(&bytes).unwrap();
        assert_eq!(back.dim, 8);
        assert_eq!(back.index.kind, IndexKind::DiskVamana);
        assert_eq!(back.index.pq_subspaces, Some(16));
        assert!(back.filterable.is_empty());
    }

    #[test]
    fn descriptor_with_filterable_roundtrips() {
        let d = Descriptor::new(4, Dtype::F32, DistanceMetric::L2).with_filterable(vec![
            FilterableField::keyword("city"),
            FilterableField::numeric("age"),
        ]);
        let bytes = postcard::to_allocvec(&d).unwrap();
        assert_eq!(Descriptor::decode(&bytes).unwrap(), d);
    }

    #[test]
    fn descriptor_with_multivector_roundtrips() {
        let d = Descriptor::new(128, Dtype::F32, DistanceMetric::Cosine).with_multivector(true);
        let bytes = postcard::to_allocvec(&d).unwrap();
        let back = Descriptor::decode(&bytes).unwrap();
        assert_eq!(back, d);
        assert!(back.multivector);
    }

    // A descriptor serialized before `multivector` existed (five fields, with a
    // `filterable`) must still decode — via the five-field fallback, which keeps
    // `filterable` and defaults `multivector` to false. The four-field fallback
    // would wrongly drop `filterable`, so the five-field one must be tried first.
    #[test]
    fn pre_multivector_descriptor_decodes_and_keeps_filterable() {
        #[derive(serde::Serialize)]
        struct DescriptorV3 {
            dim: u32,
            dtype: Dtype,
            metric: DistanceMetric,
            index: IndexSpec,
            filterable: Vec<FilterableField>,
        }
        let old = DescriptorV3 {
            dim: 8,
            dtype: Dtype::F32,
            metric: DistanceMetric::Cosine,
            index: IndexSpec {
                kind: IndexKind::Ivf,
                pq_subspaces: Some(8),
            },
            filterable: vec![FilterableField::keyword("city")],
        };
        let bytes = postcard::to_allocvec(&old).unwrap();
        // The current seven-field decode fails on the shorter buffer...
        assert!(postcard::from_bytes::<Descriptor>(&bytes).is_err());
        // ...but `decode` falls back to the five-field layout: filterable kept,
        // multivector defaulted to false, index preserved.
        let back = Descriptor::decode(&bytes).unwrap();
        assert_eq!(back.filterable, vec![FilterableField::keyword("city")]);
        assert!(!back.multivector);
        assert_eq!(back.index.kind, IndexKind::Ivf);
    }

    #[test]
    fn descriptor_with_vector_encryption_roundtrips() {
        let d = Descriptor::new(64, Dtype::F32, DistanceMetric::L2)
            .with_vector_encryption(VectorEncryption::ClientSide);
        let bytes = postcard::to_allocvec(&d).unwrap();
        let back = Descriptor::decode(&bytes).unwrap();
        assert_eq!(back, d);
        assert_eq!(back.vector_encryption, VectorEncryption::ClientSide);
    }

    // The vector-encryption flag used to be a `bool encrypted_vectors`. The enum's
    // discriminants are chosen so those descriptors decode unchanged — a trailing
    // `true` byte is `Dcpe`, `false` is `None` — so existing DCPE collections need
    // no data migration.
    #[test]
    fn legacy_encrypted_vectors_bool_decodes_as_the_enum() {
        #[derive(serde::Serialize)]
        struct OldDescriptor {
            dim: u32,
            dtype: Dtype,
            metric: DistanceMetric,
            index: IndexSpec,
            filterable: Vec<FilterableField>,
            multivector: bool,
            encrypted_vectors: bool,
        }
        let make = |enc: bool| OldDescriptor {
            dim: 8,
            dtype: Dtype::F32,
            metric: DistanceMetric::L2,
            index: IndexSpec::default(),
            filterable: Vec::new(),
            multivector: false,
            encrypted_vectors: enc,
        };
        let dcpe = postcard::to_allocvec(&make(true)).unwrap();
        assert_eq!(
            Descriptor::decode(&dcpe).unwrap().vector_encryption,
            VectorEncryption::Dcpe
        );
        let none = postcard::to_allocvec(&make(false)).unwrap();
        assert_eq!(
            Descriptor::decode(&none).unwrap().vector_encryption,
            VectorEncryption::None
        );
    }

    // A descriptor serialized before `vector_encryption` existed (six fields,
    // through `multivector`) must still decode — via the six-field fallback, which
    // keeps `multivector` and defaults `vector_encryption` to None. The five-field
    // fallback would wrongly drop `multivector`, so the six-field one is tried first.
    #[test]
    fn pre_vector_encryption_descriptor_decodes_and_keeps_multivector() {
        #[derive(serde::Serialize)]
        struct DescriptorV4 {
            dim: u32,
            dtype: Dtype,
            metric: DistanceMetric,
            index: IndexSpec,
            filterable: Vec<FilterableField>,
            multivector: bool,
        }
        let old = DescriptorV4 {
            dim: 8,
            dtype: Dtype::F32,
            metric: DistanceMetric::Cosine,
            index: IndexSpec::default(),
            filterable: vec![FilterableField::numeric("score")],
            multivector: true,
        };
        let bytes = postcard::to_allocvec(&old).unwrap();
        // The current seven-field decode fails on the shorter buffer...
        assert!(postcard::from_bytes::<Descriptor>(&bytes).is_err());
        // ...but `decode` falls back to the six-field layout: multivector and
        // filterable kept, vector_encryption defaulted to None.
        let back = Descriptor::decode(&bytes).unwrap();
        assert!(back.multivector);
        assert_eq!(back.filterable, vec![FilterableField::numeric("score")]);
        assert_eq!(back.vector_encryption, VectorEncryption::None);
    }
}