Skip to main content

uni_common/
vector_index_opts.rs

// SPDX-License-Identifier: Apache-2.0
// Copyright 2024-2026 Dragonscale Team

//! Single source of truth for parsing vector-index options into a
//! [`VectorIndexType`] + [`DistanceMetric`].
//!
//! ALL index-creation entry points use these helpers so dense vectors, native
//! multi-vectors, and MUVERA behave **identically** regardless of path:
//! - the Cypher DDL `CREATE VECTOR INDEX ... OPTIONS {type:'...', ...}` (`planner.rs`),
//! - the `uni.schema.createIndex(...)` procedure (`executor::ddl_procedures`), and
//! - the Python binding config map (`bindings/uni-db/src/core.rs`).
//!
//! (The typed Rust builder `VectorAlgo` in the `uni` crate maps directly to the same
//! `VectorIndexType`.) Lives in `uni-common` — the only crate every surface depends on.
//! Keeping the mapping here prevents the paths from drifting (they previously had
//! different default ANN types: `ivf_pq` vs `hnsw`).
17
18use anyhow::Result;
19
20use crate::core::schema::{DistanceMetric, VectorIndexType};
21use crate::muvera::DEFAULT_FDE_SEED;
22
/// Vector-index options exactly as the caller supplied them, already converted to their
/// final Rust types. A field left as `None` means "not specified"; the builder functions
/// in this module substitute the canonical default for it.
#[derive(Clone, Debug, Default)]
pub struct VectorIndexOpts<'a> {
    /// Name of the ANN/index subtype (`flat`, `ivf_pq`, `hnsw_sq`, `muvera`, …). The DDL
    /// path reads it from `OPTIONS.type`; the procedure path reads the `algorithm` field.
    pub type_name: Option<&'a str>,
    /// Partition count. IVF variants fall back to 256 when unset; HNSW variants receive
    /// the optional value unchanged.
    pub partitions: Option<u32>,
    /// Forwarded as the `m` field of the HNSW variants (16 when unset).
    pub m: Option<u32>,
    /// Forwarded as the `ef_construction` field of the HNSW variants (200 when unset).
    pub ef_construction: Option<u32>,
    /// Sub-vector count for the PQ variants (16 when unset).
    pub sub_vectors: Option<u32>,
    /// Bit width: passed through as-is for `ivf_rq`, and used as the bits per sub-vector
    /// for `ivf_pq` (8 when unset).
    pub num_bits: Option<u8>,

    // MUVERA-only knobs.
    /// MUVERA `k_sim` (4 when unset).
    pub k_sim: Option<u32>,
    /// MUVERA `reps` (20 when unset).
    pub reps: Option<u32>,
    /// MUVERA `d_proj` (16 when unset).
    pub d_proj: Option<u32>,
    /// MUVERA seed (`DEFAULT_FDE_SEED` when unset).
    pub seed: Option<u64>,
    /// The single-vector ANN type built over the MUVERA FDE column.
    pub inner: Option<&'a str>,
}
43
44/// Map a single-vector ANN type name to a [`VectorIndexType`], defaulting to `IvfPq`.
45/// Shared by the outer index type and the MUVERA `inner` type.
46fn ann_type(o: &VectorIndexOpts, t: Option<&str>) -> VectorIndexType {
47    match t {
48        Some("flat") => VectorIndexType::Flat,
49        Some("ivf_flat") => VectorIndexType::IvfFlat {
50            num_partitions: o.partitions.unwrap_or(256),
51        },
52        Some("ivf_sq") => VectorIndexType::IvfSq {
53            num_partitions: o.partitions.unwrap_or(256),
54        },
55        Some("ivf_rq") => VectorIndexType::IvfRq {
56            num_partitions: o.partitions.unwrap_or(256),
57            num_bits: o.num_bits,
58        },
59        Some("hnsw_flat") => VectorIndexType::HnswFlat {
60            m: o.m.unwrap_or(16),
61            ef_construction: o.ef_construction.unwrap_or(200),
62            num_partitions: o.partitions,
63        },
64        Some("hnsw") | Some("hnsw_sq") => VectorIndexType::HnswSq {
65            m: o.m.unwrap_or(16),
66            ef_construction: o.ef_construction.unwrap_or(200),
67            num_partitions: o.partitions,
68        },
69        Some("hnsw_pq") => VectorIndexType::HnswPq {
70            m: o.m.unwrap_or(16),
71            ef_construction: o.ef_construction.unwrap_or(200),
72            num_sub_vectors: o.sub_vectors.unwrap_or(16),
73            num_partitions: o.partitions,
74        },
75        // None / unknown → IVF_PQ (the canonical default for BOTH paths).
76        _ => VectorIndexType::IvfPq {
77            num_partitions: o.partitions.unwrap_or(256),
78            num_sub_vectors: o.sub_vectors.unwrap_or(16),
79            bits_per_subvector: o.num_bits.unwrap_or(8),
80        },
81    }
82}
83
84/// Build a [`VectorIndexType`] from raw options. `type:'muvera'` produces a MUVERA index
85/// whose `inner` ANN (over the derived FDE column) is itself parsed via the private
86/// `ann_type` helper.
87///
88/// NOTE: the MUVERA defaults below (`k_sim=4, reps=20, d_proj=16`) are reasonable starting
89/// points, NOT values validated for recall on a specific corpus. FDE recall is
90/// corpus-dependent; tune these per corpus and confirm recall@k with the bench harness
91/// `crates/uni-store/examples/multivec_recall_real.rs` (real ColBERT corpus) before relying
92/// on the first-stage retrieval quality.
93pub fn build_vector_index_type(o: &VectorIndexOpts) -> VectorIndexType {
94    match o.type_name {
95        Some("muvera") => VectorIndexType::Muvera {
96            k_sim: o.k_sim.unwrap_or(4),
97            reps: o.reps.unwrap_or(20),
98            d_proj: o.d_proj.unwrap_or(16),
99            seed: o.seed.unwrap_or(DEFAULT_FDE_SEED),
100            inner: Box::new(ann_type(o, o.inner)),
101        },
102        other => ann_type(o, other),
103    }
104}
105
106/// Parse a vector distance-metric name; errors on an unknown value. `None` → `Cosine`
107/// (the ColBERT/vector default). Shared by both paths so the error text matches.
108pub fn parse_vector_metric(s: Option<&str>) -> Result<DistanceMetric> {
109    match s.map(|m| m.to_ascii_lowercase()).as_deref() {
110        Some("l2") | Some("euclidean") => Ok(DistanceMetric::L2),
111        Some("dot") => Ok(DistanceMetric::Dot),
112        Some("cosine") | None => Ok(DistanceMetric::Cosine),
113        Some(other) => Err(anyhow::anyhow!(
114            "Unknown vector index metric '{other}' (expected cosine, l2, or dot)"
115        )),
116    }
117}
118
#[cfg(test)]
mod tests {
    use super::*;

    /// Options with only `type_name` populated; every other knob stays `None`.
    fn opts(type_name: Option<&str>) -> VectorIndexOpts<'_> {
        VectorIndexOpts {
            type_name,
            ..Default::default()
        }
    }

    #[test]
    fn default_is_ivf_pq_for_both_paths() {
        // A missing type and an unrecognised type must land on the same canonical default.
        for name in [None, Some("nonsense")] {
            let built = build_vector_index_type(&opts(name));
            assert!(matches!(built, VectorIndexType::IvfPq { .. }));
        }
    }

    #[test]
    fn named_types_map() {
        let flat = build_vector_index_type(&opts(Some("flat")));
        assert!(matches!(flat, VectorIndexType::Flat));

        // Bare `hnsw` is an alias for the scalar-quantised HNSW variant.
        let hnsw = build_vector_index_type(&opts(Some("hnsw")));
        assert!(matches!(hnsw, VectorIndexType::HnswSq { .. }));
    }

    #[test]
    fn muvera_defaults_and_inner() {
        let with_flat_inner = VectorIndexOpts {
            type_name: Some("muvera"),
            inner: Some("flat"),
            ..Default::default()
        };
        match build_vector_index_type(&with_flat_inner) {
            VectorIndexType::Muvera {
                k_sim,
                reps,
                d_proj,
                seed,
                inner,
            } => {
                assert_eq!(k_sim, 4);
                assert_eq!(reps, 20);
                assert_eq!(d_proj, 16);
                assert_eq!(seed, DEFAULT_FDE_SEED);
                assert!(matches!(*inner, VectorIndexType::Flat));
            }
            other => panic!("expected Muvera, got {other:?}"),
        }

        // Default inner is IVF_PQ.
        match build_vector_index_type(&opts(Some("muvera"))) {
            VectorIndexType::Muvera { inner, .. } => {
                assert!(matches!(*inner, VectorIndexType::IvfPq { .. }));
            }
            other => panic!("expected Muvera, got {other:?}"),
        }
    }

    #[test]
    fn metric_parsing() {
        let accepted = [
            (None, DistanceMetric::Cosine),
            (Some("L2"), DistanceMetric::L2),
            (Some("dot"), DistanceMetric::Dot),
        ];
        for (input, expected) in accepted {
            assert_eq!(parse_vector_metric(input).unwrap(), expected);
        }

        assert!(parse_vector_metric(Some("hamming")).is_err());
    }
}