lance_index/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Vector Index
5//!
6
7use std::any::Any;
8use std::fmt::Debug;
9use std::{collections::HashMap, sync::Arc};
10
11use arrow_array::{ArrayRef, Float32Array, RecordBatch, UInt32Array};
12use arrow_schema::Field;
13use async_trait::async_trait;
14use datafusion::execution::SendableRecordBatchStream;
15use deepsize::DeepSizeOf;
16use ivf::storage::IvfModel;
17use lance_core::{Result, ROW_ID_FIELD};
18use lance_io::traits::Reader;
19use lance_linalg::distance::DistanceType;
20use quantizer::{QuantizationType, Quantizer};
21use std::sync::LazyLock;
22use v3::subindex::SubIndexType;
23
24pub mod bq;
25pub mod distributed;
26pub mod flat;
27pub mod graph;
28pub mod hnsw;
29pub mod ivf;
30pub mod kmeans;
31pub mod pq;
32pub mod quantizer;
33pub mod residual;
34pub mod shared;
35pub mod sq;
36pub mod storage;
37pub mod transform;
38pub mod utils;
39pub mod v3;
40
41use super::pb;
42use crate::metrics::MetricsCollector;
43use crate::{prefilter::PreFilter, Index};
44
45// TODO: Make these crate private once the migration from lance to lance-index is done.
46pub const DIST_COL: &str = "_distance";
47pub const DISTANCE_TYPE_KEY: &str = "distance_type";
48pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
49pub const PART_ID_COLUMN: &str = "__ivf_part_id";
50pub const DIST_Q_C_COLUMN: &str = "__dist_q_c";
51// dist from vector to centroid
52pub const CENTROID_DIST_COLUMN: &str = "__centroid_dist";
53pub const PQ_CODE_COLUMN: &str = "__pq_code";
54pub const SQ_CODE_COLUMN: &str = "__sq_code";
55pub const LOSS_METADATA_KEY: &str = "_loss";
56
57pub static VECTOR_RESULT_SCHEMA: LazyLock<arrow_schema::SchemaRef> = LazyLock::new(|| {
58    arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
59        Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
60        ROW_ID_FIELD.clone(),
61    ]))
62});
63
64pub static PART_ID_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
65    arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, true)
66});
67
68pub static CENTROID_DIST_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
69    arrow_schema::Field::new(CENTROID_DIST_COLUMN, arrow_schema::DataType::Float32, true)
70});
71
72/// Query parameters for the vector indices
73#[derive(Debug, Clone)]
74pub struct Query {
75    /// The column to be searched.
76    pub column: String,
77
78    /// The vector to be searched.
79    pub key: ArrayRef,
80
81    /// Top k results to return.
82    pub k: usize,
83
84    /// The lower bound (inclusive) of the distance to be searched.
85    pub lower_bound: Option<f32>,
86
87    /// The upper bound (exclusive) of the distance to be searched.
88    pub upper_bound: Option<f32>,
89
90    /// The minimum number of probes to load and search.  More partitions
91    /// will only be loaded if we have not found k results, or the algorithm
92    /// determines more partitions are needed to satisfy recall requirements.
93    ///
94    /// The planner will always search at least this many partitions. Defaults to 1.
95    pub minimum_nprobes: usize,
96
97    /// The maximum number of probes to load and search.  If not set then
98    /// ALL partitions will be searched, if needed, to satisfy k results.
99    pub maximum_nprobes: Option<usize>,
100
101    /// The number of candidates to reserve while searching.
102    /// this is an optional parameter for HNSW related index types.
103    pub ef: Option<usize>,
104
105    /// If presented, apply a refine step.
106    /// TODO: should we support fraction / float number here?
107    pub refine_factor: Option<u32>,
108
109    /// Distance metric type. If None, uses the index's metric (if available)
110    /// or the default for the data type.
111    pub metric_type: Option<DistanceType>,
112
113    /// Whether to use an ANN index if available
114    pub use_index: bool,
115
116    /// the distance between the query and the centroid
117    /// this is only used for IVF index with Rabit quantization
118    pub dist_q_c: f32,
119}
120
121impl From<pb::VectorMetricType> for DistanceType {
122    fn from(proto: pb::VectorMetricType) -> Self {
123        match proto {
124            pb::VectorMetricType::L2 => Self::L2,
125            pb::VectorMetricType::Cosine => Self::Cosine,
126            pb::VectorMetricType::Dot => Self::Dot,
127            pb::VectorMetricType::Hamming => Self::Hamming,
128        }
129    }
130}
131
132impl From<DistanceType> for pb::VectorMetricType {
133    fn from(mt: DistanceType) -> Self {
134        match mt {
135            DistanceType::L2 => Self::L2,
136            DistanceType::Cosine => Self::Cosine,
137            DistanceType::Dot => Self::Dot,
138            DistanceType::Hamming => Self::Hamming,
139        }
140    }
141}
142
143/// Vector Index for (Approximate) Nearest Neighbor (ANN) Search.
144///
145/// Vector indices are often built as a chain of indices.  For example, IVF -> PQ
146/// or IVF -> HNSW -> SQ.
147///
148/// We use one trait for both the top-level and the sub-indices.  Typically the top-level
149/// search is a partition-aware search and all sub-indices are whole-index searches.
150#[async_trait]
151#[allow(clippy::redundant_pub_crate)]
152pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
153    /// Search entire index for k nearest neighbors.
154    ///
155    /// It returns a [RecordBatch] with Schema of:
156    ///
157    /// ```
158    /// use arrow_schema::{Schema, Field, DataType};
159    ///
160    /// Schema::new(vec![
161    ///   Field::new("_rowid", DataType::UInt64, true),
162    ///   Field::new("_distance", DataType::Float32, false),
163    /// ]);
164    /// ```
165    ///
166    /// The `pre_filter` argument is used to filter out row ids that we know are
167    /// not relevant to the query. For example, it removes deleted rows or rows that
168    /// do not match a user-provided filter.
169    async fn search(
170        &self,
171        query: &Query,
172        pre_filter: Arc<dyn PreFilter>,
173        metrics: &dyn MetricsCollector,
174    ) -> Result<RecordBatch>;
175
176    /// Find partitions that may contain nearest neighbors.
177    ///
178    /// If maximum_nprobes is set then this method will return the partitions
179    /// that are most likely to contain the nearest neighbors (e.g. the closest
180    /// partitions to the query vector).
181    ///
182    /// Return the partition ids and the distances between the query and the centroids,
183    /// the results should be in sorted order from closest to farthest.
184    fn find_partitions(&self, query: &Query) -> Result<(UInt32Array, Float32Array)>;
185
186    /// Get the total number of partitions in the index.
187    fn total_partitions(&self) -> usize;
188
189    /// Search a single partition for nearest neighbors.
190    ///
191    /// This method should return the same results as [`VectorIndex::search`] method except
192    /// that it will only search a single partition.
193    async fn search_in_partition(
194        &self,
195        partition_id: usize,
196        query: &Query,
197        pre_filter: Arc<dyn PreFilter>,
198        metrics: &dyn MetricsCollector,
199    ) -> Result<RecordBatch>;
200
201    /// If the index is loadable by IVF, so it can be a sub-index that
202    /// is loaded on demand by IVF.
203    fn is_loadable(&self) -> bool;
204
205    /// Use residual vector to search.
206    fn use_residual(&self) -> bool;
207
208    // async fn append(&self, batches: Vec<RecordBatch>) -> Result<()>;
209    // async fn merge(&self, indices: Vec<Arc<dyn VectorIndex>>) -> Result<()>;
210
211    /// Load the index from the reader on-demand.
212    async fn load(
213        &self,
214        reader: Arc<dyn Reader>,
215        offset: usize,
216        length: usize,
217    ) -> Result<Box<dyn VectorIndex>>;
218
219    /// Load the partition from the reader on-demand.
220    async fn load_partition(
221        &self,
222        reader: Arc<dyn Reader>,
223        offset: usize,
224        length: usize,
225        _partition_id: usize,
226    ) -> Result<Box<dyn VectorIndex>> {
227        self.load(reader, offset, length).await
228    }
229
230    // for IVF only
231    async fn partition_reader(
232        &self,
233        _partition_id: usize,
234        _with_vector: bool,
235        _metrics: &dyn MetricsCollector,
236    ) -> Result<SendableRecordBatchStream> {
237        unimplemented!("only for IVF")
238    }
239
240    // for SubIndex only
241    async fn to_batch_stream(&self, with_vector: bool) -> Result<SendableRecordBatchStream>;
242
243    fn num_rows(&self) -> u64;
244
245    /// Return the IDs of rows in the index.
246    fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;
247
248    /// Remap the index according to mapping
249    ///
250    /// Each item in mapping describes an old row id -> new row id
251    /// pair.  If old row id -> None then that row id has been
252    /// deleted and can be removed from the index.
253    ///
254    /// If an old row id is not in the mapping then it should be
255    /// left alone.
256    async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
257
258    /// The metric type of this vector index.
259    fn metric_type(&self) -> DistanceType;
260
261    fn ivf_model(&self) -> &IvfModel;
262    fn quantizer(&self) -> Quantizer;
263    fn partition_size(&self, part_id: usize) -> usize;
264
265    /// the index type of this vector index.
266    fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
267}
268
269// it can be an IVF index or a partition of IVF index
270pub trait VectorIndexCacheEntry: Debug + Send + Sync + DeepSizeOf {
271    fn as_any(&self) -> &dyn Any;
272}
lance_index/vector.rs

lance_index/
vector.rs