lance_index/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Vector Index
5//!
6
7use std::any::Any;
8use std::fmt::Debug;
9use std::{collections::HashMap, sync::Arc};
10
11use arrow_array::{ArrayRef, Float32Array, RecordBatch, UInt32Array};
12use arrow_schema::Field;
13use async_trait::async_trait;
14use datafusion::execution::SendableRecordBatchStream;
15use deepsize::DeepSizeOf;
16use ivf::storage::IvfModel;
17use lance_core::{Result, ROW_ID_FIELD};
18use lance_io::traits::Reader;
19use lance_linalg::distance::DistanceType;
20use quantizer::{QuantizationType, Quantizer};
21use std::sync::LazyLock;
22use v3::subindex::SubIndexType;
23
24pub mod bq;
25pub mod flat;
26pub mod graph;
27pub mod hnsw;
28pub mod ivf;
29pub mod kmeans;
30pub mod pq;
31pub mod quantizer;
32pub mod residual;
33pub mod sq;
34pub mod storage;
35pub mod transform;
36pub mod utils;
37pub mod v3;
38
39use super::pb;
40use crate::metrics::MetricsCollector;
41use crate::{prefilter::PreFilter, Index};
42
43// TODO: Make these crate private once the migration from lance to lance-index is done.
44pub const DIST_COL: &str = "_distance";
45pub const DISTANCE_TYPE_KEY: &str = "distance_type";
46pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
47pub const PART_ID_COLUMN: &str = "__ivf_part_id";
48pub const DIST_Q_C_COLUMN: &str = "__dist_q_c";
49// dist from vector to centroid
50pub const CENTROID_DIST_COLUMN: &str = "__centroid_dist";
51pub const PQ_CODE_COLUMN: &str = "__pq_code";
52pub const SQ_CODE_COLUMN: &str = "__sq_code";
53pub const LOSS_METADATA_KEY: &str = "_loss";
54
55pub static VECTOR_RESULT_SCHEMA: LazyLock<arrow_schema::SchemaRef> = LazyLock::new(|| {
56    arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
57        Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
58        ROW_ID_FIELD.clone(),
59    ]))
60});
61
62pub static PART_ID_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
63    arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, true)
64});
65
66pub static CENTROID_DIST_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
67    arrow_schema::Field::new(CENTROID_DIST_COLUMN, arrow_schema::DataType::Float32, true)
68});
69
70/// Query parameters for the vector indices
71#[derive(Debug, Clone)]
72pub struct Query {
73    /// The column to be searched.
74    pub column: String,
75
76    /// The vector to be searched.
77    pub key: ArrayRef,
78
79    /// Top k results to return.
80    pub k: usize,
81
82    /// The lower bound (inclusive) of the distance to be searched.
83    pub lower_bound: Option<f32>,
84
85    /// The upper bound (exclusive) of the distance to be searched.
86    pub upper_bound: Option<f32>,
87
88    /// The minimum number of probes to load and search.  More partitions
89    /// will only be loaded if we have not found k results.
90    pub minimum_nprobes: usize,
91
92    /// The maximum number of probes to load and search.  If not set then
93    /// ALL partitions will be searched, if needed, to satisfy k results.
94    pub maximum_nprobes: Option<usize>,
95
96    /// The number of candidates to reserve while searching.
97    /// this is an optional parameter for HNSW related index types.
98    pub ef: Option<usize>,
99
100    /// If presented, apply a refine step.
101    /// TODO: should we support fraction / float number here?
102    pub refine_factor: Option<u32>,
103
104    /// Distance metric type
105    pub metric_type: DistanceType,
106
107    /// Whether to use an ANN index if available
108    pub use_index: bool,
109
110    /// the distance between the query and the centroid
111    /// this is only used for IVF index with Rabit quantization
112    pub dist_q_c: f32,
113}
114
115impl From<pb::VectorMetricType> for DistanceType {
116    fn from(proto: pb::VectorMetricType) -> Self {
117        match proto {
118            pb::VectorMetricType::L2 => Self::L2,
119            pb::VectorMetricType::Cosine => Self::Cosine,
120            pb::VectorMetricType::Dot => Self::Dot,
121            pb::VectorMetricType::Hamming => Self::Hamming,
122        }
123    }
124}
125
126impl From<DistanceType> for pb::VectorMetricType {
127    fn from(mt: DistanceType) -> Self {
128        match mt {
129            DistanceType::L2 => Self::L2,
130            DistanceType::Cosine => Self::Cosine,
131            DistanceType::Dot => Self::Dot,
132            DistanceType::Hamming => Self::Hamming,
133        }
134    }
135}
136
137/// Vector Index for (Approximate) Nearest Neighbor (ANN) Search.
138///
139/// Vector indices are often built as a chain of indices.  For example, IVF -> PQ
140/// or IVF -> HNSW -> SQ.
141///
142/// We use one trait for both the top-level and the sub-indices.  Typically the top-level
143/// search is a partition-aware search and all sub-indices are whole-index searches.
144#[async_trait]
145#[allow(clippy::redundant_pub_crate)]
146pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
147    /// Search entire index for k nearest neighbors.
148    ///
149    /// It returns a [RecordBatch] with Schema of:
150    ///
151    /// ```
152    /// use arrow_schema::{Schema, Field, DataType};
153    ///
154    /// Schema::new(vec![
155    ///   Field::new("_rowid", DataType::UInt64, true),
156    ///   Field::new("_distance", DataType::Float32, false),
157    /// ]);
158    /// ```
159    ///
160    /// The `pre_filter` argument is used to filter out row ids that we know are
161    /// not relevant to the query. For example, it removes deleted rows or rows that
162    /// do not match a user-provided filter.
163    async fn search(
164        &self,
165        query: &Query,
166        pre_filter: Arc<dyn PreFilter>,
167        metrics: &dyn MetricsCollector,
168    ) -> Result<RecordBatch>;
169
170    /// Find partitions that may contain nearest neighbors.
171    ///
172    /// If maximum_nprobes is set then this method will return the partitions
173    /// that are most likely to contain the nearest neighbors (e.g. the closest
174    /// partitions to the query vector).
175    ///
176    /// Return the partition ids and the distances between the query and the centroids,
177    /// the results should be in sorted order from closest to farthest.
178    fn find_partitions(&self, query: &Query) -> Result<(UInt32Array, Float32Array)>;
179
180    /// Get the total number of partitions in the index.
181    fn total_partitions(&self) -> usize;
182
183    /// Search a single partition for nearest neighbors.
184    ///
185    /// This method should return the same results as [`VectorIndex::search`] method except
186    /// that it will only search a single partition.
187    async fn search_in_partition(
188        &self,
189        partition_id: usize,
190        query: &Query,
191        pre_filter: Arc<dyn PreFilter>,
192        metrics: &dyn MetricsCollector,
193    ) -> Result<RecordBatch>;
194
195    /// If the index is loadable by IVF, so it can be a sub-index that
196    /// is loaded on demand by IVF.
197    fn is_loadable(&self) -> bool;
198
199    /// Use residual vector to search.
200    fn use_residual(&self) -> bool;
201
202    // async fn append(&self, batches: Vec<RecordBatch>) -> Result<()>;
203    // async fn merge(&self, indices: Vec<Arc<dyn VectorIndex>>) -> Result<()>;
204
205    /// Load the index from the reader on-demand.
206    async fn load(
207        &self,
208        reader: Arc<dyn Reader>,
209        offset: usize,
210        length: usize,
211    ) -> Result<Box<dyn VectorIndex>>;
212
213    /// Load the partition from the reader on-demand.
214    async fn load_partition(
215        &self,
216        reader: Arc<dyn Reader>,
217        offset: usize,
218        length: usize,
219        _partition_id: usize,
220    ) -> Result<Box<dyn VectorIndex>> {
221        self.load(reader, offset, length).await
222    }
223
224    // for IVF only
225    async fn partition_reader(
226        &self,
227        _partition_id: usize,
228        _with_vector: bool,
229        _metrics: &dyn MetricsCollector,
230    ) -> Result<SendableRecordBatchStream> {
231        unimplemented!("only for IVF")
232    }
233
234    // for SubIndex only
235    async fn to_batch_stream(&self, with_vector: bool) -> Result<SendableRecordBatchStream>;
236
237    fn num_rows(&self) -> u64;
238
239    /// Return the IDs of rows in the index.
240    fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;
241
242    /// Remap the index according to mapping
243    ///
244    /// Each item in mapping describes an old row id -> new row id
245    /// pair.  If old row id -> None then that row id has been
246    /// deleted and can be removed from the index.
247    ///
248    /// If an old row id is not in the mapping then it should be
249    /// left alone.
250    async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
251
252    /// The metric type of this vector index.
253    fn metric_type(&self) -> DistanceType;
254
255    fn ivf_model(&self) -> &IvfModel;
256    fn quantizer(&self) -> Quantizer;
257
258    /// the index type of this vector index.
259    fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
260}
261
262// it can be an IVF index or a partition of IVF index
263pub trait VectorIndexCacheEntry: Debug + Send + Sync + DeepSizeOf {
264    fn as_any(&self) -> &dyn Any;
265}