lance_index/
vector.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Vector Index
5//!
6
7use std::{collections::HashMap, sync::Arc};
8
9use arrow_array::{ArrayRef, RecordBatch, UInt32Array};
10use arrow_schema::Field;
11use async_trait::async_trait;
12use datafusion::execution::SendableRecordBatchStream;
13use ivf::storage::IvfModel;
14use lance_core::{Result, ROW_ID_FIELD};
15use lance_io::object_store::ObjectStore;
16use lance_io::traits::Reader;
17use lance_linalg::distance::DistanceType;
18use lazy_static::lazy_static;
19use object_store::path::Path;
20use quantizer::{QuantizationType, Quantizer};
21use v3::subindex::SubIndexType;
22
23pub mod bq;
24pub mod flat;
25pub mod graph;
26pub mod hnsw;
27pub mod ivf;
28pub mod kmeans;
29pub mod pq;
30pub mod quantizer;
31pub mod residual;
32pub mod sq;
33pub mod storage;
34pub mod transform;
35pub mod utils;
36pub mod v3;
37
38use super::pb;
39use crate::{prefilter::PreFilter, Index};
40pub use residual::RESIDUAL_COLUMN;
41
42// TODO: Make these crate private once the migration from lance to lance-index is done.
43pub const DIST_COL: &str = "_distance";
44pub const DISTANCE_TYPE_KEY: &str = "distance_type";
45pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
46pub const PART_ID_COLUMN: &str = "__ivf_part_id";
47pub const PQ_CODE_COLUMN: &str = "__pq_code";
48pub const SQ_CODE_COLUMN: &str = "__sq_code";
49
50lazy_static! {
51    pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef =
52        arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
53            Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
54            ROW_ID_FIELD.clone(),
55        ]));
56}
57
58/// Query parameters for the vector indices
59#[derive(Debug, Clone)]
60pub struct Query {
61    /// The column to be searched.
62    pub column: String,
63
64    /// The vector to be searched.
65    pub key: ArrayRef,
66
67    /// Top k results to return.
68    pub k: usize,
69
70    /// The lower bound (inclusive) of the distance to be searched.
71    pub lower_bound: Option<f32>,
72
73    /// The upper bound (exclusive) of the distance to be searched.
74    pub upper_bound: Option<f32>,
75
76    /// The number of probes to load and search.
77    pub nprobes: usize,
78
79    /// The number of candidates to reserve while searching.
80    /// this is an optional parameter for HNSW related index types.
81    pub ef: Option<usize>,
82
83    /// If presented, apply a refine step.
84    /// TODO: should we support fraction / float number here?
85    pub refine_factor: Option<u32>,
86
87    /// Distance metric type
88    pub metric_type: DistanceType,
89
90    /// Whether to use an ANN index if available
91    pub use_index: bool,
92}
93
94impl From<pb::VectorMetricType> for DistanceType {
95    fn from(proto: pb::VectorMetricType) -> Self {
96        match proto {
97            pb::VectorMetricType::L2 => Self::L2,
98            pb::VectorMetricType::Cosine => Self::Cosine,
99            pb::VectorMetricType::Dot => Self::Dot,
100            pb::VectorMetricType::Hamming => Self::Hamming,
101        }
102    }
103}
104
105impl From<DistanceType> for pb::VectorMetricType {
106    fn from(mt: DistanceType) -> Self {
107        match mt {
108            DistanceType::L2 => Self::L2,
109            DistanceType::Cosine => Self::Cosine,
110            DistanceType::Dot => Self::Dot,
111            DistanceType::Hamming => Self::Hamming,
112        }
113    }
114}
115
116/// Vector Index for (Approximate) Nearest Neighbor (ANN) Search.
117/// It's always the IVF index, any other index types without partitioning will be treated as IVF with one partition.
118#[async_trait]
119#[allow(clippy::redundant_pub_crate)]
120pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
121    /// Search the vector for nearest neighbors.
122    ///
123    /// It returns a [RecordBatch] with Schema of:
124    ///
125    /// ```
126    /// use arrow_schema::{Schema, Field, DataType};
127    ///
128    /// Schema::new(vec![
129    ///   Field::new("_rowid", DataType::UInt64, true),
130    ///   Field::new("_distance", DataType::Float32, false),
131    /// ]);
132    /// ```
133    ///
134    /// The `pre_filter` argument is used to filter out row ids that we know are
135    /// not relevant to the query. For example, it removes deleted rows.
136    ///
137    /// *WARNINGS*:
138    ///  - Only supports `f32` now. Will add f64/f16 later.
139    async fn search(&self, query: &Query, pre_filter: Arc<dyn PreFilter>) -> Result<RecordBatch>;
140
141    fn find_partitions(&self, query: &Query) -> Result<UInt32Array>;
142
143    async fn search_in_partition(
144        &self,
145        partition_id: usize,
146        query: &Query,
147        pre_filter: Arc<dyn PreFilter>,
148    ) -> Result<RecordBatch>;
149
150    /// If the index is loadable by IVF, so it can be a sub-index that
151    /// is loaded on demand by IVF.
152    fn is_loadable(&self) -> bool;
153
154    /// Use residual vector to search.
155    fn use_residual(&self) -> bool;
156
157    /// If the index can be remapped return Ok.  Else return an error
158    /// explaining why not
159    fn check_can_remap(&self) -> Result<()>;
160
161    // async fn append(&self, batches: Vec<RecordBatch>) -> Result<()>;
162    // async fn merge(&self, indices: Vec<Arc<dyn VectorIndex>>) -> Result<()>;
163
164    /// Load the index from the reader on-demand.
165    async fn load(
166        &self,
167        reader: Arc<dyn Reader>,
168        offset: usize,
169        length: usize,
170    ) -> Result<Box<dyn VectorIndex>>;
171
172    /// Load the partition from the reader on-demand.
173    async fn load_partition(
174        &self,
175        reader: Arc<dyn Reader>,
176        offset: usize,
177        length: usize,
178        _partition_id: usize,
179    ) -> Result<Box<dyn VectorIndex>> {
180        self.load(reader, offset, length).await
181    }
182
183    // for IVF only
184    async fn partition_reader(
185        &self,
186        _partition_id: usize,
187        _with_vector: bool,
188    ) -> Result<SendableRecordBatchStream> {
189        unimplemented!("only for IVF")
190    }
191
192    // for SubIndex only
193    async fn to_batch_stream(&self, with_vector: bool) -> Result<SendableRecordBatchStream>;
194
195    /// Return the IDs of rows in the index.
196    fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;
197
198    /// Remap the index according to mapping
199    ///
200    /// Each item in mapping describes an old row id -> new row id
201    /// pair.  If old row id -> None then that row id has been
202    /// deleted and can be removed from the index.
203    ///
204    /// If an old row id is not in the mapping then it should be
205    /// left alone.
206    async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
207
208    /// Remap the index according to mapping
209    ///
210    /// write the remapped index to the index_dir
211    /// this is available for only v3 index
212    async fn remap_to(
213        self: Arc<Self>,
214        _store: ObjectStore,
215        _mapping: &HashMap<u64, Option<u64>>,
216        _column: String,
217        _index_dir: Path,
218    ) -> Result<()> {
219        unimplemented!("only for v3 index")
220    }
221
222    /// The metric type of this vector index.
223    fn metric_type(&self) -> DistanceType;
224
225    fn ivf_model(&self) -> IvfModel;
226    fn quantizer(&self) -> Quantizer;
227
228    /// the index type of this vector index.
229    fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
230}