1use std::any::Any;
8use std::fmt::Debug;
9use std::{collections::HashMap, sync::Arc};
10
11use arrow_array::{ArrayRef, Float32Array, RecordBatch, UInt32Array};
12use arrow_schema::Field;
13use async_trait::async_trait;
14use datafusion::execution::SendableRecordBatchStream;
15use deepsize::DeepSizeOf;
16use ivf::storage::IvfModel;
17use lance_core::{Result, ROW_ID_FIELD};
18use lance_io::traits::Reader;
19use lance_linalg::distance::DistanceType;
20use quantizer::{QuantizationType, Quantizer};
21use std::sync::LazyLock;
22use v3::subindex::SubIndexType;
23
24pub mod bq;
25pub mod distributed;
26pub mod flat;
27pub mod graph;
28pub mod hnsw;
29pub mod ivf;
30pub mod kmeans;
31pub mod pq;
32pub mod quantizer;
33pub mod residual;
34pub mod shared;
35pub mod sq;
36pub mod storage;
37pub mod transform;
38pub mod utils;
39pub mod v3;
40
41use super::pb;
42use crate::metrics::MetricsCollector;
43use crate::{prefilter::PreFilter, Index};
44
/// Name of the distance column; used as the `Float32` field of
/// [`VECTOR_RESULT_SCHEMA`].
pub const DIST_COL: &str = "_distance";

/// Key string `"distance_type"`.
// NOTE(review): presumably a schema/file metadata key holding the metric name — confirm with the writers.
pub const DISTANCE_TYPE_KEY: &str = "distance_type";

/// Column name `"__index_uuid"`.
// NOTE(review): presumably identifies which index a row came from — confirm against callers.
pub const INDEX_UUID_COLUMN: &str = "__index_uuid";

/// Name of the IVF partition-id column; used by [`PART_ID_FIELD`].
pub const PART_ID_COLUMN: &str = "__ivf_part_id";

/// Column name `"__dist_q_c"`.
// NOTE(review): presumably the query-to-centroid distance (cf. `Query::dist_q_c`) — confirm.
pub const DIST_Q_C_COLUMN: &str = "__dist_q_c";

/// Name of the centroid-distance column; used by [`CENTROID_DIST_FIELD`].
pub const CENTROID_DIST_COLUMN: &str = "__centroid_dist";

/// Column name `"__pq_code"`.
// NOTE(review): presumably holds product-quantization codes (see the `pq` module) — confirm.
pub const PQ_CODE_COLUMN: &str = "__pq_code";

/// Column name `"__sq_code"`.
// NOTE(review): presumably holds scalar-quantization codes (see the `sq` module) — confirm.
pub const SQ_CODE_COLUMN: &str = "__sq_code";

/// Key string `"_loss"`.
// NOTE(review): presumably a metadata key for a training loss value — confirm with the writers.
pub const LOSS_METADATA_KEY: &str = "_loss";
56
57pub static VECTOR_RESULT_SCHEMA: LazyLock<arrow_schema::SchemaRef> = LazyLock::new(|| {
58 arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
59 Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
60 ROW_ID_FIELD.clone(),
61 ]))
62});
63
64pub static PART_ID_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
65 arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, true)
66});
67
68pub static CENTROID_DIST_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
69 arrow_schema::Field::new(CENTROID_DIST_COLUMN, arrow_schema::DataType::Float32, true)
70});
71
72#[derive(Debug, Clone)]
74pub struct Query {
75 pub column: String,
77
78 pub key: ArrayRef,
80
81 pub k: usize,
83
84 pub lower_bound: Option<f32>,
86
87 pub upper_bound: Option<f32>,
89
90 pub minimum_nprobes: usize,
96
97 pub maximum_nprobes: Option<usize>,
100
101 pub ef: Option<usize>,
104
105 pub refine_factor: Option<u32>,
108
109 pub metric_type: Option<DistanceType>,
112
113 pub use_index: bool,
115
116 pub dist_q_c: f32,
119}
120
121impl From<pb::VectorMetricType> for DistanceType {
122 fn from(proto: pb::VectorMetricType) -> Self {
123 match proto {
124 pb::VectorMetricType::L2 => Self::L2,
125 pb::VectorMetricType::Cosine => Self::Cosine,
126 pb::VectorMetricType::Dot => Self::Dot,
127 pb::VectorMetricType::Hamming => Self::Hamming,
128 }
129 }
130}
131
132impl From<DistanceType> for pb::VectorMetricType {
133 fn from(mt: DistanceType) -> Self {
134 match mt {
135 DistanceType::L2 => Self::L2,
136 DistanceType::Cosine => Self::Cosine,
137 DistanceType::Dot => Self::Dot,
138 DistanceType::Hamming => Self::Hamming,
139 }
140 }
141}
142
143#[async_trait]
151#[allow(clippy::redundant_pub_crate)]
152pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
153 async fn search(
170 &self,
171 query: &Query,
172 pre_filter: Arc<dyn PreFilter>,
173 metrics: &dyn MetricsCollector,
174 ) -> Result<RecordBatch>;
175
176 fn find_partitions(&self, query: &Query) -> Result<(UInt32Array, Float32Array)>;
185
186 fn total_partitions(&self) -> usize;
188
189 async fn search_in_partition(
194 &self,
195 partition_id: usize,
196 query: &Query,
197 pre_filter: Arc<dyn PreFilter>,
198 metrics: &dyn MetricsCollector,
199 ) -> Result<RecordBatch>;
200
201 fn is_loadable(&self) -> bool;
204
205 fn use_residual(&self) -> bool;
207
208 async fn load(
213 &self,
214 reader: Arc<dyn Reader>,
215 offset: usize,
216 length: usize,
217 ) -> Result<Box<dyn VectorIndex>>;
218
219 async fn load_partition(
221 &self,
222 reader: Arc<dyn Reader>,
223 offset: usize,
224 length: usize,
225 _partition_id: usize,
226 ) -> Result<Box<dyn VectorIndex>> {
227 self.load(reader, offset, length).await
228 }
229
230 async fn partition_reader(
232 &self,
233 _partition_id: usize,
234 _with_vector: bool,
235 _metrics: &dyn MetricsCollector,
236 ) -> Result<SendableRecordBatchStream> {
237 unimplemented!("only for IVF")
238 }
239
240 async fn to_batch_stream(&self, with_vector: bool) -> Result<SendableRecordBatchStream>;
242
243 fn num_rows(&self) -> u64;
244
245 fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;
247
248 async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
257
258 fn metric_type(&self) -> DistanceType;
260
261 fn ivf_model(&self) -> &IvfModel;
262 fn quantizer(&self) -> Quantizer;
263 fn partition_size(&self, part_id: usize) -> usize;
264
265 fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
267}
268
269pub trait VectorIndexCacheEntry: Debug + Send + Sync + DeepSizeOf {
271 fn as_any(&self) -> &dyn Any;
272}