1use std::any::Any;
8use std::fmt::Debug;
9use std::{collections::HashMap, sync::Arc};
10
11use arrow_array::{ArrayRef, Float32Array, RecordBatch, UInt32Array};
12use arrow_schema::Field;
13use async_trait::async_trait;
14use datafusion::execution::SendableRecordBatchStream;
15use deepsize::DeepSizeOf;
16use ivf::storage::IvfModel;
17use lance_core::{Result, ROW_ID_FIELD};
18use lance_io::traits::Reader;
19use lance_linalg::distance::DistanceType;
20use quantizer::{QuantizationType, Quantizer};
21use std::sync::LazyLock;
22use v3::subindex::SubIndexType;
23
// Public submodules of this vector-index module. Three of them supply types
// imported at the top of this file: `ivf` (`ivf::storage::IvfModel`),
// `quantizer` (`Quantizer`, `QuantizationType`) and `v3`
// (`v3::subindex::SubIndexType`).
pub mod bq;
pub mod flat;
pub mod graph;
pub mod hnsw;
pub mod ivf;
pub mod kmeans;
pub mod pq;
pub mod quantizer;
pub mod residual;
pub mod sq;
pub mod storage;
pub mod transform;
pub mod utils;
pub mod v3;
38
39use super::pb;
40use crate::metrics::MetricsCollector;
41use crate::{prefilter::PreFilter, Index};
42
/// Name of the distance column; it is the first field of
/// [`VECTOR_RESULT_SCHEMA`].
pub const DIST_COL: &str = "_distance";
/// The string `"distance_type"`.
///
/// NOTE(review): presumably a metadata key under which an index stores its
/// distance type; no use is visible in this file, so confirm against callers.
pub const DISTANCE_TYPE_KEY: &str = "distance_type";
/// Column name `"__index_uuid"`.
///
/// NOTE(review): presumably carries the UUID of the index a row came from;
/// confirm against callers.
pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
/// Name of the partition-id column; used to build [`PART_ID_FIELD`].
pub const PART_ID_COLUMN: &str = "__ivf_part_id";
/// Column name `"__dist_q_c"`.
///
/// NOTE(review): the name mirrors `Query::dist_q_c`; presumably a
/// query-to-centroid distance, confirm against callers.
pub const DIST_Q_C_COLUMN: &str = "__dist_q_c";
/// Name of the centroid-distance column; used to build
/// [`CENTROID_DIST_FIELD`].
pub const CENTROID_DIST_COLUMN: &str = "__centroid_dist";
/// Column name `"__pq_code"`.
///
/// NOTE(review): presumably holds product-quantization codes; confirm against
/// the `pq` module.
pub const PQ_CODE_COLUMN: &str = "__pq_code";
/// Column name `"__sq_code"`.
///
/// NOTE(review): presumably holds scalar-quantization codes; confirm against
/// the `sq` module.
pub const SQ_CODE_COLUMN: &str = "__sq_code";
/// The string `"_loss"`.
///
/// NOTE(review): presumably a metadata key for a recorded training loss;
/// confirm against callers.
pub const LOSS_METADATA_KEY: &str = "_loss";
54
55pub static VECTOR_RESULT_SCHEMA: LazyLock<arrow_schema::SchemaRef> = LazyLock::new(|| {
56 arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
57 Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
58 ROW_ID_FIELD.clone(),
59 ]))
60});
61
62pub static PART_ID_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
63 arrow_schema::Field::new(PART_ID_COLUMN, arrow_schema::DataType::UInt32, true)
64});
65
66pub static CENTROID_DIST_FIELD: LazyLock<arrow_schema::Field> = LazyLock::new(|| {
67 arrow_schema::Field::new(CENTROID_DIST_COLUMN, arrow_schema::DataType::Float32, true)
68});
69
/// Parameters of a single vector search request, passed by reference to the
/// [`VectorIndex`] search methods.
///
/// NOTE(review): the field descriptions below are inferred from names and
/// types only; this file does not show how implementations interpret them.
#[derive(Debug, Clone)]
pub struct Query {
    /// Name of the column the query targets (presumably the vector column —
    /// confirm).
    pub column: String,

    /// The query value as an Arrow array (presumably the query vector —
    /// confirm expected element type and length with implementations).
    pub key: ArrayRef,

    /// Result count parameter (presumably the number of nearest neighbors to
    /// return — confirm).
    pub k: usize,

    /// Optional lower distance bound. Whether it is inclusive is not visible
    /// here — TODO confirm.
    pub lower_bound: Option<f32>,

    /// Optional upper distance bound. Whether it is inclusive is not visible
    /// here — TODO confirm.
    pub upper_bound: Option<f32>,

    /// Minimum number of probes (presumably partitions to search — confirm).
    pub minimum_nprobes: usize,

    /// Optional maximum number of probes; the meaning of `None` is not
    /// visible here (presumably "no limit" — confirm).
    pub maximum_nprobes: Option<usize>,

    /// Optional `ef` search parameter (presumably the candidate-list size for
    /// HNSW-style sub-indices — confirm).
    pub ef: Option<usize>,

    /// Optional refine factor (presumably enables a re-ranking step when set
    /// — confirm).
    pub refine_factor: Option<u32>,

    /// Distance type requested for this query.
    pub metric_type: DistanceType,

    /// Flag named `use_index` (presumably selects index search over a flat
    /// scan — confirm).
    pub use_index: bool,

    /// A single distance value; the name suggests "distance between query and
    /// centroid" (see also `DIST_Q_C_COLUMN`) — TODO confirm who sets it.
    pub dist_q_c: f32,
}
117
118impl From<pb::VectorMetricType> for DistanceType {
119 fn from(proto: pb::VectorMetricType) -> Self {
120 match proto {
121 pb::VectorMetricType::L2 => Self::L2,
122 pb::VectorMetricType::Cosine => Self::Cosine,
123 pb::VectorMetricType::Dot => Self::Dot,
124 pb::VectorMetricType::Hamming => Self::Hamming,
125 }
126 }
127}
128
129impl From<DistanceType> for pb::VectorMetricType {
130 fn from(mt: DistanceType) -> Self {
131 match mt {
132 DistanceType::L2 => Self::L2,
133 DistanceType::Cosine => Self::Cosine,
134 DistanceType::Dot => Self::Dot,
135 DistanceType::Hamming => Self::Hamming,
136 }
137 }
138}
139
/// Common interface of vector indices.
///
/// Implementors must also implement [`Index`] and be `Send + Sync + Debug`.
/// The trait goes through `#[async_trait]`, so its `async fn` methods can be
/// called through `dyn VectorIndex` (as `load` does by returning
/// `Box<dyn VectorIndex>`).
#[async_trait]
// NOTE(review): the reason for suppressing this lint is not visible in this
// file; confirm it is still needed.
#[allow(clippy::redundant_pub_crate)]
pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
    /// Runs `query` against the index and returns the matches as one
    /// [`RecordBatch`].
    ///
    /// * `query` - search parameters.
    /// * `pre_filter` - shared pre-filter handle (presumably restricts which
    ///   rows may be returned — confirm with `PreFilter`).
    /// * `metrics` - collector passed to the search; what is recorded is up
    ///   to the implementation.
    ///
    /// NOTE(review): the result presumably follows [`VECTOR_RESULT_SCHEMA`];
    /// this trait does not enforce it.
    async fn search(
        &self,
        query: &Query,
        pre_filter: Arc<dyn PreFilter>,
        metrics: &dyn MetricsCollector,
    ) -> Result<RecordBatch>;

    /// Returns a pair of arrays for `query`: a `UInt32Array` and a
    /// `Float32Array`.
    ///
    /// NOTE(review): presumably the candidate partition ids and the matching
    /// query-to-centroid distances, aligned by position — confirm with
    /// implementations.
    fn find_partitions(&self, query: &Query) -> Result<(UInt32Array, Float32Array)>;

    /// Returns the total number of partitions in this index.
    fn total_partitions(&self) -> usize;

    /// Like [`VectorIndex::search`], but takes an explicit `partition_id`
    /// (presumably limiting the search to that partition — confirm).
    async fn search_in_partition(
        &self,
        partition_id: usize,
        query: &Query,
        pre_filter: Arc<dyn PreFilter>,
        metrics: &dyn MetricsCollector,
    ) -> Result<RecordBatch>;

    /// Reports whether this index is "loadable".
    ///
    /// NOTE(review): presumably means [`VectorIndex::load`] is supported —
    /// confirm with implementations.
    fn is_loadable(&self) -> bool;

    /// Reports whether this index uses residuals (presumably residual
    /// vectors relative to partition centroids — confirm).
    fn use_residual(&self) -> bool;

    /// Loads an index from `reader` and returns it as a boxed trait object.
    ///
    /// * `reader` - source to read from.
    /// * `offset`, `length` - location of the data within the reader
    ///   (presumably a byte offset and byte length — TODO confirm units).
    async fn load(
        &self,
        reader: Arc<dyn Reader>,
        offset: usize,
        length: usize,
    ) -> Result<Box<dyn VectorIndex>>;

    /// Loads the index for one partition.
    ///
    /// The default implementation ignores `_partition_id` and simply
    /// forwards `reader`, `offset` and `length` to [`VectorIndex::load`].
    async fn load_partition(
        &self,
        reader: Arc<dyn Reader>,
        offset: usize,
        length: usize,
        _partition_id: usize,
    ) -> Result<Box<dyn VectorIndex>> {
        self.load(reader, offset, length).await
    }

    /// Returns a record-batch stream for one partition.
    ///
    /// # Panics
    ///
    /// The default implementation always panics with
    /// `unimplemented!("only for IVF")`; it never returns an `Err`.
    /// Implementations that support this must override it.
    async fn partition_reader(
        &self,
        _partition_id: usize,
        _with_vector: bool,
        _metrics: &dyn MetricsCollector,
    ) -> Result<SendableRecordBatchStream> {
        unimplemented!("only for IVF")
    }

    /// Exposes the index contents as a record-batch stream.
    ///
    /// `with_vector` presumably controls whether vectors are included in the
    /// output — confirm with implementations.
    async fn to_batch_stream(&self, with_vector: bool) -> Result<SendableRecordBatchStream>;

    /// Returns the number of rows in the index.
    fn num_rows(&self) -> u64;

    /// Returns a boxed iterator over `u64` row ids borrowed from `self`.
    fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;

    /// Remaps row ids in place according to `mapping`.
    ///
    /// NOTE(review): presumably keys are old row ids, `Some(new)` is the
    /// replacement id and `None` marks a removed row — confirm with
    /// implementations.
    async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;

    /// Returns the distance type of this index.
    fn metric_type(&self) -> DistanceType;

    /// Returns a reference to the index's [`IvfModel`].
    fn ivf_model(&self) -> &IvfModel;
    /// Returns the index's [`Quantizer`] by value.
    fn quantizer(&self) -> Quantizer;
    /// Returns the size of partition `part_id` (presumably its row count —
    /// TODO confirm unit).
    fn partition_size(&self, part_id: usize) -> usize;

    /// Returns the sub-index type together with the quantization type.
    fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
}
265
/// Trait for cache-entry values.
///
/// Implementors must be `Debug + Send + Sync` and implement `DeepSizeOf`.
/// NOTE(review): the `DeepSizeOf` bound presumably lets a cache account for
/// entry size; confirm with the cache implementation.
pub trait VectorIndexCacheEntry: Debug + Send + Sync + DeepSizeOf {
    /// Exposes the entry as `&dyn Any`, so callers can attempt a downcast to
    /// the concrete entry type.
    fn as_any(&self) -> &dyn Any;
}