Skip to main content

lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lance-format/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use std::convert::TryFrom;
22
23pub mod frag_reuse;
24pub mod mem_wal;
25pub mod metrics;
26pub mod optimize;
27pub mod prefilter;
28pub mod progress;
29pub mod registry;
30pub mod scalar;
31pub mod traits;
32pub mod types;
33pub mod vector;
34
35pub use crate::traits::*;
36pub use crate::types::{IndexSegment, IndexSegmentPlan};
37
38pub const INDEX_FILE_NAME: &str = "index.idx";
39/// The name of the auxiliary index file.
40///
41/// This file is used to store additional information about the index, to improve performance.
42/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
43pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
44pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
45
46/// Default version for vector index metadata.
47///
48/// Most vector indices should use this version unless they need to bump for a
49/// format change.
50pub const VECTOR_INDEX_VERSION: u32 = 1;
51/// Version for IVF_RQ indices.
52pub const IVF_RQ_INDEX_VERSION: u32 = 2;
53
54/// The factor of threshold to trigger split / join for vector index.
55///
56/// If the number of rows in the single partition is greater than `MAX_PARTITION_SIZE_FACTOR * target_partition_size`,
57/// the partition will be split.
58/// If the number of rows in the single partition is less than `MIN_PARTITION_SIZE_PERCENT *target_partition_size / 100`,
59/// the partition will be joined.
60pub const MAX_PARTITION_SIZE_FACTOR: usize = 4;
61pub const MIN_PARTITION_SIZE_PERCENT: usize = 25;
62
63pub mod pb {
64    #![allow(clippy::use_self)]
65    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
66}
67
68pub mod pbold {
69    #![allow(clippy::use_self)]
70    include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
71}
72
73/// Generic methods common across all types of secondary indices
74///
75#[async_trait]
76pub trait Index: Send + Sync + DeepSizeOf {
77    /// Cast to [Any].
78    fn as_any(&self) -> &dyn Any;
79
80    /// Cast to [Index]
81    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
82
83    /// Cast to [vector::VectorIndex]
84    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
85
86    /// Retrieve index statistics as a JSON Value
87    fn statistics(&self) -> Result<serde_json::Value>;
88
89    /// Prewarm the index.
90    ///
91    /// This will load the index into memory and cache it.
92    async fn prewarm(&self) -> Result<()>;
93
94    /// Get the type of the index
95    fn index_type(&self) -> IndexType;
96
97    /// Read through the index and determine which fragment ids are covered by the index
98    ///
99    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
100    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
101    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
102}
103
104/// Index Type
105#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
106pub enum IndexType {
107    // Preserve 0-100 for simple indices.
108    Scalar = 0, // Legacy scalar index, alias to BTree
109
110    BTree = 1, // BTree
111
112    Bitmap = 2, // Bitmap
113
114    LabelList = 3, // LabelList
115
116    Inverted = 4, // Inverted
117
118    NGram = 5, // NGram
119
120    FragmentReuse = 6,
121
122    MemWal = 7,
123
124    ZoneMap = 8, // ZoneMap
125
126    BloomFilter = 9, // Bloom filter
127
128    RTree = 10, // RTree
129
130    // 100+ and up for vector index.
131    /// Flat vector index.
132    Vector = 100, // Legacy vector index, alias to IvfPq
133    IvfFlat = 101,
134    IvfSq = 102,
135    IvfPq = 103,
136    IvfHnswSq = 104,
137    IvfHnswPq = 105,
138    IvfHnswFlat = 106,
139    IvfRq = 107,
140}
141
142impl std::fmt::Display for IndexType {
143    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
144        match self {
145            Self::Scalar | Self::BTree => write!(f, "BTree"),
146            Self::Bitmap => write!(f, "Bitmap"),
147            Self::LabelList => write!(f, "LabelList"),
148            Self::Inverted => write!(f, "Inverted"),
149            Self::NGram => write!(f, "NGram"),
150            Self::FragmentReuse => write!(f, "FragmentReuse"),
151            Self::MemWal => write!(f, "MemWal"),
152            Self::ZoneMap => write!(f, "ZoneMap"),
153            Self::BloomFilter => write!(f, "BloomFilter"),
154            Self::RTree => write!(f, "RTree"),
155            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
156            Self::IvfFlat => write!(f, "IVF_FLAT"),
157            Self::IvfSq => write!(f, "IVF_SQ"),
158            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
159            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
160            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
161            Self::IvfRq => write!(f, "IVF_RQ"),
162        }
163    }
164}
165
166impl TryFrom<i32> for IndexType {
167    type Error = Error;
168
169    fn try_from(value: i32) -> Result<Self> {
170        match value {
171            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
172            v if v == Self::BTree as i32 => Ok(Self::BTree),
173            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
174            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
175            v if v == Self::NGram as i32 => Ok(Self::NGram),
176            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
177            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
178            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
179            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
180            v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter),
181            v if v == Self::Vector as i32 => Ok(Self::Vector),
182            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
183            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
184            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
185            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
186            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
187            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
188            v if v == Self::IvfRq as i32 => Ok(Self::IvfRq),
189            _ => Err(Error::invalid_input_source(
190                format!("the input value {} is not a valid IndexType", value).into(),
191            )),
192        }
193    }
194}
195
196impl TryFrom<&str> for IndexType {
197    type Error = Error;
198
199    fn try_from(value: &str) -> Result<Self> {
200        match value {
201            "BTree" | "BTREE" => Ok(Self::BTree),
202            "Bitmap" | "BITMAP" => Ok(Self::Bitmap),
203            "LabelList" | "LABELLIST" => Ok(Self::LabelList),
204            "Inverted" | "INVERTED" => Ok(Self::Inverted),
205            "NGram" | "NGRAM" => Ok(Self::NGram),
206            "ZoneMap" | "ZONEMAP" => Ok(Self::ZoneMap),
207            "Vector" | "VECTOR" => Ok(Self::Vector),
208            "IVF_FLAT" => Ok(Self::IvfFlat),
209            "IVF_SQ" => Ok(Self::IvfSq),
210            "IVF_PQ" => Ok(Self::IvfPq),
211            "IVF_RQ" => Ok(Self::IvfRq),
212            "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
213            "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
214            "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
215            "FragmentReuse" => Ok(Self::FragmentReuse),
216            "MemWal" => Ok(Self::MemWal),
217            _ => Err(Error::invalid_input(format!(
218                "invalid index type: {}",
219                value
220            ))),
221        }
222    }
223}
224
225impl IndexType {
226    pub fn is_scalar(&self) -> bool {
227        matches!(
228            self,
229            Self::Scalar
230                | Self::BTree
231                | Self::Bitmap
232                | Self::LabelList
233                | Self::Inverted
234                | Self::NGram
235                | Self::ZoneMap
236                | Self::BloomFilter
237                | Self::RTree,
238        )
239    }
240
241    pub fn is_vector(&self) -> bool {
242        matches!(
243            self,
244            Self::Vector
245                | Self::IvfPq
246                | Self::IvfHnswSq
247                | Self::IvfHnswPq
248                | Self::IvfHnswFlat
249                | Self::IvfFlat
250                | Self::IvfSq
251                | Self::IvfRq
252        )
253    }
254
255    pub fn is_system(&self) -> bool {
256        matches!(self, Self::FragmentReuse | Self::MemWal)
257    }
258
259    /// Returns the current format version of the index type,
260    /// bump this when the index format changes.
261    /// Indices which higher version than these will be ignored for compatibility,
262    /// This would happen when creating index in a newer version of Lance,
263    /// but then opening the index in older version of Lance
264    pub fn version(&self) -> i32 {
265        match self {
266            Self::Scalar => 0,
267            Self::BTree => 0,
268            Self::Bitmap => 0,
269            Self::LabelList => 0,
270            Self::Inverted => 0,
271            Self::NGram => 0,
272            Self::FragmentReuse => 0,
273            Self::MemWal => 0,
274            Self::ZoneMap => 0,
275            Self::BloomFilter => 0,
276            Self::RTree => 0,
277
278            // IMPORTANT: if any vector index subtype needs a format bump that is
279            // not backward compatible, its new version must be set to
280            // (current max vector index version + 1), even if only one subtype
281            // changed. Compatibility filtering currently cannot distinguish vector
282            // subtypes from details-only metadata, so vector versions effectively
283            // share one global monotonic compatibility level.
284            Self::Vector
285            | Self::IvfFlat
286            | Self::IvfSq
287            | Self::IvfPq
288            | Self::IvfHnswSq
289            | Self::IvfHnswPq
290            | Self::IvfHnswFlat => VECTOR_INDEX_VERSION as i32,
291            Self::IvfRq => IVF_RQ_INDEX_VERSION as i32,
292        }
293    }
294
295    /// Returns the target partition size for the index type.
296    ///
297    /// This is used to compute the number of partitions for the index.
298    /// The partition size is optimized for the best performance of the index.
299    ///
300    /// This is for vector indices only.
301    pub fn target_partition_size(&self) -> usize {
302        match self {
303            Self::Vector => 8192,
304            Self::IvfFlat => 4096,
305            Self::IvfSq => 8192,
306            Self::IvfPq => 8192,
307            Self::IvfHnswFlat => 1 << 20,
308            Self::IvfHnswSq => 1 << 20,
309            Self::IvfHnswPq => 1 << 20,
310            _ => 8192,
311        }
312    }
313
314    /// Returns the highest supported vector index version in this Lance build.
315    pub fn max_vector_version() -> u32 {
316        [
317            Self::Vector,
318            Self::IvfFlat,
319            Self::IvfSq,
320            Self::IvfPq,
321            Self::IvfHnswSq,
322            Self::IvfHnswPq,
323            Self::IvfHnswFlat,
324            Self::IvfRq,
325        ]
326        .into_iter()
327        .map(|index_type| index_type.version() as u32)
328        .max()
329        .unwrap_or(VECTOR_INDEX_VERSION)
330    }
331}
332
333pub trait IndexParams: Send + Sync {
334    fn as_any(&self) -> &dyn Any;
335
336    fn index_name(&self) -> &str;
337}
338
339#[derive(Serialize, Deserialize, Debug)]
340pub struct IndexMetadata {
341    #[serde(rename = "type")]
342    pub index_type: String,
343    pub distance_type: String,
344}
345
346pub fn is_system_index(index_meta: &lance_table::format::IndexMetadata) -> bool {
347    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
348}
349
350pub fn infer_system_index_type(
351    index_meta: &lance_table::format::IndexMetadata,
352) -> Option<IndexType> {
353    if index_meta.name == FRAG_REUSE_INDEX_NAME {
354        Some(IndexType::FragmentReuse)
355    } else if index_meta.name == MEM_WAL_INDEX_NAME {
356        Some(IndexType::MemWal)
357    } else {
358        None
359    }
360}
361
362#[cfg(test)]
363mod tests {
364    use super::*;
365
366    #[test]
367    fn test_ivf_rq_has_dedicated_index_version() {
368        assert!(IndexType::IvfRq.version() > IndexType::IvfPq.version());
369        assert_eq!(IndexType::IvfRq.version() as u32, IVF_RQ_INDEX_VERSION);
370    }
371
372    #[test]
373    fn test_max_vector_version_tracks_highest_supported() {
374        assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION);
375    }
376}