Skip to main content

lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lance-format/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use std::convert::TryFrom;
22
23pub mod frag_reuse;
24pub mod mem_wal;
25pub mod metrics;
26pub mod optimize;
27pub mod prefilter;
28pub mod progress;
29pub mod registry;
30pub mod scalar;
31pub mod traits;
32pub mod vector;
33
34pub use crate::traits::*;
35
36pub const INDEX_FILE_NAME: &str = "index.idx";
37/// The name of the auxiliary index file.
38///
39/// This file is used to store additional information about the index, to improve performance.
40/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
41pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
42pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
43
44/// Default version for vector index metadata.
45///
46/// Most vector indices should use this version unless they need to bump for a
47/// format change.
48pub const VECTOR_INDEX_VERSION: u32 = 1;
49/// Version for IVF_RQ indices.
50pub const IVF_RQ_INDEX_VERSION: u32 = 2;
51
52/// The factor of threshold to trigger split / join for vector index.
53///
54/// If the number of rows in the single partition is greater than `MAX_PARTITION_SIZE_FACTOR * target_partition_size`,
55/// the partition will be split.
56/// If the number of rows in the single partition is less than `MIN_PARTITION_SIZE_PERCENT *target_partition_size / 100`,
57/// the partition will be joined.
58pub const MAX_PARTITION_SIZE_FACTOR: usize = 4;
59pub const MIN_PARTITION_SIZE_PERCENT: usize = 25;
60
61pub mod pb {
62    #![allow(clippy::use_self)]
63    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
64}
65
66pub mod pbold {
67    #![allow(clippy::use_self)]
68    include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
69}
70
71/// Generic methods common across all types of secondary indices
72///
73#[async_trait]
74pub trait Index: Send + Sync + DeepSizeOf {
75    /// Cast to [Any].
76    fn as_any(&self) -> &dyn Any;
77
78    /// Cast to [Index]
79    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
80
81    /// Cast to [vector::VectorIndex]
82    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
83
84    /// Retrieve index statistics as a JSON Value
85    fn statistics(&self) -> Result<serde_json::Value>;
86
87    /// Prewarm the index.
88    ///
89    /// This will load the index into memory and cache it.
90    async fn prewarm(&self) -> Result<()>;
91
92    /// Get the type of the index
93    fn index_type(&self) -> IndexType;
94
95    /// Read through the index and determine which fragment ids are covered by the index
96    ///
97    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
98    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
99    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
100}
101
102/// Index Type
103#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
104pub enum IndexType {
105    // Preserve 0-100 for simple indices.
106    Scalar = 0, // Legacy scalar index, alias to BTree
107
108    BTree = 1, // BTree
109
110    Bitmap = 2, // Bitmap
111
112    LabelList = 3, // LabelList
113
114    Inverted = 4, // Inverted
115
116    NGram = 5, // NGram
117
118    FragmentReuse = 6,
119
120    MemWal = 7,
121
122    ZoneMap = 8, // ZoneMap
123
124    BloomFilter = 9, // Bloom filter
125
126    RTree = 10, // RTree
127
128    // 100+ and up for vector index.
129    /// Flat vector index.
130    Vector = 100, // Legacy vector index, alias to IvfPq
131    IvfFlat = 101,
132    IvfSq = 102,
133    IvfPq = 103,
134    IvfHnswSq = 104,
135    IvfHnswPq = 105,
136    IvfHnswFlat = 106,
137    IvfRq = 107,
138}
139
140impl std::fmt::Display for IndexType {
141    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
142        match self {
143            Self::Scalar | Self::BTree => write!(f, "BTree"),
144            Self::Bitmap => write!(f, "Bitmap"),
145            Self::LabelList => write!(f, "LabelList"),
146            Self::Inverted => write!(f, "Inverted"),
147            Self::NGram => write!(f, "NGram"),
148            Self::FragmentReuse => write!(f, "FragmentReuse"),
149            Self::MemWal => write!(f, "MemWal"),
150            Self::ZoneMap => write!(f, "ZoneMap"),
151            Self::BloomFilter => write!(f, "BloomFilter"),
152            Self::RTree => write!(f, "RTree"),
153            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
154            Self::IvfFlat => write!(f, "IVF_FLAT"),
155            Self::IvfSq => write!(f, "IVF_SQ"),
156            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
157            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
158            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
159            Self::IvfRq => write!(f, "IVF_RQ"),
160        }
161    }
162}
163
164impl TryFrom<i32> for IndexType {
165    type Error = Error;
166
167    fn try_from(value: i32) -> Result<Self> {
168        match value {
169            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
170            v if v == Self::BTree as i32 => Ok(Self::BTree),
171            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
172            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
173            v if v == Self::NGram as i32 => Ok(Self::NGram),
174            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
175            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
176            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
177            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
178            v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter),
179            v if v == Self::RTree as i32 => Ok(Self::RTree),
180            v if v == Self::Vector as i32 => Ok(Self::Vector),
181            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
182            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
183            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
184            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
185            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
186            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
187            v if v == Self::IvfRq as i32 => Ok(Self::IvfRq),
188            _ => Err(Error::invalid_input_source(
189                format!("the input value {} is not a valid IndexType", value).into(),
190            )),
191        }
192    }
193}
194
195impl TryFrom<&str> for IndexType {
196    type Error = Error;
197
198    fn try_from(value: &str) -> Result<Self> {
199        match value {
200            "BTree" | "BTREE" => Ok(Self::BTree),
201            "Bitmap" | "BITMAP" => Ok(Self::Bitmap),
202            "LabelList" | "LABELLIST" => Ok(Self::LabelList),
203            "Inverted" | "INVERTED" => Ok(Self::Inverted),
204            "NGram" | "NGRAM" => Ok(Self::NGram),
205            "ZoneMap" | "ZONEMAP" => Ok(Self::ZoneMap),
206            "BloomFilter" | "BLOOMFILTER" | "BLOOM_FILTER" => Ok(Self::BloomFilter),
207            "RTree" | "RTREE" | "R_TREE" => Ok(Self::RTree),
208            "Vector" | "VECTOR" => Ok(Self::Vector),
209            "IVF_FLAT" => Ok(Self::IvfFlat),
210            "IVF_SQ" => Ok(Self::IvfSq),
211            "IVF_PQ" => Ok(Self::IvfPq),
212            "IVF_RQ" => Ok(Self::IvfRq),
213            "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
214            "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
215            "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
216            "FragmentReuse" => Ok(Self::FragmentReuse),
217            "MemWal" => Ok(Self::MemWal),
218            _ => Err(Error::invalid_input(format!(
219                "invalid index type: {}",
220                value
221            ))),
222        }
223    }
224}
225
226impl IndexType {
227    pub fn is_scalar(&self) -> bool {
228        matches!(
229            self,
230            Self::Scalar
231                | Self::BTree
232                | Self::Bitmap
233                | Self::LabelList
234                | Self::Inverted
235                | Self::NGram
236                | Self::ZoneMap
237                | Self::BloomFilter
238                | Self::RTree,
239        )
240    }
241
242    pub fn is_vector(&self) -> bool {
243        matches!(
244            self,
245            Self::Vector
246                | Self::IvfPq
247                | Self::IvfHnswSq
248                | Self::IvfHnswPq
249                | Self::IvfHnswFlat
250                | Self::IvfFlat
251                | Self::IvfSq
252                | Self::IvfRq
253        )
254    }
255
256    pub fn is_system(&self) -> bool {
257        matches!(self, Self::FragmentReuse | Self::MemWal)
258    }
259
260    /// Returns the current format version of the index type,
261    /// bump this when the index format changes.
262    /// Indices which higher version than these will be ignored for compatibility,
263    /// This would happen when creating index in a newer version of Lance,
264    /// but then opening the index in older version of Lance
265    pub fn version(&self) -> i32 {
266        match self {
267            Self::Scalar => 0,
268            Self::BTree => 0,
269            Self::Bitmap => 0,
270            Self::LabelList => 0,
271            Self::Inverted => 0,
272            Self::NGram => 0,
273            Self::FragmentReuse => 0,
274            Self::MemWal => 0,
275            Self::ZoneMap => 0,
276            Self::BloomFilter => 0,
277            Self::RTree => 0,
278
279            // IMPORTANT: if any vector index subtype needs a format bump that is
280            // not backward compatible, its new version must be set to
281            // (current max vector index version + 1), even if only one subtype
282            // changed. Compatibility filtering currently cannot distinguish vector
283            // subtypes from details-only metadata, so vector versions effectively
284            // share one global monotonic compatibility level.
285            Self::Vector
286            | Self::IvfFlat
287            | Self::IvfSq
288            | Self::IvfPq
289            | Self::IvfHnswSq
290            | Self::IvfHnswPq
291            | Self::IvfHnswFlat => VECTOR_INDEX_VERSION as i32,
292            Self::IvfRq => IVF_RQ_INDEX_VERSION as i32,
293        }
294    }
295
296    /// Returns the target partition size for the index type.
297    ///
298    /// This is used to compute the number of partitions for the index.
299    /// The partition size is optimized for the best performance of the index.
300    ///
301    /// This is for vector indices only.
302    pub fn target_partition_size(&self) -> usize {
303        match self {
304            Self::Vector => 8192,
305            Self::IvfFlat => 4096,
306            Self::IvfSq => 8192,
307            Self::IvfPq => 8192,
308            Self::IvfHnswFlat => 1 << 20,
309            Self::IvfHnswSq => 1 << 20,
310            Self::IvfHnswPq => 1 << 20,
311            _ => 8192,
312        }
313    }
314
315    /// Returns the highest supported vector index version in this Lance build.
316    pub fn max_vector_version() -> u32 {
317        [
318            Self::Vector,
319            Self::IvfFlat,
320            Self::IvfSq,
321            Self::IvfPq,
322            Self::IvfHnswSq,
323            Self::IvfHnswPq,
324            Self::IvfHnswFlat,
325            Self::IvfRq,
326        ]
327        .into_iter()
328        .map(|index_type| index_type.version() as u32)
329        .max()
330        .unwrap_or(VECTOR_INDEX_VERSION)
331    }
332}
333
334pub trait IndexParams: Send + Sync {
335    fn as_any(&self) -> &dyn Any;
336
337    fn index_name(&self) -> &str;
338}
339
340#[derive(Serialize, Deserialize, Debug)]
341pub struct IndexMetadata {
342    #[serde(rename = "type")]
343    pub index_type: String,
344    pub distance_type: String,
345}
346
347pub fn is_system_index(index_meta: &lance_table::format::IndexMetadata) -> bool {
348    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
349}
350
351pub fn infer_system_index_type(
352    index_meta: &lance_table::format::IndexMetadata,
353) -> Option<IndexType> {
354    if index_meta.name == FRAG_REUSE_INDEX_NAME {
355        Some(IndexType::FragmentReuse)
356    } else if index_meta.name == MEM_WAL_INDEX_NAME {
357        Some(IndexType::MemWal)
358    } else {
359        None
360    }
361}
362
363#[cfg(test)]
364mod tests {
365    use super::*;
366
367    #[test]
368    fn test_ivf_rq_has_dedicated_index_version() {
369        assert!(IndexType::IvfRq.version() > IndexType::IvfPq.version());
370        assert_eq!(IndexType::IvfRq.version() as u32, IVF_RQ_INDEX_VERSION);
371    }
372
373    #[test]
374    fn test_max_vector_version_tracks_highest_supported() {
375        assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION);
376    }
377
378    #[test]
379    fn test_index_type_try_from_i32_covers_all_variants() {
380        let all = [
381            IndexType::Scalar,
382            IndexType::BTree,
383            IndexType::Bitmap,
384            IndexType::LabelList,
385            IndexType::Inverted,
386            IndexType::NGram,
387            IndexType::FragmentReuse,
388            IndexType::MemWal,
389            IndexType::ZoneMap,
390            IndexType::BloomFilter,
391            IndexType::RTree,
392            IndexType::Vector,
393            IndexType::IvfFlat,
394            IndexType::IvfSq,
395            IndexType::IvfPq,
396            IndexType::IvfHnswSq,
397            IndexType::IvfHnswPq,
398            IndexType::IvfHnswFlat,
399            IndexType::IvfRq,
400        ];
401
402        for index_type in all {
403            assert_eq!(
404                IndexType::try_from(index_type as i32).unwrap(),
405                index_type,
406                "IndexType::try_from(i32) should support {:?}",
407                index_type
408            );
409        }
410    }
411
412    #[test]
413    fn test_index_type_try_from_str_covers_all_parseable_variants() {
414        let cases = [
415            ("BTree", IndexType::BTree),
416            ("BTREE", IndexType::BTree),
417            ("Bitmap", IndexType::Bitmap),
418            ("BITMAP", IndexType::Bitmap),
419            ("LabelList", IndexType::LabelList),
420            ("LABELLIST", IndexType::LabelList),
421            ("Inverted", IndexType::Inverted),
422            ("INVERTED", IndexType::Inverted),
423            ("NGram", IndexType::NGram),
424            ("NGRAM", IndexType::NGram),
425            ("ZoneMap", IndexType::ZoneMap),
426            ("ZONEMAP", IndexType::ZoneMap),
427            ("BloomFilter", IndexType::BloomFilter),
428            ("BLOOMFILTER", IndexType::BloomFilter),
429            ("BLOOM_FILTER", IndexType::BloomFilter),
430            ("RTree", IndexType::RTree),
431            ("RTREE", IndexType::RTree),
432            ("R_TREE", IndexType::RTree),
433            ("Vector", IndexType::Vector),
434            ("VECTOR", IndexType::Vector),
435            ("IVF_FLAT", IndexType::IvfFlat),
436            ("IVF_SQ", IndexType::IvfSq),
437            ("IVF_PQ", IndexType::IvfPq),
438            ("IVF_RQ", IndexType::IvfRq),
439            ("IVF_HNSW_FLAT", IndexType::IvfHnswFlat),
440            ("IVF_HNSW_SQ", IndexType::IvfHnswSq),
441            ("IVF_HNSW_PQ", IndexType::IvfHnswPq),
442            ("FragmentReuse", IndexType::FragmentReuse),
443            ("MemWal", IndexType::MemWal),
444        ];
445
446        for (text, expected) in cases {
447            assert_eq!(
448                IndexType::try_from(text).unwrap(),
449                expected,
450                "IndexType::try_from(&str) should support '{text}'"
451            );
452        }
453    }
454}