lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lancedb/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use snafu::location;
22use std::convert::TryFrom;
23
24pub mod frag_reuse;
25pub mod mem_wal;
26pub mod metrics;
27pub mod optimize;
28pub mod prefilter;
29pub mod scalar;
30pub mod traits;
31pub mod vector;
32
33pub use crate::traits::*;
34
35pub const INDEX_FILE_NAME: &str = "index.idx";
36/// The name of the auxiliary index file.
37///
38/// This file is used to store additional information about the index, to improve performance.
39/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
40pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
41pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
42
43// Currently all vector indexes are version 1
44pub const VECTOR_INDEX_VERSION: u32 = 1;
45
46pub mod pb {
47    #![allow(clippy::use_self)]
48    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
49}
50
51pub mod pbold {
52    #![allow(clippy::use_self)]
53    include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
54}
55
56/// Generic methods common across all types of secondary indices
57///
58#[async_trait]
59pub trait Index: Send + Sync + DeepSizeOf {
60    /// Cast to [Any].
61    fn as_any(&self) -> &dyn Any;
62
63    /// Cast to [Index]
64    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
65
66    /// Cast to [vector::VectorIndex]
67    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
68
69    /// Retrieve index statistics as a JSON Value
70    fn statistics(&self) -> Result<serde_json::Value>;
71
72    /// Prewarm the index.
73    ///
74    /// This will load the index into memory and cache it.
75    async fn prewarm(&self) -> Result<()>;
76
77    /// Get the type of the index
78    fn index_type(&self) -> IndexType;
79
80    /// Read through the index and determine which fragment ids are covered by the index
81    ///
82    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
83    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
84    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
85}
86
87/// Index Type
88#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
89pub enum IndexType {
90    // Preserve 0-100 for simple indices.
91    Scalar = 0, // Legacy scalar index, alias to BTree
92
93    BTree = 1, // BTree
94
95    Bitmap = 2, // Bitmap
96
97    LabelList = 3, // LabelList
98
99    Inverted = 4, // Inverted
100
101    NGram = 5, // NGram
102
103    FragmentReuse = 6,
104
105    MemWal = 7,
106
107    ZoneMap = 8, // ZoneMap
108
109    BloomFilter = 9, // Bloom filter
110
111    // 100+ and up for vector index.
112    /// Flat vector index.
113    Vector = 100, // Legacy vector index, alias to IvfPq
114    IvfFlat = 101,
115    IvfSq = 102,
116    IvfPq = 103,
117    IvfHnswSq = 104,
118    IvfHnswPq = 105,
119    IvfHnswFlat = 106,
120    IvfRq = 107,
121}
122
123impl std::fmt::Display for IndexType {
124    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
125        match self {
126            Self::Scalar | Self::BTree => write!(f, "BTree"),
127            Self::Bitmap => write!(f, "Bitmap"),
128            Self::LabelList => write!(f, "LabelList"),
129            Self::Inverted => write!(f, "Inverted"),
130            Self::NGram => write!(f, "NGram"),
131            Self::FragmentReuse => write!(f, "FragmentReuse"),
132            Self::MemWal => write!(f, "MemWal"),
133            Self::ZoneMap => write!(f, "ZoneMap"),
134            Self::BloomFilter => write!(f, "BloomFilter"),
135            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
136            Self::IvfFlat => write!(f, "IVF_FLAT"),
137            Self::IvfSq => write!(f, "IVF_SQ"),
138            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
139            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
140            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
141            Self::IvfRq => write!(f, "IVF_RQ"),
142        }
143    }
144}
145
146impl TryFrom<i32> for IndexType {
147    type Error = Error;
148
149    fn try_from(value: i32) -> Result<Self> {
150        match value {
151            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
152            v if v == Self::BTree as i32 => Ok(Self::BTree),
153            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
154            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
155            v if v == Self::NGram as i32 => Ok(Self::NGram),
156            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
157            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
158            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
159            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
160            v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter),
161            v if v == Self::Vector as i32 => Ok(Self::Vector),
162            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
163            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
164            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
165            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
166            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
167            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
168            _ => Err(Error::InvalidInput {
169                source: format!("the input value {} is not a valid IndexType", value).into(),
170                location: location!(),
171            }),
172        }
173    }
174}
175
176impl TryFrom<&str> for IndexType {
177    type Error = Error;
178
179    fn try_from(value: &str) -> Result<Self> {
180        match value {
181            "BTree" => Ok(Self::BTree),
182            "Bitmap" => Ok(Self::Bitmap),
183            "LabelList" => Ok(Self::LabelList),
184            "Inverted" => Ok(Self::Inverted),
185            "NGram" => Ok(Self::NGram),
186            "FragmentReuse" => Ok(Self::FragmentReuse),
187            "MemWal" => Ok(Self::MemWal),
188            "ZoneMap" => Ok(Self::ZoneMap),
189            "Vector" => Ok(Self::Vector),
190            "IVF_FLAT" => Ok(Self::IvfFlat),
191            "IVF_SQ" => Ok(Self::IvfSq),
192            "IVF_PQ" => Ok(Self::IvfPq),
193            "IVF_RQ" => Ok(Self::IvfRq),
194            "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
195            "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
196            "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
197            _ => Err(Error::invalid_input(
198                format!("invalid index type: {}", value),
199                location!(),
200            )),
201        }
202    }
203}
204
205impl IndexType {
206    pub fn is_scalar(&self) -> bool {
207        matches!(
208            self,
209            Self::Scalar
210                | Self::BTree
211                | Self::Bitmap
212                | Self::LabelList
213                | Self::Inverted
214                | Self::NGram
215                | Self::ZoneMap
216                | Self::BloomFilter
217        )
218    }
219
220    pub fn is_vector(&self) -> bool {
221        matches!(
222            self,
223            Self::Vector
224                | Self::IvfPq
225                | Self::IvfHnswSq
226                | Self::IvfHnswPq
227                | Self::IvfHnswFlat
228                | Self::IvfFlat
229                | Self::IvfSq
230                | Self::IvfRq
231        )
232    }
233
234    pub fn is_system(&self) -> bool {
235        matches!(self, Self::FragmentReuse | Self::MemWal)
236    }
237
238    /// Returns the current format version of the index type,
239    /// bump this when the index format changes.
240    /// Indices which higher version than these will be ignored for compatibility,
241    /// This would happen when creating index in a newer version of Lance,
242    /// but then opening the index in older version of Lance
243    pub fn version(&self) -> i32 {
244        match self {
245            Self::Scalar => 0,
246            Self::BTree => 0,
247            Self::Bitmap => 0,
248            Self::LabelList => 0,
249            Self::Inverted => 0,
250            Self::NGram => 0,
251            Self::FragmentReuse => 0,
252            Self::MemWal => 0,
253            Self::ZoneMap => 0,
254            Self::BloomFilter => 0,
255
256            // for now all vector indices are built by the same builder,
257            // so they share the same version.
258            Self::Vector
259            | Self::IvfFlat
260            | Self::IvfSq
261            | Self::IvfPq
262            | Self::IvfHnswSq
263            | Self::IvfHnswPq
264            | Self::IvfHnswFlat
265            | Self::IvfRq => 1,
266        }
267    }
268
269    /// Returns the target partition size for the index type.
270    ///
271    /// This is used to compute the number of partitions for the index.
272    /// The partition size is optimized for the best performance of the index.
273    ///
274    /// This is for vector indices only.
275    pub fn target_partition_size(&self) -> usize {
276        match self {
277            Self::Vector => 8192,
278            Self::IvfFlat => 4096,
279            Self::IvfSq => 8192,
280            Self::IvfPq => 8192,
281            Self::IvfHnswFlat => 1 << 20,
282            Self::IvfHnswSq => 1 << 20,
283            Self::IvfHnswPq => 1 << 20,
284            _ => 8192,
285        }
286    }
287}
288
289pub trait IndexParams: Send + Sync {
290    fn as_any(&self) -> &dyn Any;
291
292    fn index_name(&self) -> &str;
293}
294
295#[derive(Serialize, Deserialize, Debug)]
296pub struct IndexMetadata {
297    #[serde(rename = "type")]
298    pub index_type: String,
299    pub distance_type: String,
300}
301
302pub fn is_system_index(index_meta: &lance_table::format::IndexMetadata) -> bool {
303    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
304}
305
306pub fn infer_system_index_type(
307    index_meta: &lance_table::format::IndexMetadata,
308) -> Option<IndexType> {
309    if index_meta.name == FRAG_REUSE_INDEX_NAME {
310        Some(IndexType::FragmentReuse)
311    } else if index_meta.name == MEM_WAL_INDEX_NAME {
312        Some(IndexType::MemWal)
313    } else {
314        None
315    }
316}