Skip to main content

lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lance-format/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use snafu::location;
22use std::convert::TryFrom;
23
24pub mod frag_reuse;
25pub mod mem_wal;
26pub mod metrics;
27pub mod optimize;
28pub mod prefilter;
29pub mod registry;
30pub mod scalar;
31pub mod traits;
32pub mod vector;
33
34pub use crate::traits::*;
35
36pub const INDEX_FILE_NAME: &str = "index.idx";
37/// The name of the auxiliary index file.
38///
39/// This file is used to store additional information about the index, to improve performance.
40/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
41pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
42pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
43
44// Currently all vector indexes are version 1
45pub const VECTOR_INDEX_VERSION: u32 = 1;
46
47/// The factor of threshold to trigger split / join for vector index.
48///
49/// If the number of rows in the single partition is greater than `MAX_PARTITION_SIZE_FACTOR * target_partition_size`,
50/// the partition will be split.
51/// If the number of rows in the single partition is less than `MIN_PARTITION_SIZE_PERCENT *target_partition_size / 100`,
52/// the partition will be joined.
53pub const MAX_PARTITION_SIZE_FACTOR: usize = 4;
54pub const MIN_PARTITION_SIZE_PERCENT: usize = 25;
55
56pub mod pb {
57    #![allow(clippy::use_self)]
58    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
59}
60
61pub mod pbold {
62    #![allow(clippy::use_self)]
63    include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
64}
65
66/// Generic methods common across all types of secondary indices
67///
68#[async_trait]
69pub trait Index: Send + Sync + DeepSizeOf {
70    /// Cast to [Any].
71    fn as_any(&self) -> &dyn Any;
72
73    /// Cast to [Index]
74    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
75
76    /// Cast to [vector::VectorIndex]
77    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
78
79    /// Retrieve index statistics as a JSON Value
80    fn statistics(&self) -> Result<serde_json::Value>;
81
82    /// Prewarm the index.
83    ///
84    /// This will load the index into memory and cache it.
85    async fn prewarm(&self) -> Result<()>;
86
87    /// Get the type of the index
88    fn index_type(&self) -> IndexType;
89
90    /// Read through the index and determine which fragment ids are covered by the index
91    ///
92    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
93    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
94    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
95}
96
97/// Index Type
98#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
99pub enum IndexType {
100    // Preserve 0-100 for simple indices.
101    Scalar = 0, // Legacy scalar index, alias to BTree
102
103    BTree = 1, // BTree
104
105    Bitmap = 2, // Bitmap
106
107    LabelList = 3, // LabelList
108
109    Inverted = 4, // Inverted
110
111    NGram = 5, // NGram
112
113    FragmentReuse = 6,
114
115    MemWal = 7,
116
117    ZoneMap = 8, // ZoneMap
118
119    BloomFilter = 9, // Bloom filter
120
121    RTree = 10, // RTree
122
123    // 100+ and up for vector index.
124    /// Flat vector index.
125    Vector = 100, // Legacy vector index, alias to IvfPq
126    IvfFlat = 101,
127    IvfSq = 102,
128    IvfPq = 103,
129    IvfHnswSq = 104,
130    IvfHnswPq = 105,
131    IvfHnswFlat = 106,
132    IvfRq = 107,
133}
134
135impl std::fmt::Display for IndexType {
136    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
137        match self {
138            Self::Scalar | Self::BTree => write!(f, "BTree"),
139            Self::Bitmap => write!(f, "Bitmap"),
140            Self::LabelList => write!(f, "LabelList"),
141            Self::Inverted => write!(f, "Inverted"),
142            Self::NGram => write!(f, "NGram"),
143            Self::FragmentReuse => write!(f, "FragmentReuse"),
144            Self::MemWal => write!(f, "MemWal"),
145            Self::ZoneMap => write!(f, "ZoneMap"),
146            Self::BloomFilter => write!(f, "BloomFilter"),
147            Self::RTree => write!(f, "RTree"),
148            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
149            Self::IvfFlat => write!(f, "IVF_FLAT"),
150            Self::IvfSq => write!(f, "IVF_SQ"),
151            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
152            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
153            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
154            Self::IvfRq => write!(f, "IVF_RQ"),
155        }
156    }
157}
158
159impl TryFrom<i32> for IndexType {
160    type Error = Error;
161
162    fn try_from(value: i32) -> Result<Self> {
163        match value {
164            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
165            v if v == Self::BTree as i32 => Ok(Self::BTree),
166            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
167            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
168            v if v == Self::NGram as i32 => Ok(Self::NGram),
169            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
170            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
171            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
172            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
173            v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter),
174            v if v == Self::Vector as i32 => Ok(Self::Vector),
175            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
176            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
177            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
178            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
179            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
180            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
181            _ => Err(Error::InvalidInput {
182                source: format!("the input value {} is not a valid IndexType", value).into(),
183                location: location!(),
184            }),
185        }
186    }
187}
188
189impl TryFrom<&str> for IndexType {
190    type Error = Error;
191
192    fn try_from(value: &str) -> Result<Self> {
193        match value {
194            "BTree" => Ok(Self::BTree),
195            "Bitmap" => Ok(Self::Bitmap),
196            "LabelList" => Ok(Self::LabelList),
197            "Inverted" => Ok(Self::Inverted),
198            "NGram" => Ok(Self::NGram),
199            "FragmentReuse" => Ok(Self::FragmentReuse),
200            "MemWal" => Ok(Self::MemWal),
201            "ZoneMap" => Ok(Self::ZoneMap),
202            "Vector" => Ok(Self::Vector),
203            "IVF_FLAT" => Ok(Self::IvfFlat),
204            "IVF_SQ" => Ok(Self::IvfSq),
205            "IVF_PQ" => Ok(Self::IvfPq),
206            "IVF_RQ" => Ok(Self::IvfRq),
207            "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
208            "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
209            "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
210            _ => Err(Error::invalid_input(
211                format!("invalid index type: {}", value),
212                location!(),
213            )),
214        }
215    }
216}
217
218impl IndexType {
219    pub fn is_scalar(&self) -> bool {
220        matches!(
221            self,
222            Self::Scalar
223                | Self::BTree
224                | Self::Bitmap
225                | Self::LabelList
226                | Self::Inverted
227                | Self::NGram
228                | Self::ZoneMap
229                | Self::BloomFilter
230                | Self::RTree,
231        )
232    }
233
234    pub fn is_vector(&self) -> bool {
235        matches!(
236            self,
237            Self::Vector
238                | Self::IvfPq
239                | Self::IvfHnswSq
240                | Self::IvfHnswPq
241                | Self::IvfHnswFlat
242                | Self::IvfFlat
243                | Self::IvfSq
244                | Self::IvfRq
245        )
246    }
247
248    pub fn is_system(&self) -> bool {
249        matches!(self, Self::FragmentReuse | Self::MemWal)
250    }
251
252    /// Returns the current format version of the index type,
253    /// bump this when the index format changes.
254    /// Indices which higher version than these will be ignored for compatibility,
255    /// This would happen when creating index in a newer version of Lance,
256    /// but then opening the index in older version of Lance
257    pub fn version(&self) -> i32 {
258        match self {
259            Self::Scalar => 0,
260            Self::BTree => 0,
261            Self::Bitmap => 0,
262            Self::LabelList => 0,
263            Self::Inverted => 0,
264            Self::NGram => 0,
265            Self::FragmentReuse => 0,
266            Self::MemWal => 0,
267            Self::ZoneMap => 0,
268            Self::BloomFilter => 0,
269            Self::RTree => 0,
270
271            // for now all vector indices are built by the same builder,
272            // so they share the same version.
273            Self::Vector
274            | Self::IvfFlat
275            | Self::IvfSq
276            | Self::IvfPq
277            | Self::IvfHnswSq
278            | Self::IvfHnswPq
279            | Self::IvfHnswFlat
280            | Self::IvfRq => 1,
281        }
282    }
283
284    /// Returns the target partition size for the index type.
285    ///
286    /// This is used to compute the number of partitions for the index.
287    /// The partition size is optimized for the best performance of the index.
288    ///
289    /// This is for vector indices only.
290    pub fn target_partition_size(&self) -> usize {
291        match self {
292            Self::Vector => 8192,
293            Self::IvfFlat => 4096,
294            Self::IvfSq => 8192,
295            Self::IvfPq => 8192,
296            Self::IvfHnswFlat => 1 << 20,
297            Self::IvfHnswSq => 1 << 20,
298            Self::IvfHnswPq => 1 << 20,
299            _ => 8192,
300        }
301    }
302}
303
304pub trait IndexParams: Send + Sync {
305    fn as_any(&self) -> &dyn Any;
306
307    fn index_name(&self) -> &str;
308}
309
310#[derive(Serialize, Deserialize, Debug)]
311pub struct IndexMetadata {
312    #[serde(rename = "type")]
313    pub index_type: String,
314    pub distance_type: String,
315}
316
317pub fn is_system_index(index_meta: &lance_table::format::IndexMetadata) -> bool {
318    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
319}
320
321pub fn infer_system_index_type(
322    index_meta: &lance_table::format::IndexMetadata,
323) -> Option<IndexType> {
324    if index_meta.name == FRAG_REUSE_INDEX_NAME {
325        Some(IndexType::FragmentReuse)
326    } else if index_meta.name == MEM_WAL_INDEX_NAME {
327        Some(IndexType::MemWal)
328    } else {
329        None
330    }
331}