lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lancedb/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use snafu::location;
22use std::convert::TryFrom;
23
24pub mod frag_reuse;
25pub mod mem_wal;
26pub mod metrics;
27pub mod optimize;
28pub mod prefilter;
29pub mod scalar;
30pub mod traits;
31pub mod vector;
32
33pub use crate::traits::*;
34
35pub const INDEX_FILE_NAME: &str = "index.idx";
36/// The name of the auxiliary index file.
37///
38/// This file is used to store additional information about the index, to improve performance.
39/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
40pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
41pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
42
43pub mod pb {
44    #![allow(clippy::use_self)]
45    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
46}
47
48/// Generic methods common across all types of secondary indices
49///
50#[async_trait]
51pub trait Index: Send + Sync + DeepSizeOf {
52    /// Cast to [Any].
53    fn as_any(&self) -> &dyn Any;
54
55    /// Cast to [Index]
56    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
57
58    /// Cast to [vector::VectorIndex]
59    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
60
61    /// Retrieve index statistics as a JSON Value
62    fn statistics(&self) -> Result<serde_json::Value>;
63
64    /// Prewarm the index.
65    ///
66    /// This will load the index into memory and cache it.
67    async fn prewarm(&self) -> Result<()>;
68
69    /// Get the type of the index
70    fn index_type(&self) -> IndexType;
71
72    /// Read through the index and determine which fragment ids are covered by the index
73    ///
74    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
75    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
76    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
77}
78
79/// Index Type
80#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
81pub enum IndexType {
82    // Preserve 0-100 for simple indices.
83    Scalar = 0, // Legacy scalar index, alias to BTree
84
85    BTree = 1, // BTree
86
87    Bitmap = 2, // Bitmap
88
89    LabelList = 3, // LabelList
90
91    Inverted = 4, // Inverted
92
93    NGram = 5, // NGram
94
95    FragmentReuse = 6,
96
97    MemWal = 7,
98
99    ZoneMap = 8, // ZoneMap
100
101    // 100+ and up for vector index.
102    /// Flat vector index.
103    Vector = 100, // Legacy vector index, alias to IvfPq
104    IvfFlat = 101,
105    IvfSq = 102,
106    IvfPq = 103,
107    IvfHnswSq = 104,
108    IvfHnswPq = 105,
109    IvfHnswFlat = 106,
110}
111
112impl std::fmt::Display for IndexType {
113    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
114        match self {
115            Self::Scalar | Self::BTree => write!(f, "BTree"),
116            Self::Bitmap => write!(f, "Bitmap"),
117            Self::LabelList => write!(f, "LabelList"),
118            Self::Inverted => write!(f, "Inverted"),
119            Self::NGram => write!(f, "NGram"),
120            Self::FragmentReuse => write!(f, "FragmentReuse"),
121            Self::MemWal => write!(f, "MemWal"),
122            Self::ZoneMap => write!(f, "ZoneMap"),
123            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
124            Self::IvfFlat => write!(f, "IVF_FLAT"),
125            Self::IvfSq => write!(f, "IVF_SQ"),
126            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
127            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
128            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
129        }
130    }
131}
132
133impl TryFrom<i32> for IndexType {
134    type Error = Error;
135
136    fn try_from(value: i32) -> Result<Self> {
137        match value {
138            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
139            v if v == Self::BTree as i32 => Ok(Self::BTree),
140            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
141            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
142            v if v == Self::NGram as i32 => Ok(Self::NGram),
143            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
144            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
145            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
146            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
147            v if v == Self::Vector as i32 => Ok(Self::Vector),
148            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
149            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
150            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
151            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
152            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
153            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
154            _ => Err(Error::InvalidInput {
155                source: format!("the input value {} is not a valid IndexType", value).into(),
156                location: location!(),
157            }),
158        }
159    }
160}
161
162impl IndexType {
163    pub fn is_scalar(&self) -> bool {
164        matches!(
165            self,
166            Self::Scalar
167                | Self::BTree
168                | Self::Bitmap
169                | Self::LabelList
170                | Self::Inverted
171                | Self::NGram
172                | Self::ZoneMap
173        )
174    }
175
176    pub fn is_vector(&self) -> bool {
177        matches!(
178            self,
179            Self::Vector
180                | Self::IvfPq
181                | Self::IvfHnswSq
182                | Self::IvfHnswPq
183                | Self::IvfHnswFlat
184                | Self::IvfFlat
185                | Self::IvfSq
186        )
187    }
188
189    pub fn is_system(&self) -> bool {
190        matches!(self, Self::FragmentReuse | Self::MemWal)
191    }
192
193    /// Returns the current format version of the index type,
194    /// bump this when the index format changes.
195    /// Indices which higher version than these will be ignored for compatibility,
196    /// This would happen when creating index in a newer version of Lance,
197    /// but then opening the index in older version of Lance
198    pub fn version(&self) -> i32 {
199        match self {
200            Self::Scalar => 0,
201            Self::BTree => 0,
202            Self::Bitmap => 0,
203            Self::LabelList => 0,
204            Self::Inverted => 0,
205            Self::NGram => 0,
206            Self::FragmentReuse => 0,
207            Self::MemWal => 0,
208            Self::ZoneMap => 0,
209
210            // for now all vector indices are built by the same builder,
211            // so they share the same version.
212            Self::Vector
213            | Self::IvfFlat
214            | Self::IvfSq
215            | Self::IvfPq
216            | Self::IvfHnswSq
217            | Self::IvfHnswPq
218            | Self::IvfHnswFlat => 1,
219        }
220    }
221}
222
223pub trait IndexParams: Send + Sync {
224    fn as_any(&self) -> &dyn Any;
225
226    fn index_type(&self) -> IndexType;
227
228    fn index_name(&self) -> &str;
229}
230
231#[derive(Serialize, Deserialize, Debug)]
232pub struct IndexMetadata {
233    #[serde(rename = "type")]
234    pub index_type: String,
235    pub distance_type: String,
236}
237
238pub fn is_system_index(index_meta: &lance_table::format::Index) -> bool {
239    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
240}
241
242pub fn infer_system_index_type(index_meta: &lance_table::format::Index) -> Option<IndexType> {
243    if index_meta.name == FRAG_REUSE_INDEX_NAME {
244        Some(IndexType::FragmentReuse)
245    } else if index_meta.name == MEM_WAL_INDEX_NAME {
246        Some(IndexType::MemWal)
247    } else {
248        None
249    }
250}