lance_index/
lib.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Lance secondary index library
5//!
6//! <section class="warning">
7//! This is internal crate used by <a href="https://github.com/lancedb/lance">the lance project</a>.
8//! <br/>
9//! API stability is not guaranteed.
10//! </section>
11
12use std::{any::Any, sync::Arc};
13
14use crate::frag_reuse::FRAG_REUSE_INDEX_NAME;
15use crate::mem_wal::MEM_WAL_INDEX_NAME;
16use async_trait::async_trait;
17use deepsize::DeepSizeOf;
18use lance_core::{Error, Result};
19use roaring::RoaringBitmap;
20use serde::{Deserialize, Serialize};
21use snafu::location;
22use std::convert::TryFrom;
23
24pub mod frag_reuse;
25pub mod mem_wal;
26pub mod metrics;
27pub mod optimize;
28pub mod prefilter;
29pub mod scalar;
30pub mod traits;
31pub mod vector;
32
33pub use crate::traits::*;
34
35pub const INDEX_FILE_NAME: &str = "index.idx";
36/// The name of the auxiliary index file.
37///
38/// This file is used to store additional information about the index, to improve performance.
39/// - For 'IVF_HNSW' index, it stores the partitioned PQ Storage.
40pub const INDEX_AUXILIARY_FILE_NAME: &str = "auxiliary.idx";
41pub const INDEX_METADATA_SCHEMA_KEY: &str = "lance:index";
42
43// Currently all vector indexes are version 1
44pub const VECTOR_INDEX_VERSION: u32 = 1;
45
46pub mod pb {
47    #![allow(clippy::use_self)]
48    include!(concat!(env!("OUT_DIR"), "/lance.index.pb.rs"));
49}
50
51/// Generic methods common across all types of secondary indices
52///
53#[async_trait]
54pub trait Index: Send + Sync + DeepSizeOf {
55    /// Cast to [Any].
56    fn as_any(&self) -> &dyn Any;
57
58    /// Cast to [Index]
59    fn as_index(self: Arc<Self>) -> Arc<dyn Index>;
60
61    /// Cast to [vector::VectorIndex]
62    fn as_vector_index(self: Arc<Self>) -> Result<Arc<dyn vector::VectorIndex>>;
63
64    /// Retrieve index statistics as a JSON Value
65    fn statistics(&self) -> Result<serde_json::Value>;
66
67    /// Prewarm the index.
68    ///
69    /// This will load the index into memory and cache it.
70    async fn prewarm(&self) -> Result<()>;
71
72    /// Get the type of the index
73    fn index_type(&self) -> IndexType;
74
75    /// Read through the index and determine which fragment ids are covered by the index
76    ///
77    /// This is a kind of slow operation.  It's better to use the fragment_bitmap.  This
78    /// only exists for cases where the fragment_bitmap has become corrupted or missing.
79    async fn calculate_included_frags(&self) -> Result<RoaringBitmap>;
80}
81
82/// Index Type
83#[derive(Debug, PartialEq, Eq, Copy, Hash, Clone, DeepSizeOf)]
84pub enum IndexType {
85    // Preserve 0-100 for simple indices.
86    Scalar = 0, // Legacy scalar index, alias to BTree
87
88    BTree = 1, // BTree
89
90    Bitmap = 2, // Bitmap
91
92    LabelList = 3, // LabelList
93
94    Inverted = 4, // Inverted
95
96    NGram = 5, // NGram
97
98    FragmentReuse = 6,
99
100    MemWal = 7,
101
102    ZoneMap = 8, // ZoneMap
103
104    BloomFilter = 9, // Bloom filter
105
106    // 100+ and up for vector index.
107    /// Flat vector index.
108    Vector = 100, // Legacy vector index, alias to IvfPq
109    IvfFlat = 101,
110    IvfSq = 102,
111    IvfPq = 103,
112    IvfHnswSq = 104,
113    IvfHnswPq = 105,
114    IvfHnswFlat = 106,
115}
116
117impl std::fmt::Display for IndexType {
118    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
119        match self {
120            Self::Scalar | Self::BTree => write!(f, "BTree"),
121            Self::Bitmap => write!(f, "Bitmap"),
122            Self::LabelList => write!(f, "LabelList"),
123            Self::Inverted => write!(f, "Inverted"),
124            Self::NGram => write!(f, "NGram"),
125            Self::FragmentReuse => write!(f, "FragmentReuse"),
126            Self::MemWal => write!(f, "MemWal"),
127            Self::ZoneMap => write!(f, "ZoneMap"),
128            Self::BloomFilter => write!(f, "BloomFilter"),
129            Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"),
130            Self::IvfFlat => write!(f, "IVF_FLAT"),
131            Self::IvfSq => write!(f, "IVF_SQ"),
132            Self::IvfHnswSq => write!(f, "IVF_HNSW_SQ"),
133            Self::IvfHnswPq => write!(f, "IVF_HNSW_PQ"),
134            Self::IvfHnswFlat => write!(f, "IVF_HNSW_FLAT"),
135        }
136    }
137}
138
139impl TryFrom<i32> for IndexType {
140    type Error = Error;
141
142    fn try_from(value: i32) -> Result<Self> {
143        match value {
144            v if v == Self::Scalar as i32 => Ok(Self::Scalar),
145            v if v == Self::BTree as i32 => Ok(Self::BTree),
146            v if v == Self::Bitmap as i32 => Ok(Self::Bitmap),
147            v if v == Self::LabelList as i32 => Ok(Self::LabelList),
148            v if v == Self::NGram as i32 => Ok(Self::NGram),
149            v if v == Self::Inverted as i32 => Ok(Self::Inverted),
150            v if v == Self::FragmentReuse as i32 => Ok(Self::FragmentReuse),
151            v if v == Self::MemWal as i32 => Ok(Self::MemWal),
152            v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap),
153            v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter),
154            v if v == Self::Vector as i32 => Ok(Self::Vector),
155            v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat),
156            v if v == Self::IvfSq as i32 => Ok(Self::IvfSq),
157            v if v == Self::IvfPq as i32 => Ok(Self::IvfPq),
158            v if v == Self::IvfHnswSq as i32 => Ok(Self::IvfHnswSq),
159            v if v == Self::IvfHnswPq as i32 => Ok(Self::IvfHnswPq),
160            v if v == Self::IvfHnswFlat as i32 => Ok(Self::IvfHnswFlat),
161            _ => Err(Error::InvalidInput {
162                source: format!("the input value {} is not a valid IndexType", value).into(),
163                location: location!(),
164            }),
165        }
166    }
167}
168
169impl TryFrom<&str> for IndexType {
170    type Error = Error;
171
172    fn try_from(value: &str) -> Result<Self> {
173        match value {
174            "BTree" => Ok(Self::BTree),
175            "Bitmap" => Ok(Self::Bitmap),
176            "LabelList" => Ok(Self::LabelList),
177            "Inverted" => Ok(Self::Inverted),
178            "NGram" => Ok(Self::NGram),
179            "FragmentReuse" => Ok(Self::FragmentReuse),
180            "MemWal" => Ok(Self::MemWal),
181            "ZoneMap" => Ok(Self::ZoneMap),
182            "Vector" => Ok(Self::Vector),
183            "IVF_FLAT" => Ok(Self::IvfFlat),
184            "IVF_SQ" => Ok(Self::IvfSq),
185            "IVF_PQ" => Ok(Self::IvfPq),
186            "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat),
187            "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq),
188            "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq),
189            _ => Err(Error::invalid_input(
190                format!("invalid index type: {}", value),
191                location!(),
192            )),
193        }
194    }
195}
196
197impl IndexType {
198    pub fn is_scalar(&self) -> bool {
199        matches!(
200            self,
201            Self::Scalar
202                | Self::BTree
203                | Self::Bitmap
204                | Self::LabelList
205                | Self::Inverted
206                | Self::NGram
207                | Self::ZoneMap
208                | Self::BloomFilter
209        )
210    }
211
212    pub fn is_vector(&self) -> bool {
213        matches!(
214            self,
215            Self::Vector
216                | Self::IvfPq
217                | Self::IvfHnswSq
218                | Self::IvfHnswPq
219                | Self::IvfHnswFlat
220                | Self::IvfFlat
221                | Self::IvfSq
222        )
223    }
224
225    pub fn is_system(&self) -> bool {
226        matches!(self, Self::FragmentReuse | Self::MemWal)
227    }
228
229    /// Returns the current format version of the index type,
230    /// bump this when the index format changes.
231    /// Indices which higher version than these will be ignored for compatibility,
232    /// This would happen when creating index in a newer version of Lance,
233    /// but then opening the index in older version of Lance
234    pub fn version(&self) -> i32 {
235        match self {
236            Self::Scalar => 0,
237            Self::BTree => 0,
238            Self::Bitmap => 0,
239            Self::LabelList => 0,
240            Self::Inverted => 0,
241            Self::NGram => 0,
242            Self::FragmentReuse => 0,
243            Self::MemWal => 0,
244            Self::ZoneMap => 0,
245            Self::BloomFilter => 0,
246
247            // for now all vector indices are built by the same builder,
248            // so they share the same version.
249            Self::Vector
250            | Self::IvfFlat
251            | Self::IvfSq
252            | Self::IvfPq
253            | Self::IvfHnswSq
254            | Self::IvfHnswPq
255            | Self::IvfHnswFlat => 1,
256        }
257    }
258
259    /// Returns the target partition size for the index type.
260    ///
261    /// This is used to compute the number of partitions for the index.
262    /// The partition size is optimized for the best performance of the index.
263    ///
264    /// This is for vector indices only.
265    pub fn target_partition_size(&self) -> usize {
266        match self {
267            Self::Vector => 8192,
268            Self::IvfFlat => 4096,
269            Self::IvfSq => 8192,
270            Self::IvfPq => 8192,
271            Self::IvfHnswFlat => 1 << 20,
272            Self::IvfHnswSq => 1 << 20,
273            Self::IvfHnswPq => 1 << 20,
274            _ => 8192,
275        }
276    }
277}
278
279pub trait IndexParams: Send + Sync {
280    fn as_any(&self) -> &dyn Any;
281
282    fn index_name(&self) -> &str;
283}
284
285#[derive(Serialize, Deserialize, Debug)]
286pub struct IndexMetadata {
287    #[serde(rename = "type")]
288    pub index_type: String,
289    pub distance_type: String,
290}
291
292pub fn is_system_index(index_meta: &lance_table::format::Index) -> bool {
293    index_meta.name == FRAG_REUSE_INDEX_NAME || index_meta.name == MEM_WAL_INDEX_NAME
294}
295
296pub fn infer_system_index_type(index_meta: &lance_table::format::Index) -> Option<IndexType> {
297    if index_meta.name == FRAG_REUSE_INDEX_NAME {
298        Some(IndexType::FragmentReuse)
299    } else if index_meta.name == MEM_WAL_INDEX_NAME {
300        Some(IndexType::MemWal)
301    } else {
302        None
303    }
304}