lance_index/
traits.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9use snafu::location;
10
11use crate::{optimize::OptimizeOptions, scalar::ScalarIndexType, IndexParams, IndexType};
12use lance_table::format::Index;
13use uuid::Uuid;
14
15/// A set of criteria used to filter potential indices to use for a query
16#[derive(Debug, Default)]
17pub struct ScalarIndexCriteria<'a> {
18    /// Only consider indices for this column (this also means the index
19    /// maps to a single column)
20    pub for_column: Option<&'a str>,
21    /// Only consider indices with this name
22    pub has_name: Option<&'a str>,
23    /// Only consider indices with this type
24    pub has_type: Option<ScalarIndexType>,
25    /// Only consider indices that support exact equality
26    pub supports_exact_equality: bool,
27}
28
29impl<'a> ScalarIndexCriteria<'a> {
30    /// Only consider indices for this column (this also means the index
31    /// maps to a single column)
32    pub fn for_column(mut self, column: &'a str) -> Self {
33        self.for_column = Some(column);
34        self
35    }
36
37    /// Only consider indices with this name
38    pub fn with_name(mut self, name: &'a str) -> Self {
39        self.has_name = Some(name);
40        self
41    }
42
43    /// Only consider indices with this type
44    pub fn with_type(mut self, ty: ScalarIndexType) -> Self {
45        self.has_type = Some(ty);
46        self
47    }
48
49    /// Only consider indices that support exact equality
50    ///
51    /// This will disqualify, for example, the ngram and inverted indices
52    /// or an index like a bloom filter
53    pub fn supports_exact_equality(mut self) -> Self {
54        self.supports_exact_equality = true;
55        self
56    }
57}
58
59// Extends Lance Dataset with secondary index.
60#[async_trait]
61pub trait DatasetIndexExt {
62    /// Create indices on columns.
63    ///
64    /// Upon finish, a new dataset version is generated.
65    ///
66    /// Parameters:
67    ///
68    ///  - `columns`: the columns to build the indices on.
69    ///  - `index_type`: specify [`IndexType`].
70    ///  - `name`: optional index name. Must be unique in the dataset.
71    ///            if not provided, it will auto-generate one.
72    ///  - `params`: index parameters.
73    ///  - `replace`: replace the existing index if it exists.
74    async fn create_index(
75        &mut self,
76        columns: &[&str],
77        index_type: IndexType,
78        name: Option<String>,
79        params: &dyn IndexParams,
80        replace: bool,
81    ) -> Result<()>;
82
83    /// Drop indices by name.
84    ///
85    /// Upon finish, a new dataset version is generated.
86    ///
87    /// Parameters:
88    ///
89    /// - `name`: the name of the index to drop.
90    async fn drop_index(&mut self, name: &str) -> Result<()>;
91
92    /// Prewarm an index by name.
93    ///
94    /// This will load the index into memory and cache it.
95    ///
96    /// Generally, this should only be called when it is known the entire index will
97    /// fit into the index cache.
98    ///
99    /// This is a hint that is not enforced by all indices today.  Some indices may choose
100    /// to ignore this hint.
101    async fn prewarm_index(&self, name: &str) -> Result<()>;
102
103    /// Read all indices of this Dataset version.
104    ///
105    /// The indices are lazy loaded and cached in memory within the [`Dataset`] instance.
106    /// The cache is invalidated when the dataset version (Manifest) is changed.
107    async fn load_indices(&self) -> Result<Arc<Vec<Index>>>;
108
109    /// Loads all the indies of a given UUID.
110    ///
111    /// Note that it is possible to have multiple indices with the same UUID,
112    /// as they are the deltas of the same index.
113    async fn load_index(&self, uuid: &str) -> Result<Option<Index>> {
114        self.load_indices().await.map(|indices| {
115            indices
116                .iter()
117                .find(|idx| idx.uuid.to_string() == uuid)
118                .cloned()
119        })
120    }
121
122    /// Loads a specific index with the given index name
123    ///
124    /// Returns
125    /// -------
126    /// - `Ok(indices)`: if the index exists, returns the index.
127    /// - `Ok(vec![])`: if the index does not exist.
128    /// - `Err(e)`: if there is an error loading indices.
129    ///
130    async fn load_indices_by_name(&self, name: &str) -> Result<Vec<Index>> {
131        self.load_indices().await.map(|indices| {
132            indices
133                .iter()
134                .filter(|idx| idx.name == name)
135                .cloned()
136                .collect()
137        })
138    }
139
140    /// Loads a specific index with the given index name.
141    /// This function only works for indices that are unique.
142    /// If there are multiple indices sharing the same name, please use [load_indices_by_name]
143    ///
144    /// Returns
145    /// -------
146    /// - `Ok(Some(index))`: if the index exists, returns the index.
147    /// - `Ok(None)`: if the index does not exist.
148    /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
149    ///
150    async fn load_index_by_name(&self, name: &str) -> Result<Option<Index>> {
151        let indices = self.load_indices_by_name(name).await?;
152        if indices.is_empty() {
153            Ok(None)
154        } else if indices.len() == 1 {
155            Ok(Some(indices[0].clone()))
156        } else {
157            Err(Error::Index {
158                message: format!("Found multiple indices of the same name: {:?}, please use load_indices_by_name", 
159                    indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()),
160                location: location!(),
161            })
162        }
163    }
164
165    /// Loads a specific index with the given index name.
166    async fn load_scalar_index<'a, 'b>(
167        &'a self,
168        criteria: ScalarIndexCriteria<'b>,
169    ) -> Result<Option<Index>>;
170
171    /// Optimize indices.
172    async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
173
174    /// Find index with a given index_name and return its serialized statistics.
175    ///
176    /// If the index does not exist, return Error.
177    async fn index_statistics(&self, index_name: &str) -> Result<String>;
178
179    async fn commit_existing_index(
180        &mut self,
181        index_name: &str,
182        column: &str,
183        index_id: Uuid,
184    ) -> Result<()>;
185
186    async fn read_index_partition(
187        &self,
188        index_name: &str,
189        partition_id: usize,
190        with_vector: bool,
191    ) -> Result<SendableRecordBatchStream>;
192}