lance_index/traits.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9use snafu::location;
10
11use crate::{optimize::OptimizeOptions, IndexParams, IndexType};
12use lance_table::format::IndexMetadata;
13use uuid::Uuid;
14
/// A set of criteria used to filter potential indices to use for a query
///
/// The default value applies no restriction at all: both optional filters are
/// `None` and both capability flags are `false`.
///
/// The type only holds borrowed strings and flags, so it is `Copy`; a value can
/// be reused after being passed to a by-value API, and two values can be
/// compared for equality.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}
28
29impl<'a> IndexCriteria<'a> {
30 /// Only consider indices for this column (this also means the index
31 /// maps to a single column)
32 pub fn for_column(mut self, column: &'a str) -> Self {
33 self.for_column = Some(column);
34 self
35 }
36
37 /// Only consider indices with this name
38 pub fn with_name(mut self, name: &'a str) -> Self {
39 self.has_name = Some(name);
40 self
41 }
42
43 /// Only consider indices that support FTS
44 pub fn supports_fts(mut self) -> Self {
45 self.must_support_fts = true;
46 self
47 }
48
49 /// Only consider indices that support exact equality
50 ///
51 /// This will disqualify, for example, the ngram and inverted indices
52 /// or an index like a bloom filter
53 pub fn supports_exact_equality(mut self) -> Self {
54 self.must_support_exact_equality = true;
55 self
56 }
57}
58
59#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
60pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
61
62/// Additional information about an index
63///
64/// Note that a single index might consist of multiple segments. Each segment has its own
65/// UUID and collection of files and covers some subset of the data fragments.
66///
67/// All segments in an index should have the same index type and index details.
68pub trait IndexDescription: Send + Sync {
69 /// Returns the index name
70 ///
71 /// This is the user-defined name of the index. It is shared by all segments of the index
72 /// and is what is used to refer to the index in the API. It is guaranteed to be unique
73 /// within the dataset.
74 fn name(&self) -> &str;
75
76 /// Returns the index metadata
77 ///
78 /// This is the raw metadata information stored in the manifest. There is one
79 /// IndexMetadata for each segment of the index.
80 fn metadata(&self) -> &[IndexMetadata];
81
82 /// Returns the index type URL
83 ///
84 /// This is extracted from the type url of the index details
85 fn type_url(&self) -> &str;
86
87 /// Returns the index type
88 ///
89 /// This is a short string identifier that is friendlier than the type URL but not
90 /// guaranteed to be unique.
91 ///
92 /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
93 /// for the type URL.
94 fn index_type(&self) -> &str;
95
96 /// Returns the number of rows indexed by the index, across all segments.
97 ///
98 /// This is an approximate count and may include rows that have been
99 /// deleted.
100 fn rows_indexed(&self) -> u64;
101
102 /// Returns the ids of the fields that the index is built on.
103 fn field_ids(&self) -> &[u32];
104
105 /// Returns a JSON string representation of the index details
106 ///
107 /// The format of these details will vary depending on the index type and
108 /// since indexes can be provided by plugins we cannot fully define it here.
109 ///
110 /// However, plugins should do their best to maintain backwards compatibility
111 /// and consider this method part of the public API.
112 ///
113 /// See individual index plugins for more description of the expected format.
114 ///
115 /// The conversion from Any to JSON is controlled by the index
116 /// plugin. As a result, this method may fail if there is no plugin
117 /// available for the index.
118 fn details(&self) -> Result<String>;
119}
120
121// Extends Lance Dataset with secondary index.
122#[async_trait]
123pub trait DatasetIndexExt {
124 type IndexBuilder<'a>
125 where
126 Self: 'a;
127
128 /// Create a builder for creating an index on columns.
129 ///
130 /// This returns a builder that can be configured with additional options
131 /// like `name()`, `replace()`, and `train()` before awaiting to execute.
132 ///
133 /// # Parameters
134 /// - `columns`: the columns to build the indices on.
135 /// - `index_type`: specify [`IndexType`].
136 /// - `params`: index parameters.
137 fn create_index_builder<'a>(
138 &'a mut self,
139 columns: &'a [&'a str],
140 index_type: IndexType,
141 params: &'a dyn IndexParams,
142 ) -> Self::IndexBuilder<'a>;
143
144 /// Create indices on columns.
145 ///
146 /// Upon finish, a new dataset version is generated.
147 ///
148 /// Parameters:
149 ///
150 /// - `columns`: the columns to build the indices on.
151 /// - `index_type`: specify [`IndexType`].
152 /// - `name`: optional index name. Must be unique in the dataset.
153 /// if not provided, it will auto-generate one.
154 /// - `params`: index parameters.
155 /// - `replace`: replace the existing index if it exists.
156 async fn create_index(
157 &mut self,
158 columns: &[&str],
159 index_type: IndexType,
160 name: Option<String>,
161 params: &dyn IndexParams,
162 replace: bool,
163 ) -> Result<()>;
164
165 /// Drop indices by name.
166 ///
167 /// Upon finish, a new dataset version is generated.
168 ///
169 /// Parameters:
170 ///
171 /// - `name`: the name of the index to drop.
172 async fn drop_index(&mut self, name: &str) -> Result<()>;
173
174 /// Prewarm an index by name.
175 ///
176 /// This will load the index into memory and cache it.
177 ///
178 /// Generally, this should only be called when it is known the entire index will
179 /// fit into the index cache.
180 ///
181 /// This is a hint that is not enforced by all indices today. Some indices may choose
182 /// to ignore this hint.
183 async fn prewarm_index(&self, name: &str) -> Result<()>;
184
185 /// Read all indices of this Dataset version.
186 ///
187 /// The indices are lazy loaded and cached in memory within the [`Dataset`] instance.
188 /// The cache is invalidated when the dataset version (Manifest) is changed.
189 async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
190
191 /// Loads all the indies of a given UUID.
192 ///
193 /// Note that it is possible to have multiple indices with the same UUID,
194 /// as they are the deltas of the same index.
195 async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
196 self.load_indices().await.map(|indices| {
197 indices
198 .iter()
199 .find(|idx| idx.uuid.to_string() == uuid)
200 .cloned()
201 })
202 }
203
204 /// Loads a specific index with the given index name
205 ///
206 /// Returns
207 /// -------
208 /// - `Ok(indices)`: if the index exists, returns the index.
209 /// - `Ok(vec![])`: if the index does not exist.
210 /// - `Err(e)`: if there is an error loading indices.
211 ///
212 async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
213 self.load_indices().await.map(|indices| {
214 indices
215 .iter()
216 .filter(|idx| idx.name == name)
217 .cloned()
218 .collect()
219 })
220 }
221
222 /// Loads a specific index with the given index name.
223 /// This function only works for indices that are unique.
224 /// If there are multiple indices sharing the same name, please use [load_indices_by_name]
225 ///
226 /// Returns
227 /// -------
228 /// - `Ok(Some(index))`: if the index exists, returns the index.
229 /// - `Ok(None)`: if the index does not exist.
230 /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
231 ///
232 async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
233 let indices = self.load_indices_by_name(name).await?;
234 if indices.is_empty() {
235 Ok(None)
236 } else if indices.len() == 1 {
237 Ok(Some(indices[0].clone()))
238 } else {
239 Err(Error::Index {
240 message: format!("Found multiple indices of the same name: {:?}, please use load_indices_by_name",
241 indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()),
242 location: location!(),
243 })
244 }
245 }
246
247 /// Describes indexes in a dataset
248 ///
249 /// This method should only access the index metadata and should not load the index into memory.
250 ///
251 /// More detailed information may be available from [`index_statistics`] but that will require
252 /// loading the index into memory.
253 async fn describe_indices<'a, 'b>(
254 &'a self,
255 criteria: Option<IndexCriteria<'b>>,
256 ) -> Result<Vec<Arc<dyn IndexDescription>>>;
257
258 /// Loads a specific index with the given index name.
259 async fn load_scalar_index<'a, 'b>(
260 &'a self,
261 criteria: IndexCriteria<'b>,
262 ) -> Result<Option<IndexMetadata>>;
263
264 /// Optimize indices.
265 async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
266
267 /// Find index with a given index_name and return its serialized statistics.
268 ///
269 /// If the index does not exist, return Error.
270 async fn index_statistics(&self, index_name: &str) -> Result<String>;
271
272 async fn commit_existing_index(
273 &mut self,
274 index_name: &str,
275 column: &str,
276 index_id: Uuid,
277 ) -> Result<()>;
278
279 async fn read_index_partition(
280 &self,
281 index_name: &str,
282 partition_id: usize,
283 with_vector: bool,
284 ) -> Result<SendableRecordBatchStream>;
285}