lance_index/traits.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9
10use crate::{IndexParams, IndexType, optimize::OptimizeOptions};
11use lance_table::format::IndexMetadata;
12use uuid::Uuid;
13
/// Criteria used to narrow down which indices are candidates for a query.
///
/// Every field defaults to "no restriction" (`None` / `false`), so
/// `IndexCriteria::default()` places no constraint on the index. Restrictions
/// are added by chaining the builder-style methods, each of which consumes
/// the value and returns the updated one.
#[derive(Debug, Default)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}

impl<'a> IndexCriteria<'a> {
    /// Restricts the criteria to indices built on `column`.
    ///
    /// This also means the index maps to a single column.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn for_column(mut self, column: &'a str) -> Self {
        self.for_column = Some(column);
        self
    }

    /// Restricts the criteria to indices named `name`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn with_name(mut self, name: &'a str) -> Self {
        self.has_name = Some(name);
        self
    }

    /// Restricts the criteria to indices that support FTS.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_fts(mut self) -> Self {
        self.must_support_fts = true;
        self
    }

    /// Restricts the criteria to indices that support exact equality.
    ///
    /// This will disqualify, for example, the ngram and inverted indices
    /// or an index like a bloom filter.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_exact_equality(mut self) -> Self {
        self.must_support_exact_equality = true;
        self
    }
}
57
58#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
59pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
60
61/// Additional information about an index
62///
63/// Note that a single index might consist of multiple segments. Each segment has its own
64/// UUID and collection of files and covers some subset of the data fragments.
65///
66/// All segments in an index should have the same index type and index details.
67pub trait IndexDescription: Send + Sync {
68 /// Returns the index name
69 ///
70 /// This is the user-defined name of the index. It is shared by all segments of the index
71 /// and is what is used to refer to the index in the API. It is guaranteed to be unique
72 /// within the dataset.
73 fn name(&self) -> &str;
74
75 /// Returns the index metadata
76 ///
77 /// This is the raw metadata information stored in the manifest. There is one
78 /// IndexMetadata for each segment of the index.
79 fn metadata(&self) -> &[IndexMetadata];
80
81 /// Returns the index type URL
82 ///
83 /// This is extracted from the type url of the index details
84 fn type_url(&self) -> &str;
85
86 /// Returns the index type
87 ///
88 /// This is a short string identifier that is friendlier than the type URL but not
89 /// guaranteed to be unique.
90 ///
91 /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
92 /// for the type URL.
93 fn index_type(&self) -> &str;
94
95 /// Returns the number of rows indexed by the index, across all segments.
96 ///
97 /// This is an approximate count and may include rows that have been
98 /// deleted.
99 fn rows_indexed(&self) -> u64;
100
101 /// Returns the ids of the fields that the index is built on.
102 fn field_ids(&self) -> &[u32];
103
104 /// Returns a JSON string representation of the index details
105 ///
106 /// The format of these details will vary depending on the index type and
107 /// since indexes can be provided by plugins we cannot fully define it here.
108 ///
109 /// However, plugins should do their best to maintain backwards compatibility
110 /// and consider this method part of the public API.
111 ///
112 /// See individual index plugins for more description of the expected format.
113 ///
114 /// The conversion from Any to JSON is controlled by the index
115 /// plugin. As a result, this method may fail if there is no plugin
116 /// available for the index.
117 fn details(&self) -> Result<String>;
118}
119
120// Extends Lance Dataset with secondary index.
121#[async_trait]
122pub trait DatasetIndexExt {
123 type IndexBuilder<'a>
124 where
125 Self: 'a;
126
127 /// Create a builder for creating an index on columns.
128 ///
129 /// This returns a builder that can be configured with additional options
130 /// like `name()`, `replace()`, and `train()` before awaiting to execute.
131 ///
132 /// # Parameters
133 /// - `columns`: the columns to build the indices on.
134 /// - `index_type`: specify [`IndexType`].
135 /// - `params`: index parameters.
136 fn create_index_builder<'a>(
137 &'a mut self,
138 columns: &'a [&'a str],
139 index_type: IndexType,
140 params: &'a dyn IndexParams,
141 ) -> Self::IndexBuilder<'a>;
142
143 /// Create indices on columns.
144 ///
145 /// Upon finish, a new dataset version is generated.
146 ///
147 /// Parameters:
148 ///
149 /// - `columns`: the columns to build the indices on.
150 /// - `index_type`: specify [`IndexType`].
151 /// - `name`: optional index name. Must be unique in the dataset.
152 /// if not provided, it will auto-generate one.
153 /// - `params`: index parameters.
154 /// - `replace`: replace the existing index if it exists.
155 ///
156 /// Returns the metadata of the created index.
157 async fn create_index(
158 &mut self,
159 columns: &[&str],
160 index_type: IndexType,
161 name: Option<String>,
162 params: &dyn IndexParams,
163 replace: bool,
164 ) -> Result<IndexMetadata>;
165
166 /// Drop indices by name.
167 ///
168 /// Upon finish, a new dataset version is generated.
169 ///
170 /// Parameters:
171 ///
172 /// - `name`: the name of the index to drop.
173 async fn drop_index(&mut self, name: &str) -> Result<()>;
174
175 /// Prewarm an index by name.
176 ///
177 /// This will load the index into memory and cache it.
178 ///
179 /// Generally, this should only be called when it is known the entire index will
180 /// fit into the index cache.
181 ///
182 /// This is a hint that is not enforced by all indices today. Some indices may choose
183 /// to ignore this hint.
184 async fn prewarm_index(&self, name: &str) -> Result<()>;
185
186 /// Read all indices of this Dataset version.
187 ///
188 /// The indices are lazy loaded and cached in memory within the `Dataset` instance.
189 /// The cache is invalidated when the dataset version (Manifest) is changed.
190 async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
191
192 /// Loads all the indies of a given UUID.
193 ///
194 /// Note that it is possible to have multiple indices with the same UUID,
195 /// as they are the deltas of the same index.
196 async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
197 self.load_indices().await.map(|indices| {
198 indices
199 .iter()
200 .find(|idx| idx.uuid.to_string() == uuid)
201 .cloned()
202 })
203 }
204
205 /// Loads a specific index with the given index name
206 ///
207 /// Returns
208 /// -------
209 /// - `Ok(indices)`: if the index exists, returns the index.
210 /// - `Ok(vec![])`: if the index does not exist.
211 /// - `Err(e)`: if there is an error loading indices.
212 ///
213 async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
214 self.load_indices().await.map(|indices| {
215 indices
216 .iter()
217 .filter(|idx| idx.name == name)
218 .cloned()
219 .collect()
220 })
221 }
222
223 /// Loads a specific index with the given index name.
224 /// This function only works for indices that are unique.
225 /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
226 ///
227 /// Returns
228 /// -------
229 /// - `Ok(Some(index))`: if the index exists, returns the index.
230 /// - `Ok(None)`: if the index does not exist.
231 /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
232 ///
233 async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
234 let indices = self.load_indices_by_name(name).await?;
235 if indices.is_empty() {
236 Ok(None)
237 } else if indices.len() == 1 {
238 Ok(Some(indices[0].clone()))
239 } else {
240 Err(Error::index(format!(
241 "Found multiple indices of the same name: {:?}, please use load_indices_by_name",
242 indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
243 )))
244 }
245 }
246
247 /// Describes indexes in a dataset
248 ///
249 /// This method should only access the index metadata and should not load the index into memory.
250 ///
251 /// More detailed information may be available from `index_statistics` but that will require
252 /// loading the index into memory.
253 async fn describe_indices<'a, 'b>(
254 &'a self,
255 criteria: Option<IndexCriteria<'b>>,
256 ) -> Result<Vec<Arc<dyn IndexDescription>>>;
257
258 /// Loads a specific index with the given index name.
259 async fn load_scalar_index<'a, 'b>(
260 &'a self,
261 criteria: IndexCriteria<'b>,
262 ) -> Result<Option<IndexMetadata>>;
263
264 /// Optimize indices.
265 async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
266
267 /// Find index with a given index_name and return its serialized statistics.
268 ///
269 /// If the index does not exist, return Error.
270 async fn index_statistics(&self, index_name: &str) -> Result<String>;
271
272 async fn commit_existing_index(
273 &mut self,
274 index_name: &str,
275 column: &str,
276 index_id: Uuid,
277 ) -> Result<()>;
278
279 async fn read_index_partition(
280 &self,
281 index_name: &str,
282 partition_id: usize,
283 with_vector: bool,
284 ) -> Result<SendableRecordBatchStream>;
285}