lance_index/traits.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9
10use crate::{IndexParams, IndexType, optimize::OptimizeOptions, types::IndexSegment};
11use lance_table::format::IndexMetadata;
12
/// A set of criteria used to filter potential indices to use for a query.
///
/// Each field narrows the set of candidate indices. The `Default` value sets
/// no column, no name and both `must_support_*` flags to `false`. Use the
/// builder-style methods on this type to set individual criteria.
///
/// The type is a small bundle of borrowed strings and flags, so it derives
/// `Clone`, `PartialEq` and `Eq` in addition to `Debug` and `Default`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}
26
27impl<'a> IndexCriteria<'a> {
28 /// Only consider indices for this column (this also means the index
29 /// maps to a single column)
30 pub fn for_column(mut self, column: &'a str) -> Self {
31 self.for_column = Some(column);
32 self
33 }
34
35 /// Only consider indices with this name
36 pub fn with_name(mut self, name: &'a str) -> Self {
37 self.has_name = Some(name);
38 self
39 }
40
41 /// Only consider indices that support FTS
42 pub fn supports_fts(mut self) -> Self {
43 self.must_support_fts = true;
44 self
45 }
46
47 /// Only consider indices that support exact equality
48 ///
49 /// This will disqualify, for example, the ngram and inverted indices
50 /// or an index like a bloom filter
51 pub fn supports_exact_equality(mut self) -> Self {
52 self.must_support_exact_equality = true;
53 self
54 }
55}
56
/// Deprecated alias for [`IndexCriteria`]; use [`IndexCriteria`] directly in new code.
#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
59
/// Additional information about an index
///
/// Note that a single index might consist of multiple segments. Each segment has its own
/// UUID and collection of files and covers some subset of the data fragments.
///
/// All segments in an index should have the same index type and index details.
///
/// Implementors must be `Send + Sync`.
pub trait IndexDescription: Send + Sync {
    /// Returns the index name
    ///
    /// This is the user-defined name of the index. It is shared by all segments of the index
    /// and is what is used to refer to the index in the API. It is guaranteed to be unique
    /// within the dataset.
    fn name(&self) -> &str;

    /// Returns the index metadata
    ///
    /// This is the raw metadata information stored in the manifest. There is one
    /// [`IndexMetadata`] for each segment of the index.
    fn metadata(&self) -> &[IndexMetadata];

    /// Returns the index type URL
    ///
    /// This is extracted from the type url of the index details.
    fn type_url(&self) -> &str;

    /// Returns the index type
    ///
    /// This is a short string identifier that is friendlier than the type URL but not
    /// guaranteed to be unique.
    ///
    /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
    /// for the type URL.
    fn index_type(&self) -> &str;

    /// Returns the number of rows indexed by the index, across all segments.
    ///
    /// This is an approximate count and may include rows that have been
    /// deleted.
    fn rows_indexed(&self) -> u64;

    /// Returns the ids of the fields that the index is built on.
    fn field_ids(&self) -> &[u32];

    /// Returns a JSON string representation of the index details
    ///
    /// The format of these details will vary depending on the index type and
    /// since indexes can be provided by plugins we cannot fully define it here.
    ///
    /// However, plugins should do their best to maintain backwards compatibility
    /// and consider this method part of the public API.
    ///
    /// See individual index plugins for more description of the expected format.
    ///
    /// # Errors
    ///
    /// The conversion from Any to JSON is controlled by the index
    /// plugin. As a result, this method may fail if there is no plugin
    /// available for the index.
    fn details(&self) -> Result<String>;

    /// Returns the total size in bytes of all files across all segments.
    ///
    /// Returns `None` if file size information is not available for any segment
    /// (for backward compatibility with indices created before file tracking was added).
    fn total_size_bytes(&self) -> Option<u64>;
}
124
125// Extends Lance Dataset with secondary index.
126#[async_trait]
127pub trait DatasetIndexExt {
128 type IndexBuilder<'a>
129 where
130 Self: 'a;
131 type IndexSegmentBuilder<'a>
132 where
133 Self: 'a;
134
135 /// Create a builder for creating an index on columns.
136 ///
137 /// This returns a builder that can be configured with additional options
138 /// like `name()`, `replace()`, and `train()` before awaiting to execute.
139 ///
140 /// # Parameters
141 /// - `columns`: the columns to build the indices on.
142 /// - `index_type`: specify [`IndexType`].
143 /// - `params`: index parameters.
144 fn create_index_builder<'a>(
145 &'a mut self,
146 columns: &'a [&'a str],
147 index_type: IndexType,
148 params: &'a dyn IndexParams,
149 ) -> Self::IndexBuilder<'a>;
150
151 /// Create a builder for building physical index segments from uncommitted
152 /// vector index outputs.
153 ///
154 /// The caller supplies the uncommitted index metadata returned by
155 /// `execute_uncommitted()` so the builder can plan segment grouping without
156 /// rediscovering fragment coverage.
157 ///
158 /// This is the canonical entry point for distributed vector segment build.
159 /// After building the physical segments, publish them as a
160 /// logical index with [`Self::commit_existing_index_segments`].
161 fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>;
162
163 /// Create indices on columns.
164 ///
165 /// Upon finish, a new dataset version is generated.
166 ///
167 /// Parameters:
168 ///
169 /// - `columns`: the columns to build the indices on.
170 /// - `index_type`: specify [`IndexType`].
171 /// - `name`: optional index name. Must be unique in the dataset.
172 /// if not provided, it will auto-generate one.
173 /// - `params`: index parameters.
174 /// - `replace`: replace the existing index if it exists.
175 ///
176 /// Returns the metadata of the created index.
177 async fn create_index(
178 &mut self,
179 columns: &[&str],
180 index_type: IndexType,
181 name: Option<String>,
182 params: &dyn IndexParams,
183 replace: bool,
184 ) -> Result<IndexMetadata>;
185
186 /// Drop indices by name.
187 ///
188 /// Upon finish, a new dataset version is generated.
189 ///
190 /// Parameters:
191 ///
192 /// - `name`: the name of the index to drop.
193 async fn drop_index(&mut self, name: &str) -> Result<()>;
194
195 /// Prewarm an index by name.
196 ///
197 /// This will load the index into memory and cache it.
198 ///
199 /// Generally, this should only be called when it is known the entire index will
200 /// fit into the index cache.
201 ///
202 /// This is a hint that is not enforced by all indices today. Some indices may choose
203 /// to ignore this hint.
204 async fn prewarm_index(&self, name: &str) -> Result<()>;
205
206 /// Read all indices of this Dataset version.
207 ///
208 /// The indices are lazy loaded and cached in memory within the `Dataset` instance.
209 /// The cache is invalidated when the dataset version (Manifest) is changed.
210 async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
211
212 /// Loads all the indies of a given UUID.
213 ///
214 /// Note that it is possible to have multiple indices with the same UUID,
215 /// as they are the deltas of the same index.
216 async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
217 self.load_indices().await.map(|indices| {
218 indices
219 .iter()
220 .find(|idx| idx.uuid.to_string() == uuid)
221 .cloned()
222 })
223 }
224
225 /// Loads a specific index with the given index name
226 ///
227 /// Returns
228 /// -------
229 /// - `Ok(indices)`: if the index exists, returns the index.
230 /// - `Ok(vec![])`: if the index does not exist.
231 /// - `Err(e)`: if there is an error loading indices.
232 ///
233 async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
234 self.load_indices().await.map(|indices| {
235 indices
236 .iter()
237 .filter(|idx| idx.name == name)
238 .cloned()
239 .collect()
240 })
241 }
242
243 /// Loads a specific index with the given index name.
244 /// This function only works for indices that are unique.
245 /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
246 ///
247 /// Returns
248 /// -------
249 /// - `Ok(Some(index))`: if the index exists, returns the index.
250 /// - `Ok(None)`: if the index does not exist.
251 /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
252 ///
253 async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
254 let indices = self.load_indices_by_name(name).await?;
255 if indices.is_empty() {
256 Ok(None)
257 } else if indices.len() == 1 {
258 Ok(Some(indices[0].clone()))
259 } else {
260 Err(Error::index(format!(
261 "Found multiple indices of the same name: {:?}, please use load_indices_by_name",
262 indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
263 )))
264 }
265 }
266
267 /// Describes indexes in a dataset
268 ///
269 /// This method should only access the index metadata and should not load the index into memory.
270 ///
271 /// More detailed information may be available from `index_statistics` but that will require
272 /// loading the index into memory.
273 async fn describe_indices<'a, 'b>(
274 &'a self,
275 criteria: Option<IndexCriteria<'b>>,
276 ) -> Result<Vec<Arc<dyn IndexDescription>>>;
277
278 /// Loads a specific index with the given index name.
279 async fn load_scalar_index<'a, 'b>(
280 &'a self,
281 criteria: IndexCriteria<'b>,
282 ) -> Result<Option<IndexMetadata>>;
283
284 /// Optimize indices.
285 async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
286
287 /// Find index with a given index_name and return its serialized statistics.
288 ///
289 /// If the index does not exist, return Error.
290 async fn index_statistics(&self, index_name: &str) -> Result<String>;
291
292 /// Commit one or more existing physical index segments as a logical index.
293 ///
294 /// This publishes already-built physical segments. It does not build
295 /// or merge index data; callers should first build segments with
296 /// [`Self::create_index_segment_builder`] or another index-specific build
297 /// path and then pass the resulting segments here.
298 async fn commit_existing_index_segments(
299 &mut self,
300 index_name: &str,
301 column: &str,
302 segments: Vec<IndexSegment>,
303 ) -> Result<()>;
304
305 async fn read_index_partition(
306 &self,
307 index_name: &str,
308 partition_id: usize,
309 with_vector: bool,
310 ) -> Result<SendableRecordBatchStream>;
311}