Skip to main content

lance_index/
traits.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9
10use crate::{IndexParams, IndexType, optimize::OptimizeOptions, types::IndexSegment};
11use lance_table::format::IndexMetadata;
12
/// A set of criteria used to filter potential indices to use for a query
///
/// Every criterion is optional.  The [`Default`] value leaves `for_column` and
/// `has_name` unset and both capability flags `false`.  Criteria can be set
/// directly through the public fields or chained through the builder methods
/// below, each of which consumes `self` and returns the updated value.
#[derive(Debug, Default, Clone)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}

impl<'a> IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    ///
    /// Sets [`Self::for_column`] to `Some(column)`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn for_column(mut self, column: &'a str) -> Self {
        self.for_column = Some(column);
        self
    }

    /// Only consider indices with this name
    ///
    /// Sets [`Self::has_name`] to `Some(name)`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn with_name(mut self, name: &'a str) -> Self {
        self.has_name = Some(name);
        self
    }

    /// Only consider indices that support FTS
    ///
    /// Sets [`Self::must_support_fts`] to `true`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_fts(mut self) -> Self {
        self.must_support_fts = true;
        self
    }

    /// Only consider indices that support exact equality
    ///
    /// This will disqualify, for example, the ngram and inverted indices
    /// or an index like a bloom filter
    ///
    /// Sets [`Self::must_support_exact_equality`] to `true`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_exact_equality(mut self) -> Self {
        self.must_support_exact_equality = true;
        self
    }
}
56
57#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
58pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
59
60/// Additional information about an index
61///
62/// Note that a single index might consist of multiple segments.  Each segment has its own
63/// UUID and collection of files and covers some subset of the data fragments.
64///
65/// All segments in an index should have the same index type and index details.
66pub trait IndexDescription: Send + Sync {
67    /// Returns the index name
68    ///
69    /// This is the user-defined name of the index.  It is shared by all segments of the index
70    /// and is what is used to refer to the index in the API.  It is guaranteed to be unique
71    /// within the dataset.
72    fn name(&self) -> &str;
73
74    /// Returns the index metadata
75    ///
76    /// This is the raw metadata information stored in the manifest.  There is one
77    /// IndexMetadata for each segment of the index.
78    fn metadata(&self) -> &[IndexMetadata];
79
80    /// Returns the index type URL
81    ///
82    /// This is extracted from the type url of the index details
83    fn type_url(&self) -> &str;
84
85    /// Returns the index type
86    ///
87    /// This is a short string identifier that is friendlier than the type URL but not
88    /// guaranteed to be unique.
89    ///
90    /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
91    /// for the type URL.
92    fn index_type(&self) -> &str;
93
94    /// Returns the number of rows indexed by the index, across all segments.
95    ///
96    /// This is an approximate count and may include rows that have been
97    /// deleted.
98    fn rows_indexed(&self) -> u64;
99
100    /// Returns the ids of the fields that the index is built on.
101    fn field_ids(&self) -> &[u32];
102
103    /// Returns a JSON string representation of the index details
104    ///
105    /// The format of these details will vary depending on the index type and
106    /// since indexes can be provided by plugins we cannot fully define it here.
107    ///
108    /// However, plugins should do their best to maintain backwards compatibility
109    /// and consider this method part of the public API.
110    ///
111    /// See individual index plugins for more description of the expected format.
112    ///
113    /// The conversion from Any to JSON is controlled by the index
114    /// plugin.  As a result, this method may fail if there is no plugin
115    /// available for the index.
116    fn details(&self) -> Result<String>;
117
118    /// Returns the total size in bytes of all files across all segments.
119    ///
120    /// Returns `None` if file size information is not available for any segment
121    /// (for backward compatibility with indices created before file tracking was added).
122    fn total_size_bytes(&self) -> Option<u64>;
123}
124
125// Extends Lance Dataset with secondary index.
126#[async_trait]
127pub trait DatasetIndexExt {
128    type IndexBuilder<'a>
129    where
130        Self: 'a;
131    type IndexSegmentBuilder<'a>
132    where
133        Self: 'a;
134
135    /// Create a builder for creating an index on columns.
136    ///
137    /// This returns a builder that can be configured with additional options
138    /// like `name()`, `replace()`, and `train()` before awaiting to execute.
139    ///
140    /// # Parameters
141    /// - `columns`: the columns to build the indices on.
142    /// - `index_type`: specify [`IndexType`].
143    /// - `params`: index parameters.
144    fn create_index_builder<'a>(
145        &'a mut self,
146        columns: &'a [&'a str],
147        index_type: IndexType,
148        params: &'a dyn IndexParams,
149    ) -> Self::IndexBuilder<'a>;
150
151    /// Create a builder for building physical index segments from uncommitted
152    /// vector index outputs.
153    ///
154    /// The caller supplies the uncommitted index metadata returned by
155    /// `execute_uncommitted()` so the builder can plan segment grouping without
156    /// rediscovering fragment coverage.
157    ///
158    /// This is the canonical entry point for distributed vector segment build.
159    /// After building the physical segments, publish them as a
160    /// logical index with [`Self::commit_existing_index_segments`].
161    fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>;
162
163    /// Create indices on columns.
164    ///
165    /// Upon finish, a new dataset version is generated.
166    ///
167    /// Parameters:
168    ///
169    ///  - `columns`: the columns to build the indices on.
170    ///  - `index_type`: specify [`IndexType`].
171    ///  - `name`: optional index name. Must be unique in the dataset.
172    ///            if not provided, it will auto-generate one.
173    ///  - `params`: index parameters.
174    ///  - `replace`: replace the existing index if it exists.
175    ///
176    /// Returns the metadata of the created index.
177    async fn create_index(
178        &mut self,
179        columns: &[&str],
180        index_type: IndexType,
181        name: Option<String>,
182        params: &dyn IndexParams,
183        replace: bool,
184    ) -> Result<IndexMetadata>;
185
186    /// Drop indices by name.
187    ///
188    /// Upon finish, a new dataset version is generated.
189    ///
190    /// Parameters:
191    ///
192    /// - `name`: the name of the index to drop.
193    async fn drop_index(&mut self, name: &str) -> Result<()>;
194
195    /// Prewarm an index by name.
196    ///
197    /// This will load the index into memory and cache it.
198    ///
199    /// Generally, this should only be called when it is known the entire index will
200    /// fit into the index cache.
201    ///
202    /// This is a hint that is not enforced by all indices today.  Some indices may choose
203    /// to ignore this hint.
204    async fn prewarm_index(&self, name: &str) -> Result<()>;
205
206    /// Read all indices of this Dataset version.
207    ///
208    /// The indices are lazy loaded and cached in memory within the `Dataset` instance.
209    /// The cache is invalidated when the dataset version (Manifest) is changed.
210    async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
211
212    /// Loads all the indies of a given UUID.
213    ///
214    /// Note that it is possible to have multiple indices with the same UUID,
215    /// as they are the deltas of the same index.
216    async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
217        self.load_indices().await.map(|indices| {
218            indices
219                .iter()
220                .find(|idx| idx.uuid.to_string() == uuid)
221                .cloned()
222        })
223    }
224
225    /// Loads a specific index with the given index name
226    ///
227    /// Returns
228    /// -------
229    /// - `Ok(indices)`: if the index exists, returns the index.
230    /// - `Ok(vec![])`: if the index does not exist.
231    /// - `Err(e)`: if there is an error loading indices.
232    ///
233    async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
234        self.load_indices().await.map(|indices| {
235            indices
236                .iter()
237                .filter(|idx| idx.name == name)
238                .cloned()
239                .collect()
240        })
241    }
242
243    /// Loads a specific index with the given index name.
244    /// This function only works for indices that are unique.
245    /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
246    ///
247    /// Returns
248    /// -------
249    /// - `Ok(Some(index))`: if the index exists, returns the index.
250    /// - `Ok(None)`: if the index does not exist.
251    /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
252    ///
253    async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
254        let indices = self.load_indices_by_name(name).await?;
255        if indices.is_empty() {
256            Ok(None)
257        } else if indices.len() == 1 {
258            Ok(Some(indices[0].clone()))
259        } else {
260            Err(Error::index(format!(
261                "Found multiple indices of the same name: {:?}, please use load_indices_by_name",
262                indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
263            )))
264        }
265    }
266
267    /// Describes indexes in a dataset
268    ///
269    /// This method should only access the index metadata and should not load the index into memory.
270    ///
271    /// More detailed information may be available from `index_statistics` but that will require
272    /// loading the index into memory.
273    async fn describe_indices<'a, 'b>(
274        &'a self,
275        criteria: Option<IndexCriteria<'b>>,
276    ) -> Result<Vec<Arc<dyn IndexDescription>>>;
277
278    /// Loads a specific index with the given index name.
279    async fn load_scalar_index<'a, 'b>(
280        &'a self,
281        criteria: IndexCriteria<'b>,
282    ) -> Result<Option<IndexMetadata>>;
283
284    /// Optimize indices.
285    async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
286
287    /// Find index with a given index_name and return its serialized statistics.
288    ///
289    /// If the index does not exist, return Error.
290    async fn index_statistics(&self, index_name: &str) -> Result<String>;
291
292    /// Commit one or more existing physical index segments as a logical index.
293    ///
294    /// This publishes already-built physical segments. It does not build
295    /// or merge index data; callers should first build segments with
296    /// [`Self::create_index_segment_builder`] or another index-specific build
297    /// path and then pass the resulting segments here.
298    async fn commit_existing_index_segments(
299        &mut self,
300        index_name: &str,
301        column: &str,
302        segments: Vec<IndexSegment>,
303    ) -> Result<()>;
304
305    async fn read_index_partition(
306        &self,
307        index_name: &str,
308        partition_id: usize,
309        with_vector: bool,
310    ) -> Result<SendableRecordBatchStream>;
311}