lance_index/
traits.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9use snafu::location;
10
11use crate::{optimize::OptimizeOptions, IndexParams, IndexType};
12use lance_table::format::IndexMetadata;
13use uuid::Uuid;
14
/// A set of criteria used to filter potential indices to use for a query
///
/// The [`Default`] value sets no column, no name and leaves both flags `false`.
/// Use the builder-style methods to narrow the selection; each one consumes
/// the criteria and returns the updated value so calls can be chained.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}

impl<'a> IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    ///
    /// Sets [`IndexCriteria::for_column`] to `Some(column)`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn for_column(mut self, column: &'a str) -> Self {
        self.for_column = Some(column);
        self
    }

    /// Only consider indices with this name
    ///
    /// Sets [`IndexCriteria::has_name`] to `Some(name)`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn with_name(mut self, name: &'a str) -> Self {
        self.has_name = Some(name);
        self
    }

    /// Only consider indices that support FTS
    ///
    /// Sets [`IndexCriteria::must_support_fts`] to `true`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_fts(mut self) -> Self {
        self.must_support_fts = true;
        self
    }

    /// Only consider indices that support exact equality
    ///
    /// This will disqualify, for example, the ngram and inverted indices
    /// or an index like a bloom filter
    ///
    /// Sets [`IndexCriteria::must_support_exact_equality`] to `true`.
    #[must_use = "builder methods return the updated criteria instead of mutating in place"]
    pub fn supports_exact_equality(mut self) -> Self {
        self.must_support_exact_equality = true;
        self
    }
}
58
/// Deprecated alias for [`IndexCriteria`].
///
/// Use [`IndexCriteria`] directly instead.
#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
61
/// Additional information about an index
///
/// Note that a single index might consist of multiple segments.  Each segment has its own
/// UUID and collection of files and covers some subset of the data fragments.
///
/// All segments in an index should have the same index type and index details.
pub trait IndexDescription: Send + Sync {
    /// Returns the index name
    ///
    /// This is the user-defined name of the index.  It is shared by all segments of the index
    /// and is what is used to refer to the index in the API.  It is guaranteed to be unique
    /// within the dataset.
    fn name(&self) -> &str;

    /// Returns the index metadata
    ///
    /// This is the raw metadata information stored in the manifest.  There is one
    /// [`IndexMetadata`] for each segment of the index.
    fn metadata(&self) -> &[IndexMetadata];

    /// Returns the index type URL
    ///
    /// This is extracted from the type url of the index details.
    fn type_url(&self) -> &str;

    /// Returns the index type
    ///
    /// This is a short string identifier that is friendlier than the type URL but not
    /// guaranteed to be unique.
    ///
    /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
    /// for the type URL.
    fn index_type(&self) -> &str;

    /// Returns the number of rows indexed by the index, across all segments.
    ///
    /// This is an approximate count and may include rows that have been
    /// deleted.
    fn rows_indexed(&self) -> u64;

    /// Returns the ids of the fields that the index is built on.
    fn field_ids(&self) -> &[u32];

    /// Returns a JSON string representation of the index details
    ///
    /// The format of these details will vary depending on the index type and
    /// since indexes can be provided by plugins we cannot fully define it here.
    ///
    /// However, plugins should do their best to maintain backwards compatibility
    /// and consider this method part of the public API.
    ///
    /// See individual index plugins for more description of the expected format.
    ///
    /// # Errors
    ///
    /// The conversion from Any to JSON is controlled by the index
    /// plugin.  As a result, this method may fail if there is no plugin
    /// available for the index.
    fn details(&self) -> Result<String>;
}
120
121// Extends Lance Dataset with secondary index.
122#[async_trait]
123pub trait DatasetIndexExt {
124    type IndexBuilder<'a>
125    where
126        Self: 'a;
127
128    /// Create a builder for creating an index on columns.
129    ///
130    /// This returns a builder that can be configured with additional options
131    /// like `name()`, `replace()`, and `train()` before awaiting to execute.
132    ///
133    /// # Parameters
134    /// - `columns`: the columns to build the indices on.
135    /// - `index_type`: specify [`IndexType`].
136    /// - `params`: index parameters.
137    fn create_index_builder<'a>(
138        &'a mut self,
139        columns: &'a [&'a str],
140        index_type: IndexType,
141        params: &'a dyn IndexParams,
142    ) -> Self::IndexBuilder<'a>;
143
144    /// Create indices on columns.
145    ///
146    /// Upon finish, a new dataset version is generated.
147    ///
148    /// Parameters:
149    ///
150    ///  - `columns`: the columns to build the indices on.
151    ///  - `index_type`: specify [`IndexType`].
152    ///  - `name`: optional index name. Must be unique in the dataset.
153    ///            if not provided, it will auto-generate one.
154    ///  - `params`: index parameters.
155    ///  - `replace`: replace the existing index if it exists.
156    async fn create_index(
157        &mut self,
158        columns: &[&str],
159        index_type: IndexType,
160        name: Option<String>,
161        params: &dyn IndexParams,
162        replace: bool,
163    ) -> Result<()>;
164
165    /// Drop indices by name.
166    ///
167    /// Upon finish, a new dataset version is generated.
168    ///
169    /// Parameters:
170    ///
171    /// - `name`: the name of the index to drop.
172    async fn drop_index(&mut self, name: &str) -> Result<()>;
173
174    /// Prewarm an index by name.
175    ///
176    /// This will load the index into memory and cache it.
177    ///
178    /// Generally, this should only be called when it is known the entire index will
179    /// fit into the index cache.
180    ///
181    /// This is a hint that is not enforced by all indices today.  Some indices may choose
182    /// to ignore this hint.
183    async fn prewarm_index(&self, name: &str) -> Result<()>;
184
185    /// Read all indices of this Dataset version.
186    ///
187    /// The indices are lazy loaded and cached in memory within the [`Dataset`] instance.
188    /// The cache is invalidated when the dataset version (Manifest) is changed.
189    async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
190
191    /// Loads all the indies of a given UUID.
192    ///
193    /// Note that it is possible to have multiple indices with the same UUID,
194    /// as they are the deltas of the same index.
195    async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
196        self.load_indices().await.map(|indices| {
197            indices
198                .iter()
199                .find(|idx| idx.uuid.to_string() == uuid)
200                .cloned()
201        })
202    }
203
204    /// Loads a specific index with the given index name
205    ///
206    /// Returns
207    /// -------
208    /// - `Ok(indices)`: if the index exists, returns the index.
209    /// - `Ok(vec![])`: if the index does not exist.
210    /// - `Err(e)`: if there is an error loading indices.
211    ///
212    async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
213        self.load_indices().await.map(|indices| {
214            indices
215                .iter()
216                .filter(|idx| idx.name == name)
217                .cloned()
218                .collect()
219        })
220    }
221
222    /// Loads a specific index with the given index name.
223    /// This function only works for indices that are unique.
224    /// If there are multiple indices sharing the same name, please use [load_indices_by_name]
225    ///
226    /// Returns
227    /// -------
228    /// - `Ok(Some(index))`: if the index exists, returns the index.
229    /// - `Ok(None)`: if the index does not exist.
230    /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
231    ///
232    async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
233        let indices = self.load_indices_by_name(name).await?;
234        if indices.is_empty() {
235            Ok(None)
236        } else if indices.len() == 1 {
237            Ok(Some(indices[0].clone()))
238        } else {
239            Err(Error::Index {
240                message: format!("Found multiple indices of the same name: {:?}, please use load_indices_by_name", 
241                    indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()),
242                location: location!(),
243            })
244        }
245    }
246
247    /// Describes indexes in a dataset
248    ///
249    /// This method should only access the index metadata and should not load the index into memory.
250    ///
251    /// More detailed information may be available from [`index_statistics`] but that will require
252    /// loading the index into memory.
253    async fn describe_indices<'a, 'b>(
254        &'a self,
255        criteria: Option<IndexCriteria<'b>>,
256    ) -> Result<Vec<Arc<dyn IndexDescription>>>;
257
258    /// Loads a specific index with the given index name.
259    async fn load_scalar_index<'a, 'b>(
260        &'a self,
261        criteria: IndexCriteria<'b>,
262    ) -> Result<Option<IndexMetadata>>;
263
264    /// Optimize indices.
265    async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
266
267    /// Find index with a given index_name and return its serialized statistics.
268    ///
269    /// If the index does not exist, return Error.
270    async fn index_statistics(&self, index_name: &str) -> Result<String>;
271
272    async fn commit_existing_index(
273        &mut self,
274        index_name: &str,
275        column: &str,
276        index_id: Uuid,
277    ) -> Result<()>;
278
279    async fn read_index_partition(
280        &self,
281        index_name: &str,
282        partition_id: usize,
283        with_vector: bool,
284    ) -> Result<SendableRecordBatchStream>;
285}