Skip to main content

lance_index/
traits.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9
10use crate::{IndexParams, IndexType, optimize::OptimizeOptions};
11use lance_table::format::IndexMetadata;
12use uuid::Uuid;
13
/// A set of criteria used to filter potential indices to use for a query
///
/// The [`Default`] value applies no constraint at all.  Each builder method
/// consumes the criteria and returns it with one more constraint applied, so
/// calls can be chained:
/// `IndexCriteria::default().for_column("text").supports_fts()`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}

impl<'a> IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    ///
    /// Replaces any column set by a previous call.
    #[must_use]
    pub fn for_column(mut self, column: &'a str) -> Self {
        self.for_column = Some(column);
        self
    }

    /// Only consider indices with this name
    ///
    /// Replaces any name set by a previous call.
    #[must_use]
    pub fn with_name(mut self, name: &'a str) -> Self {
        self.has_name = Some(name);
        self
    }

    /// Only consider indices that support FTS
    #[must_use]
    pub fn supports_fts(mut self) -> Self {
        self.must_support_fts = true;
        self
    }

    /// Only consider indices that support exact equality
    ///
    /// This will disqualify, for example, the ngram and inverted indices
    /// or an index like a bloom filter
    #[must_use]
    pub fn supports_exact_equality(mut self) -> Self {
        self.must_support_exact_equality = true;
        self
    }
}
57
58#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
59pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
60
61/// Additional information about an index
62///
63/// Note that a single index might consist of multiple segments.  Each segment has its own
64/// UUID and collection of files and covers some subset of the data fragments.
65///
66/// All segments in an index should have the same index type and index details.
67pub trait IndexDescription: Send + Sync {
68    /// Returns the index name
69    ///
70    /// This is the user-defined name of the index.  It is shared by all segments of the index
71    /// and is what is used to refer to the index in the API.  It is guaranteed to be unique
72    /// within the dataset.
73    fn name(&self) -> &str;
74
75    /// Returns the index metadata
76    ///
77    /// This is the raw metadata information stored in the manifest.  There is one
78    /// IndexMetadata for each segment of the index.
79    fn metadata(&self) -> &[IndexMetadata];
80
81    /// Returns the index type URL
82    ///
83    /// This is extracted from the type url of the index details
84    fn type_url(&self) -> &str;
85
86    /// Returns the index type
87    ///
88    /// This is a short string identifier that is friendlier than the type URL but not
89    /// guaranteed to be unique.
90    ///
91    /// This is calculated by the plugin and will be "Unknown" if no plugin could be found
92    /// for the type URL.
93    fn index_type(&self) -> &str;
94
95    /// Returns the number of rows indexed by the index, across all segments.
96    ///
97    /// This is an approximate count and may include rows that have been
98    /// deleted.
99    fn rows_indexed(&self) -> u64;
100
101    /// Returns the ids of the fields that the index is built on.
102    fn field_ids(&self) -> &[u32];
103
104    /// Returns a JSON string representation of the index details
105    ///
106    /// The format of these details will vary depending on the index type and
107    /// since indexes can be provided by plugins we cannot fully define it here.
108    ///
109    /// However, plugins should do their best to maintain backwards compatibility
110    /// and consider this method part of the public API.
111    ///
112    /// See individual index plugins for more description of the expected format.
113    ///
114    /// The conversion from Any to JSON is controlled by the index
115    /// plugin.  As a result, this method may fail if there is no plugin
116    /// available for the index.
117    fn details(&self) -> Result<String>;
118}
119
120// Extends Lance Dataset with secondary index.
121#[async_trait]
122pub trait DatasetIndexExt {
123    type IndexBuilder<'a>
124    where
125        Self: 'a;
126
127    /// Create a builder for creating an index on columns.
128    ///
129    /// This returns a builder that can be configured with additional options
130    /// like `name()`, `replace()`, and `train()` before awaiting to execute.
131    ///
132    /// # Parameters
133    /// - `columns`: the columns to build the indices on.
134    /// - `index_type`: specify [`IndexType`].
135    /// - `params`: index parameters.
136    fn create_index_builder<'a>(
137        &'a mut self,
138        columns: &'a [&'a str],
139        index_type: IndexType,
140        params: &'a dyn IndexParams,
141    ) -> Self::IndexBuilder<'a>;
142
143    /// Create indices on columns.
144    ///
145    /// Upon finish, a new dataset version is generated.
146    ///
147    /// Parameters:
148    ///
149    ///  - `columns`: the columns to build the indices on.
150    ///  - `index_type`: specify [`IndexType`].
151    ///  - `name`: optional index name. Must be unique in the dataset.
152    ///            if not provided, it will auto-generate one.
153    ///  - `params`: index parameters.
154    ///  - `replace`: replace the existing index if it exists.
155    ///
156    /// Returns the metadata of the created index.
157    async fn create_index(
158        &mut self,
159        columns: &[&str],
160        index_type: IndexType,
161        name: Option<String>,
162        params: &dyn IndexParams,
163        replace: bool,
164    ) -> Result<IndexMetadata>;
165
166    /// Drop indices by name.
167    ///
168    /// Upon finish, a new dataset version is generated.
169    ///
170    /// Parameters:
171    ///
172    /// - `name`: the name of the index to drop.
173    async fn drop_index(&mut self, name: &str) -> Result<()>;
174
175    /// Prewarm an index by name.
176    ///
177    /// This will load the index into memory and cache it.
178    ///
179    /// Generally, this should only be called when it is known the entire index will
180    /// fit into the index cache.
181    ///
182    /// This is a hint that is not enforced by all indices today.  Some indices may choose
183    /// to ignore this hint.
184    async fn prewarm_index(&self, name: &str) -> Result<()>;
185
186    /// Read all indices of this Dataset version.
187    ///
188    /// The indices are lazy loaded and cached in memory within the `Dataset` instance.
189    /// The cache is invalidated when the dataset version (Manifest) is changed.
190    async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
191
192    /// Loads all the indies of a given UUID.
193    ///
194    /// Note that it is possible to have multiple indices with the same UUID,
195    /// as they are the deltas of the same index.
196    async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
197        self.load_indices().await.map(|indices| {
198            indices
199                .iter()
200                .find(|idx| idx.uuid.to_string() == uuid)
201                .cloned()
202        })
203    }
204
205    /// Loads a specific index with the given index name
206    ///
207    /// Returns
208    /// -------
209    /// - `Ok(indices)`: if the index exists, returns the index.
210    /// - `Ok(vec![])`: if the index does not exist.
211    /// - `Err(e)`: if there is an error loading indices.
212    ///
213    async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
214        self.load_indices().await.map(|indices| {
215            indices
216                .iter()
217                .filter(|idx| idx.name == name)
218                .cloned()
219                .collect()
220        })
221    }
222
223    /// Loads a specific index with the given index name.
224    /// This function only works for indices that are unique.
225    /// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
226    ///
227    /// Returns
228    /// -------
229    /// - `Ok(Some(index))`: if the index exists, returns the index.
230    /// - `Ok(None)`: if the index does not exist.
231    /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
232    ///
233    async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
234        let indices = self.load_indices_by_name(name).await?;
235        if indices.is_empty() {
236            Ok(None)
237        } else if indices.len() == 1 {
238            Ok(Some(indices[0].clone()))
239        } else {
240            Err(Error::index(format!(
241                "Found multiple indices of the same name: {:?}, please use load_indices_by_name",
242                indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
243            )))
244        }
245    }
246
247    /// Describes indexes in a dataset
248    ///
249    /// This method should only access the index metadata and should not load the index into memory.
250    ///
251    /// More detailed information may be available from `index_statistics` but that will require
252    /// loading the index into memory.
253    async fn describe_indices<'a, 'b>(
254        &'a self,
255        criteria: Option<IndexCriteria<'b>>,
256    ) -> Result<Vec<Arc<dyn IndexDescription>>>;
257
258    /// Loads a specific index with the given index name.
259    async fn load_scalar_index<'a, 'b>(
260        &'a self,
261        criteria: IndexCriteria<'b>,
262    ) -> Result<Option<IndexMetadata>>;
263
264    /// Optimize indices.
265    async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
266
267    /// Find index with a given index_name and return its serialized statistics.
268    ///
269    /// If the index does not exist, return Error.
270    async fn index_statistics(&self, index_name: &str) -> Result<String>;
271
272    async fn commit_existing_index(
273        &mut self,
274        index_name: &str,
275        column: &str,
276        index_id: Uuid,
277    ) -> Result<()>;
278
279    async fn read_index_partition(
280        &self,
281        index_name: &str,
282        partition_id: usize,
283        with_vector: bool,
284    ) -> Result<SendableRecordBatchStream>;
285}