// lance_index/traits.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use async_trait::async_trait;
7use datafusion::execution::SendableRecordBatchStream;
8use lance_core::{Error, Result};
9use snafu::location;
10
11use crate::{optimize::OptimizeOptions, scalar::ScalarIndexType, IndexParams, IndexType};
12use lance_table::format::Index;
13use uuid::Uuid;
14
15/// A set of criteria used to filter potential indices to use for a query
16#[derive(Debug, Default)]
17pub struct ScalarIndexCriteria<'a> {
18 /// Only consider indices for this column (this also means the index
19 /// maps to a single column)
20 pub for_column: Option<&'a str>,
21 /// Only consider indices with this name
22 pub has_name: Option<&'a str>,
23 /// Only consider indices with this type
24 pub has_type: Option<ScalarIndexType>,
25 /// Only consider indices that support exact equality
26 pub supports_exact_equality: bool,
27}
28
29impl<'a> ScalarIndexCriteria<'a> {
30 /// Only consider indices for this column (this also means the index
31 /// maps to a single column)
32 pub fn for_column(mut self, column: &'a str) -> Self {
33 self.for_column = Some(column);
34 self
35 }
36
37 /// Only consider indices with this name
38 pub fn with_name(mut self, name: &'a str) -> Self {
39 self.has_name = Some(name);
40 self
41 }
42
43 /// Only consider indices with this type
44 pub fn with_type(mut self, ty: ScalarIndexType) -> Self {
45 self.has_type = Some(ty);
46 self
47 }
48
49 /// Only consider indices that support exact equality
50 ///
51 /// This will disqualify, for example, the ngram and inverted indices
52 /// or an index like a bloom filter
53 pub fn supports_exact_equality(mut self) -> Self {
54 self.supports_exact_equality = true;
55 self
56 }
57}
58
59// Extends Lance Dataset with secondary index.
60#[async_trait]
61pub trait DatasetIndexExt {
62 /// Create indices on columns.
63 ///
64 /// Upon finish, a new dataset version is generated.
65 ///
66 /// Parameters:
67 ///
68 /// - `columns`: the columns to build the indices on.
69 /// - `index_type`: specify [`IndexType`].
70 /// - `name`: optional index name. Must be unique in the dataset.
71 /// if not provided, it will auto-generate one.
72 /// - `params`: index parameters.
73 /// - `replace`: replace the existing index if it exists.
74 async fn create_index(
75 &mut self,
76 columns: &[&str],
77 index_type: IndexType,
78 name: Option<String>,
79 params: &dyn IndexParams,
80 replace: bool,
81 ) -> Result<()>;
82
83 /// Drop indices by name.
84 ///
85 /// Upon finish, a new dataset version is generated.
86 ///
87 /// Parameters:
88 ///
89 /// - `name`: the name of the index to drop.
90 async fn drop_index(&mut self, name: &str) -> Result<()>;
91
92 /// Prewarm an index by name.
93 ///
94 /// This will load the index into memory and cache it.
95 ///
96 /// Generally, this should only be called when it is known the entire index will
97 /// fit into the index cache.
98 ///
99 /// This is a hint that is not enforced by all indices today. Some indices may choose
100 /// to ignore this hint.
101 async fn prewarm_index(&self, name: &str) -> Result<()>;
102
103 /// Read all indices of this Dataset version.
104 ///
105 /// The indices are lazy loaded and cached in memory within the [`Dataset`] instance.
106 /// The cache is invalidated when the dataset version (Manifest) is changed.
107 async fn load_indices(&self) -> Result<Arc<Vec<Index>>>;
108
109 /// Loads all the indies of a given UUID.
110 ///
111 /// Note that it is possible to have multiple indices with the same UUID,
112 /// as they are the deltas of the same index.
113 async fn load_index(&self, uuid: &str) -> Result<Option<Index>> {
114 self.load_indices().await.map(|indices| {
115 indices
116 .iter()
117 .find(|idx| idx.uuid.to_string() == uuid)
118 .cloned()
119 })
120 }
121
122 /// Loads a specific index with the given index name
123 ///
124 /// Returns
125 /// -------
126 /// - `Ok(indices)`: if the index exists, returns the index.
127 /// - `Ok(vec![])`: if the index does not exist.
128 /// - `Err(e)`: if there is an error loading indices.
129 ///
130 async fn load_indices_by_name(&self, name: &str) -> Result<Vec<Index>> {
131 self.load_indices().await.map(|indices| {
132 indices
133 .iter()
134 .filter(|idx| idx.name == name)
135 .cloned()
136 .collect()
137 })
138 }
139
140 /// Loads a specific index with the given index name.
141 /// This function only works for indices that are unique.
142 /// If there are multiple indices sharing the same name, please use [load_indices_by_name]
143 ///
144 /// Returns
145 /// -------
146 /// - `Ok(Some(index))`: if the index exists, returns the index.
147 /// - `Ok(None)`: if the index does not exist.
148 /// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
149 ///
150 async fn load_index_by_name(&self, name: &str) -> Result<Option<Index>> {
151 let indices = self.load_indices_by_name(name).await?;
152 if indices.is_empty() {
153 Ok(None)
154 } else if indices.len() == 1 {
155 Ok(Some(indices[0].clone()))
156 } else {
157 Err(Error::Index {
158 message: format!("Found multiple indices of the same name: {:?}, please use load_indices_by_name",
159 indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()),
160 location: location!(),
161 })
162 }
163 }
164
165 /// Loads a specific index with the given index name.
166 async fn load_scalar_index<'a, 'b>(
167 &'a self,
168 criteria: ScalarIndexCriteria<'b>,
169 ) -> Result<Option<Index>>;
170
171 /// Optimize indices.
172 async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
173
174 /// Find index with a given index_name and return its serialized statistics.
175 ///
176 /// If the index does not exist, return Error.
177 async fn index_statistics(&self, index_name: &str) -> Result<String>;
178
179 async fn commit_existing_index(
180 &mut self,
181 index_name: &str,
182 column: &str,
183 index_id: Uuid,
184 ) -> Result<()>;
185
186 async fn read_index_partition(
187 &self,
188 index_name: &str,
189 partition_id: usize,
190 with_vector: bool,
191 ) -> Result<SendableRecordBatchStream>;
192}