1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::sync::Arc;
use async_trait::async_trait;
use datafusion::execution::SendableRecordBatchStream;
use lance_core::{Error, Result};
use crate::{IndexParams, IndexType, optimize::OptimizeOptions, types::IndexSegment};
use lance_table::format::IndexMetadata;
/// A set of criteria used to filter potential indices to use for a query
///
/// Every criterion is optional. The [`Default`] value leaves all of them
/// unset (`None` / `false`); use the builder-style methods on this type to
/// switch individual criteria on.
///
/// The type only holds borrowed strings and flags, so it is cheap to copy
/// and can be compared for equality.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct IndexCriteria<'a> {
    /// Only consider indices for this column (this also means the index
    /// maps to a single column)
    pub for_column: Option<&'a str>,
    /// Only consider indices with this name
    pub has_name: Option<&'a str>,
    /// If true, only consider indices that support FTS
    pub must_support_fts: bool,
    /// If true, only consider indices that support exact equality
    pub must_support_exact_equality: bool,
}
impl<'a> IndexCriteria<'a> {
    /// Restrict the search to indices built on `column`.
    ///
    /// Setting this also implies that the index maps to a single column.
    /// Consumes the criteria and returns the updated value so calls can be
    /// chained.
    pub fn for_column(self, column: &'a str) -> Self {
        Self {
            for_column: Some(column),
            ..self
        }
    }

    /// Restrict the search to indices called `name`.
    ///
    /// Consumes the criteria and returns the updated value so calls can be
    /// chained.
    pub fn with_name(self, name: &'a str) -> Self {
        Self {
            has_name: Some(name),
            ..self
        }
    }

    /// Restrict the search to indices that support FTS.
    ///
    /// Consumes the criteria and returns the updated value so calls can be
    /// chained.
    pub fn supports_fts(self) -> Self {
        Self {
            must_support_fts: true,
            ..self
        }
    }

    /// Restrict the search to indices that support exact equality.
    ///
    /// This will disqualify, for example, the ngram and inverted indices
    /// or an index like a bloom filter.
    ///
    /// Consumes the criteria and returns the updated value so calls can be
    /// chained.
    pub fn supports_exact_equality(self) -> Self {
        Self {
            must_support_exact_equality: true,
            ..self
        }
    }
}
/// Deprecated alias for [`IndexCriteria`].
///
/// New code should refer to [`IndexCriteria`] directly.
#[deprecated(since = "0.39.0", note = "Use IndexCriteria instead")]
pub type ScalarIndexCriteria<'a> = IndexCriteria<'a>;
/// Additional information about an index
///
/// Note that a single index might consist of multiple segments. Each segment has its own
/// UUID and collection of files and covers some subset of the data fragments.
///
/// All segments in an index should have the same index type and index details.
///
/// Implementations must be `Send + Sync`.
pub trait IndexDescription: Send + Sync {
/// Returns the index name
///
/// This is the user-defined name of the index. It is shared by all segments of the index
/// and is what is used to refer to the index in the API. It is guaranteed to be unique
/// within the dataset.
fn name(&self) -> &str;
/// Returns the index metadata
///
/// This is the raw metadata information stored in the manifest. There is one
/// [`IndexMetadata`] entry in the returned slice for each segment of the index.
fn metadata(&self) -> &[IndexMetadata];
/// Returns the index type URL
///
/// This is extracted from the type URL of the index details.
fn type_url(&self) -> &str;
/// Returns the index type
///
/// This is a short string identifier that is friendlier than the type URL but not
/// guaranteed to be unique.
///
/// This is calculated by the plugin and will be "Unknown" if no plugin could be found
/// for the type URL.
fn index_type(&self) -> &str;
/// Returns the number of rows indexed by the index, across all segments.
///
/// This is an approximate count and may include rows that have been
/// deleted.
fn rows_indexed(&self) -> u64;
/// Returns the ids of the fields that the index is built on.
fn field_ids(&self) -> &[u32];
/// Returns a JSON string representation of the index details
///
/// The format of these details will vary depending on the index type and,
/// since indexes can be provided by plugins, we cannot fully define it here.
///
/// However, plugins should do their best to maintain backwards compatibility
/// and consider this method part of the public API.
///
/// See individual index plugins for more description of the expected format.
///
/// # Errors
///
/// The conversion from Any to JSON is controlled by the index
/// plugin. As a result, this method may fail if there is no plugin
/// available for the index.
fn details(&self) -> Result<String>;
/// Returns the total size in bytes of all files across all segments.
///
/// Returns `None` if file size information is not available for any segment
/// (for backward compatibility with indices created before file tracking was added).
fn total_size_bytes(&self) -> Option<u64>;
}
// Extends Lance Dataset with secondary index.
#[async_trait]
pub trait DatasetIndexExt {
type IndexBuilder<'a>
where
Self: 'a;
type IndexSegmentBuilder<'a>
where
Self: 'a;
/// Create a builder for creating an index on columns.
///
/// This returns a builder that can be configured with additional options
/// like `name()`, `replace()`, and `train()` before awaiting to execute.
///
/// # Parameters
/// - `columns`: the columns to build the indices on.
/// - `index_type`: specify [`IndexType`].
/// - `params`: index parameters.
fn create_index_builder<'a>(
&'a mut self,
columns: &'a [&'a str],
index_type: IndexType,
params: &'a dyn IndexParams,
) -> Self::IndexBuilder<'a>;
/// Create a builder for building physical index segments from uncommitted
/// vector index outputs.
///
/// The caller supplies the uncommitted index metadata returned by
/// `execute_uncommitted()` so the builder can plan segment grouping without
/// rediscovering fragment coverage.
///
/// This is the canonical entry point for distributed vector segment build.
/// After building the physical segments, publish them as a
/// logical index with [`Self::commit_existing_index_segments`].
fn create_index_segment_builder<'a>(&'a self) -> Self::IndexSegmentBuilder<'a>;
/// Create indices on columns.
///
/// Upon finish, a new dataset version is generated.
///
/// Parameters:
///
/// - `columns`: the columns to build the indices on.
/// - `index_type`: specify [`IndexType`].
/// - `name`: optional index name. Must be unique in the dataset.
/// if not provided, it will auto-generate one.
/// - `params`: index parameters.
/// - `replace`: replace the existing index if it exists.
///
/// Returns the metadata of the created index.
async fn create_index(
&mut self,
columns: &[&str],
index_type: IndexType,
name: Option<String>,
params: &dyn IndexParams,
replace: bool,
) -> Result<IndexMetadata>;
/// Drop indices by name.
///
/// Upon finish, a new dataset version is generated.
///
/// Parameters:
///
/// - `name`: the name of the index to drop.
async fn drop_index(&mut self, name: &str) -> Result<()>;
/// Prewarm an index by name.
///
/// This will load the index into memory and cache it.
///
/// Generally, this should only be called when it is known the entire index will
/// fit into the index cache.
///
/// This is a hint that is not enforced by all indices today. Some indices may choose
/// to ignore this hint.
async fn prewarm_index(&self, name: &str) -> Result<()>;
/// Read all indices of this Dataset version.
///
/// The indices are lazy loaded and cached in memory within the `Dataset` instance.
/// The cache is invalidated when the dataset version (Manifest) is changed.
async fn load_indices(&self) -> Result<Arc<Vec<IndexMetadata>>>;
/// Loads all the indies of a given UUID.
///
/// Note that it is possible to have multiple indices with the same UUID,
/// as they are the deltas of the same index.
async fn load_index(&self, uuid: &str) -> Result<Option<IndexMetadata>> {
self.load_indices().await.map(|indices| {
indices
.iter()
.find(|idx| idx.uuid.to_string() == uuid)
.cloned()
})
}
/// Loads a specific index with the given index name
///
/// Returns
/// -------
/// - `Ok(indices)`: if the index exists, returns the index.
/// - `Ok(vec![])`: if the index does not exist.
/// - `Err(e)`: if there is an error loading indices.
///
async fn load_indices_by_name(&self, name: &str) -> Result<Vec<IndexMetadata>> {
self.load_indices().await.map(|indices| {
indices
.iter()
.filter(|idx| idx.name == name)
.cloned()
.collect()
})
}
/// Loads a specific index with the given index name.
/// This function only works for indices that are unique.
/// If there are multiple indices sharing the same name, please use [`Self::load_indices_by_name`]
///
/// Returns
/// -------
/// - `Ok(Some(index))`: if the index exists, returns the index.
/// - `Ok(None)`: if the index does not exist.
/// - `Err(e)`: Index error if there are multiple indexes sharing the same name.
///
async fn load_index_by_name(&self, name: &str) -> Result<Option<IndexMetadata>> {
let indices = self.load_indices_by_name(name).await?;
if indices.is_empty() {
Ok(None)
} else if indices.len() == 1 {
Ok(Some(indices[0].clone()))
} else {
Err(Error::index(format!(
"Found multiple indices of the same name: {:?}, please use load_indices_by_name",
indices.iter().map(|idx| &idx.name).collect::<Vec<_>>()
)))
}
}
/// Describes indexes in a dataset
///
/// This method should only access the index metadata and should not load the index into memory.
///
/// More detailed information may be available from `index_statistics` but that will require
/// loading the index into memory.
async fn describe_indices<'a, 'b>(
&'a self,
criteria: Option<IndexCriteria<'b>>,
) -> Result<Vec<Arc<dyn IndexDescription>>>;
/// Loads a specific index with the given index name.
async fn load_scalar_index<'a, 'b>(
&'a self,
criteria: IndexCriteria<'b>,
) -> Result<Option<IndexMetadata>>;
/// Optimize indices.
async fn optimize_indices(&mut self, options: &OptimizeOptions) -> Result<()>;
/// Find index with a given index_name and return its serialized statistics.
///
/// If the index does not exist, return Error.
async fn index_statistics(&self, index_name: &str) -> Result<String>;
/// Commit one or more existing physical index segments as a logical index.
///
/// This publishes already-built physical segments. It does not build
/// or merge index data; callers should first build segments with
/// [`Self::create_index_segment_builder`] or another index-specific build
/// path and then pass the resulting segments here.
async fn commit_existing_index_segments(
&mut self,
index_name: &str,
column: &str,
segments: Vec<IndexSegment>,
) -> Result<()>;
async fn read_index_partition(
&self,
index_name: &str,
partition_id: usize,
with_vector: bool,
) -> Result<SendableRecordBatchStream>;
}