// lance_index/scalar/registry.rs
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::borrow::Cow;
5use std::sync::Arc;
6
7use arrow_schema::Field;
8use async_trait::async_trait;
9use datafusion::execution::SendableRecordBatchStream;
10use lance_core::{
11 Result,
12 cache::{LanceCache, UnsizedCacheKey},
13};
14
15use crate::progress::IndexBuildProgress;
16use crate::registry::IndexPluginRegistry;
17use crate::{
18 frag_reuse::FragReuseIndex,
19 scalar::{CreatedIndex, IndexStore, ScalarIndex, expression::ScalarQueryParser},
20};
21
/// Name (`"value"`) used for the value column.
///
/// NOTE(review): presumably this is the column of the training stream that holds the
/// values being indexed (the "value column" that [`TrainingOrdering::Values`] refers
/// to) — confirm against the callers that build the stream.
pub const VALUE_COLUMN_NAME: &str = "value";
23
/// The order in which the training input is delivered to an index plugin.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TrainingOrdering {
    /// The input will arrive sorted by the value column in ascending order
    Values,
    /// The input will arrive sorted by the address column in ascending order
    Addresses,
    /// The input will arrive in an arbitrary order
    None,
}
33
34#[derive(Debug, Clone)]
35pub struct TrainingCriteria {
36 pub ordering: TrainingOrdering,
37 pub needs_row_ids: bool,
38 pub needs_row_addrs: bool,
39}
40
41impl TrainingCriteria {
42 pub fn new(ordering: TrainingOrdering) -> Self {
43 Self {
44 ordering,
45 needs_row_ids: false,
46 needs_row_addrs: false,
47 }
48 }
49
50 pub fn with_row_id(mut self) -> Self {
51 self.needs_row_ids = true;
52 self
53 }
54
55 pub fn with_row_addr(mut self) -> Self {
56 self.needs_row_addrs = true;
57 self
58 }
59}
60
61/// A trait that describes what criteria is needed to train an index
62///
63/// The training process has two steps. First, the parameters are given to the
64/// plugin and it creates a TrainingRequest. Then, the caller prepares the training
65/// data and calls train_index.
66///
67/// The call to train_index will include the training request. This allows the plugin
68/// to stash any deserialized parameter info in the request and fetch it later during
69/// training by downcasting to the appropriate type.
70pub trait TrainingRequest: std::any::Any + Send + Sync {
71 fn as_any(&self) -> &dyn std::any::Any;
72 fn criteria(&self) -> &TrainingCriteria;
73}
74
75/// A default training request impl for indexes that don't need any parameters
76pub(crate) struct DefaultTrainingRequest {
77 criteria: TrainingCriteria,
78}
79
80impl DefaultTrainingRequest {
81 pub fn new(criteria: TrainingCriteria) -> Self {
82 Self { criteria }
83 }
84}
85
86impl TrainingRequest for DefaultTrainingRequest {
87 fn as_any(&self) -> &dyn std::any::Any {
88 self
89 }
90
91 fn criteria(&self) -> &TrainingCriteria {
92 &self.criteria
93 }
94}
95
96/// A trait for scalar index plugins
97#[async_trait]
98pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
99 /// Creates a new training request from the given parameters
100 ///
101 /// This training request specifies the criteria that the data must satisfy to train the index.
102 /// For example, does the index require the input data to be sorted?
103 fn new_training_request(&self, params: &str, field: &Field)
104 -> Result<Box<dyn TrainingRequest>>;
105
106 /// Train a new index
107 ///
108 /// The provided data must fulfill all the criteria returned by `training_criteria`.
109 /// It is the caller's responsibility to ensure this.
110 ///
111 /// Returns index details that describe the index. These details can potentially be
112 /// useful for planning (although this will currently require inside information on
113 /// the index type) and they will need to be provided when loading the index.
114 ///
115 /// It is the caller's responsibility to store these details somewhere.
116 async fn train_index(
117 &self,
118 data: SendableRecordBatchStream,
119 index_store: &dyn IndexStore,
120 request: Box<dyn TrainingRequest>,
121 fragment_ids: Option<Vec<u32>>,
122 progress: Arc<dyn IndexBuildProgress>,
123 ) -> Result<CreatedIndex>;
124
125 /// A short name for the index
126 ///
127 /// This is a friendly name for display purposes and also can be used as an alias for
128 /// the index type URL. If multiple plugins have the same name, then the first one
129 /// found will be used.
130 ///
131 /// By convention this is MixedCase with no spaces. When used as an alias, it will be
132 /// compared case-insensitively.
133 fn name(&self) -> &str;
134
135 /// Returns true if the index returns an exact answer (e.g. not AtMost)
136 fn provides_exact_answer(&self) -> bool;
137
138 /// The version of the index plugin
139 ///
140 /// We assume that indexes are not forwards compatible. If an index was written with a
141 /// newer version than this, it cannot be read
142 fn version(&self) -> u32;
143
144 /// Returns a new query parser for the index
145 ///
146 /// Can return None if this index cannot participate in query optimization
147 fn new_query_parser(
148 &self,
149 index_name: String,
150 index_details: &prost_types::Any,
151 ) -> Option<Box<dyn ScalarQueryParser>>;
152
153 /// Load an index from storage
154 ///
155 /// The index details should match the details that were returned when the index was
156 /// originally trained.
157 async fn load_index(
158 &self,
159 index_store: Arc<dyn IndexStore>,
160 index_details: &prost_types::Any,
161 frag_reuse_index: Option<Arc<FragReuseIndex>>,
162 cache: &LanceCache,
163 ) -> Result<Arc<dyn ScalarIndex>>;
164
165 /// Look up a previously-opened index in the cache.
166 ///
167 /// `cache` is already per-index namespaced by the caller, so a plugin's key
168 /// only needs to disambiguate entries within a single index.
169 ///
170 /// The default implementation reads an in-memory `Arc<dyn ScalarIndex>` entry.
171 /// Plugins whose index has a serializable representation should override this
172 /// (together with [`put_in_cache`](Self::put_in_cache)) to store that
173 /// representation under a sized [`CacheKey`](lance_core::cache::CacheKey) with
174 /// a codec, and reconstruct the index here. `index_store` and
175 /// `frag_reuse_index` are provided so the override can rebuild the index
176 /// without re-reading metadata.
177 async fn get_from_cache(
178 &self,
179 _index_store: Arc<dyn IndexStore>,
180 _frag_reuse_index: Option<Arc<FragReuseIndex>>,
181 cache: &LanceCache,
182 ) -> Result<Option<Arc<dyn ScalarIndex>>> {
183 Ok(cache.get_unsized_with_key(&ScalarIndexCacheKey).await)
184 }
185
186 /// Store a freshly-opened index in the cache.
187 ///
188 /// `cache` is already per-index namespaced; see
189 /// [`get_from_cache`](Self::get_from_cache).
190 ///
191 /// The default implementation stores the `Arc<dyn ScalarIndex>` in-memory.
192 async fn put_in_cache(&self, cache: &LanceCache, index: Arc<dyn ScalarIndex>) -> Result<()> {
193 cache
194 .insert_unsized_with_key(&ScalarIndexCacheKey, index)
195 .await;
196 Ok(())
197 }
198
199 /// Optional hook allowing a plugin to provide statistics without loading the index.
200 async fn load_statistics(
201 &self,
202 _index_store: Arc<dyn IndexStore>,
203 _index_details: &prost_types::Any,
204 ) -> Result<Option<serde_json::Value>> {
205 Ok(None)
206 }
207
208 /// Optional hook that plugins can use if they need to be aware of the registry
209 fn attach_registry(&self, _registry: Arc<IndexPluginRegistry>) {}
210
211 /// Returns a JSON string representation of the provided index details
212 ///
213 /// These details will be user-visible and should be considered part of the public
214 /// API. As a result, efforts should be made to ensure the information is backwards
215 /// compatible and avoid breaking changes.
216 fn details_as_json(&self, _details: &prost_types::Any) -> Result<serde_json::Value> {
217 // Return an empty JSON object as the default implementation
218 Ok(serde_json::json!({}))
219 }
220}
221
/// In-memory cache key for a whole `Arc<dyn ScalarIndex>`.
///
/// Used by the default [`ScalarIndexPlugin::get_from_cache`] /
/// [`ScalarIndexPlugin::put_in_cache`] implementations. The cache is already
/// per-index namespaced by the caller, so a constant key suffices. Trait objects
/// cannot be serialized, so this is an [`UnsizedCacheKey`] with no codec —
/// plugins that want a persistable cache entry override those methods with a
/// sized key.
#[derive(Debug)]
pub struct ScalarIndexCacheKey;
231
232impl UnsizedCacheKey for ScalarIndexCacheKey {
233 type ValueType = dyn ScalarIndex;
234
235 fn key(&self) -> Cow<'_, str> {
236 Cow::Borrowed("scalar_index")
237 }
238
239 fn type_name() -> &'static str {
240 "ScalarIndex"
241 }
242}