Skip to main content

lance_index/scalar/
registry.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::borrow::Cow;
5use std::sync::Arc;
6
7use arrow_schema::Field;
8use async_trait::async_trait;
9use datafusion::execution::SendableRecordBatchStream;
10use lance_core::{
11    Result,
12    cache::{LanceCache, UnsizedCacheKey},
13};
14
15use crate::progress::IndexBuildProgress;
16use crate::registry::IndexPluginRegistry;
17use crate::{
18    frag_reuse::FragReuseIndex,
19    scalar::{CreatedIndex, IndexStore, ScalarIndex, expression::ScalarQueryParser},
20};
21
/// The column name `"value"`.
///
/// NOTE(review): presumably the name of the training-data column that holds
/// the values being indexed (the "value column" mentioned by
/// `TrainingOrdering`) — confirm against the code that builds the stream.
pub const VALUE_COLUMN_NAME: &str = "value";
23
/// The order in which training input is delivered.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TrainingOrdering {
    /// Input is sorted ascending by the value column.
    Values,
    /// Input is sorted ascending by the address column.
    Addresses,
    /// Input carries no ordering guarantee.
    None,
}
33
34#[derive(Debug, Clone)]
35pub struct TrainingCriteria {
36    pub ordering: TrainingOrdering,
37    pub needs_row_ids: bool,
38    pub needs_row_addrs: bool,
39}
40
41impl TrainingCriteria {
42    pub fn new(ordering: TrainingOrdering) -> Self {
43        Self {
44            ordering,
45            needs_row_ids: false,
46            needs_row_addrs: false,
47        }
48    }
49
50    pub fn with_row_id(mut self) -> Self {
51        self.needs_row_ids = true;
52        self
53    }
54
55    pub fn with_row_addr(mut self) -> Self {
56        self.needs_row_addrs = true;
57        self
58    }
59}
60
61/// A trait that describes what criteria is needed to train an index
62///
63/// The training process has two steps.  First, the parameters are given to the
64/// plugin and it creates a TrainingRequest.  Then, the caller prepares the training
65/// data and calls train_index.
66///
67/// The call to train_index will include the training request.  This allows the plugin
68/// to stash any deserialized parameter info in the request and fetch it later during
69/// training by downcasting to the appropriate type.
70pub trait TrainingRequest: std::any::Any + Send + Sync {
71    fn as_any(&self) -> &dyn std::any::Any;
72    fn criteria(&self) -> &TrainingCriteria;
73}
74
75/// A default training request impl for indexes that don't need any parameters
76pub(crate) struct DefaultTrainingRequest {
77    criteria: TrainingCriteria,
78}
79
80impl DefaultTrainingRequest {
81    pub fn new(criteria: TrainingCriteria) -> Self {
82        Self { criteria }
83    }
84}
85
86impl TrainingRequest for DefaultTrainingRequest {
87    fn as_any(&self) -> &dyn std::any::Any {
88        self
89    }
90
91    fn criteria(&self) -> &TrainingCriteria {
92        &self.criteria
93    }
94}
95
96/// A trait for scalar index plugins
97#[async_trait]
98pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
99    /// Creates a new training request from the given parameters
100    ///
101    /// This training request specifies the criteria that the data must satisfy to train the index.
102    /// For example, does the index require the input data to be sorted?
103    fn new_training_request(&self, params: &str, field: &Field)
104    -> Result<Box<dyn TrainingRequest>>;
105
106    /// Train a new index
107    ///
108    /// The provided data must fulfill all the criteria returned by `training_criteria`.
109    /// It is the caller's responsibility to ensure this.
110    ///
111    /// Returns index details that describe the index.  These details can potentially be
112    /// useful for planning (although this will currently require inside information on
113    /// the index type) and they will need to be provided when loading the index.
114    ///
115    /// It is the caller's responsibility to store these details somewhere.
116    async fn train_index(
117        &self,
118        data: SendableRecordBatchStream,
119        index_store: &dyn IndexStore,
120        request: Box<dyn TrainingRequest>,
121        fragment_ids: Option<Vec<u32>>,
122        progress: Arc<dyn IndexBuildProgress>,
123    ) -> Result<CreatedIndex>;
124
125    /// A short name for the index
126    ///
127    /// This is a friendly name for display purposes and also can be used as an alias for
128    /// the index type URL.  If multiple plugins have the same name, then the first one
129    /// found will be used.
130    ///
131    /// By convention this is MixedCase with no spaces.  When used as an alias, it will be
132    /// compared case-insensitively.
133    fn name(&self) -> &str;
134
135    /// Returns true if the index returns an exact answer (e.g. not AtMost)
136    fn provides_exact_answer(&self) -> bool;
137
138    /// The version of the index plugin
139    ///
140    /// We assume that indexes are not forwards compatible.  If an index was written with a
141    /// newer version than this, it cannot be read
142    fn version(&self) -> u32;
143
144    /// Returns a new query parser for the index
145    ///
146    /// Can return None if this index cannot participate in query optimization
147    fn new_query_parser(
148        &self,
149        index_name: String,
150        index_details: &prost_types::Any,
151    ) -> Option<Box<dyn ScalarQueryParser>>;
152
153    /// Load an index from storage
154    ///
155    /// The index details should match the details that were returned when the index was
156    /// originally trained.
157    async fn load_index(
158        &self,
159        index_store: Arc<dyn IndexStore>,
160        index_details: &prost_types::Any,
161        frag_reuse_index: Option<Arc<FragReuseIndex>>,
162        cache: &LanceCache,
163    ) -> Result<Arc<dyn ScalarIndex>>;
164
165    /// Look up a previously-opened index in the cache.
166    ///
167    /// `cache` is already per-index namespaced by the caller, so a plugin's key
168    /// only needs to disambiguate entries within a single index.
169    ///
170    /// The default implementation reads an in-memory `Arc<dyn ScalarIndex>` entry.
171    /// Plugins whose index has a serializable representation should override this
172    /// (together with [`put_in_cache`](Self::put_in_cache)) to store that
173    /// representation under a sized [`CacheKey`](lance_core::cache::CacheKey) with
174    /// a codec, and reconstruct the index here. `index_store` and
175    /// `frag_reuse_index` are provided so the override can rebuild the index
176    /// without re-reading metadata.
177    async fn get_from_cache(
178        &self,
179        _index_store: Arc<dyn IndexStore>,
180        _frag_reuse_index: Option<Arc<FragReuseIndex>>,
181        cache: &LanceCache,
182    ) -> Result<Option<Arc<dyn ScalarIndex>>> {
183        Ok(cache.get_unsized_with_key(&ScalarIndexCacheKey).await)
184    }
185
186    /// Store a freshly-opened index in the cache.
187    ///
188    /// `cache` is already per-index namespaced; see
189    /// [`get_from_cache`](Self::get_from_cache).
190    ///
191    /// The default implementation stores the `Arc<dyn ScalarIndex>` in-memory.
192    async fn put_in_cache(&self, cache: &LanceCache, index: Arc<dyn ScalarIndex>) -> Result<()> {
193        cache
194            .insert_unsized_with_key(&ScalarIndexCacheKey, index)
195            .await;
196        Ok(())
197    }
198
199    /// Optional hook allowing a plugin to provide statistics without loading the index.
200    async fn load_statistics(
201        &self,
202        _index_store: Arc<dyn IndexStore>,
203        _index_details: &prost_types::Any,
204    ) -> Result<Option<serde_json::Value>> {
205        Ok(None)
206    }
207
208    /// Optional hook that plugins can use if they need to be aware of the registry
209    fn attach_registry(&self, _registry: Arc<IndexPluginRegistry>) {}
210
211    /// Returns a JSON string representation of the provided index details
212    ///
213    /// These details will be user-visible and should be considered part of the public
214    /// API.  As a result, efforts should be made to ensure the information is backwards
215    /// compatible and avoid breaking changes.
216    fn details_as_json(&self, _details: &prost_types::Any) -> Result<serde_json::Value> {
217        // Return an empty JSON object as the default implementation
218        Ok(serde_json::json!({}))
219    }
220}
221
/// In-memory cache key for a whole `Arc<dyn ScalarIndex>`.
///
/// The default [`ScalarIndexPlugin::get_from_cache`] and
/// [`ScalarIndexPlugin::put_in_cache`] implementations use this key.  Since the
/// caller already namespaces the cache per index, one constant key is enough.
/// Trait objects cannot be serialized, so this is an [`UnsizedCacheKey`]
/// without a codec; a plugin that wants a persistable cache entry overrides
/// those two methods and uses a sized key instead.
///
/// The type is a field-less marker, so the common traits are derived to let it
/// be printed, copied and default-constructed like any other public value type.
#[derive(Debug, Clone, Copy, Default)]
pub struct ScalarIndexCacheKey;
231
232impl UnsizedCacheKey for ScalarIndexCacheKey {
233    type ValueType = dyn ScalarIndex;
234
235    fn key(&self) -> Cow<'_, str> {
236        Cow::Borrowed("scalar_index")
237    }
238
239    fn type_name() -> &'static str {
240        "ScalarIndex"
241    }
242}