lance_index/scalar/
registry.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use arrow_schema::Field;
7use async_trait::async_trait;
8use datafusion::execution::SendableRecordBatchStream;
9use lance_core::{cache::LanceCache, Result};
10
11use crate::registry::IndexPluginRegistry;
12use crate::{
13    frag_reuse::FragReuseIndex,
14    scalar::{expression::ScalarQueryParser, CreatedIndex, IndexStore, ScalarIndex},
15};
16
/// The column name `"value"`.
///
/// NOTE(review): presumably the name of the value column in the training data (the
/// column `TrainingOrdering::Values` refers to) -- confirm against the callers.
pub const VALUE_COLUMN_NAME: &str = "value";
18
/// Describes how the training input will be ordered when it arrives
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TrainingOrdering {
    /// Input is sorted ascending by the value column
    Values,
    /// Input is sorted ascending by the address column
    Addresses,
    /// Input comes in no particular order
    None,
}
28
29#[derive(Debug, Clone)]
30pub struct TrainingCriteria {
31    pub ordering: TrainingOrdering,
32    pub needs_row_ids: bool,
33    pub needs_row_addrs: bool,
34}
35
36impl TrainingCriteria {
37    pub fn new(ordering: TrainingOrdering) -> Self {
38        Self {
39            ordering,
40            needs_row_ids: false,
41            needs_row_addrs: false,
42        }
43    }
44
45    pub fn with_row_id(mut self) -> Self {
46        self.needs_row_ids = true;
47        self
48    }
49
50    pub fn with_row_addr(mut self) -> Self {
51        self.needs_row_addrs = true;
52        self
53    }
54}
55
56/// A trait that describes what criteria is needed to train an index
57///
58/// The training process has two steps.  First, the parameters are given to the
59/// plugin and it creates a TrainingRequest.  Then, the caller prepares the training
60/// data and calls train_index.
61///
62/// The call to train_index will include the training request.  This allows the plugin
63/// to stash any deserialized parameter info in the request and fetch it later during
64/// training by downcasting to the appropriate type.
65pub trait TrainingRequest: std::any::Any + Send + Sync {
66    fn as_any(&self) -> &dyn std::any::Any;
67    fn criteria(&self) -> &TrainingCriteria;
68}
69
70/// A default training request impl for indexes that don't need any parameters
71pub(crate) struct DefaultTrainingRequest {
72    criteria: TrainingCriteria,
73}
74
75impl DefaultTrainingRequest {
76    pub fn new(criteria: TrainingCriteria) -> Self {
77        Self { criteria }
78    }
79}
80
81impl TrainingRequest for DefaultTrainingRequest {
82    fn as_any(&self) -> &dyn std::any::Any {
83        self
84    }
85
86    fn criteria(&self) -> &TrainingCriteria {
87        &self.criteria
88    }
89}
90
91/// A trait for scalar index plugins
92#[async_trait]
93pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
94    /// Creates a new training request from the given parameters
95    ///
96    /// This training request specifies the criteria that the data must satisfy to train the index.
97    /// For example, does the index require the input data to be sorted?
98    fn new_training_request(&self, params: &str, field: &Field)
99        -> Result<Box<dyn TrainingRequest>>;
100
101    /// Train a new index
102    ///
103    /// The provided data must fulfill all the criteria returned by `training_criteria`.
104    /// It is the caller's responsibility to ensure this.
105    ///
106    /// Returns index details that describe the index.  These details can potentially be
107    /// useful for planning (although this will currently require inside information on
108    /// the index type) and they will need to be provided when loading the index.
109    ///
110    /// It is the caller's responsibility to store these details somewhere.
111    async fn train_index(
112        &self,
113        data: SendableRecordBatchStream,
114        index_store: &dyn IndexStore,
115        request: Box<dyn TrainingRequest>,
116        fragment_ids: Option<Vec<u32>>,
117    ) -> Result<CreatedIndex>;
118
119    /// A short name for the index
120    ///
121    /// This is a friendly name for display purposes and also can be used as an alias for
122    /// the index type URL.  If multiple plugins have the same name, then the first one
123    /// found will be used.
124    ///
125    /// By convention this is MixedCase with no spaces.  When used as an alias, it will be
126    /// compared case-insensitively.
127    fn name(&self) -> &str;
128
129    /// Returns true if the index returns an exact answer (e.g. not AtMost)
130    fn provides_exact_answer(&self) -> bool;
131
132    /// The version of the index plugin
133    ///
134    /// We assume that indexes are not forwards compatible.  If an index was written with a
135    /// newer version than this, it cannot be read
136    fn version(&self) -> u32;
137
138    /// Returns a new query parser for the index
139    ///
140    /// Can return None if this index cannot participate in query optimization
141    fn new_query_parser(
142        &self,
143        index_name: String,
144        index_details: &prost_types::Any,
145    ) -> Option<Box<dyn ScalarQueryParser>>;
146
147    /// Load an index from storage
148    ///
149    /// The index details should match the details that were returned when the index was
150    /// originally trained.
151    async fn load_index(
152        &self,
153        index_store: Arc<dyn IndexStore>,
154        index_details: &prost_types::Any,
155        frag_reuse_index: Option<Arc<FragReuseIndex>>,
156        cache: &LanceCache,
157    ) -> Result<Arc<dyn ScalarIndex>>;
158
159    /// Optional hook that plugins can use if they need to be aware of the registry
160    fn attach_registry(&self, _registry: Arc<IndexPluginRegistry>) {}
161
162    /// Returns a JSON string representation of the provided index details
163    ///
164    /// These details will be user-visible and should be considered part of the public
165    /// API.  As a result, efforts should be made to ensure the information is backwards
166    /// compatible and avoid breaking changes.
167    fn details_as_json(&self, _details: &prost_types::Any) -> Result<serde_json::Value> {
168        // Return an empty JSON object as the default implementation
169        Ok(serde_json::json!({}))
170    }
171}