lance_index/scalar/registry.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::sync::Arc;
5
6use arrow_schema::Field;
7use async_trait::async_trait;
8use datafusion::execution::SendableRecordBatchStream;
9use lance_core::{cache::LanceCache, Result};
10
11use crate::registry::IndexPluginRegistry;
12use crate::{
13 frag_reuse::FragReuseIndex,
14 scalar::{expression::ScalarQueryParser, CreatedIndex, IndexStore, ScalarIndex},
15};
16
/// The column name `"value"`.
///
/// NOTE(review): presumably the name of the value column in the training data
/// handed to scalar index plugins -- confirm against the callers.
pub const VALUE_COLUMN_NAME: &str = "value";
18
/// The order in which the training input is delivered
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TrainingOrdering {
    /// Input arrives sorted ascending by the value column
    Values,
    /// Input arrives sorted ascending by the address column
    Addresses,
    /// Input arrives in no particular (arbitrary) order
    None,
}
28
29#[derive(Debug, Clone)]
30pub struct TrainingCriteria {
31 pub ordering: TrainingOrdering,
32 pub needs_row_ids: bool,
33 pub needs_row_addrs: bool,
34}
35
36impl TrainingCriteria {
37 pub fn new(ordering: TrainingOrdering) -> Self {
38 Self {
39 ordering,
40 needs_row_ids: false,
41 needs_row_addrs: false,
42 }
43 }
44
45 pub fn with_row_id(mut self) -> Self {
46 self.needs_row_ids = true;
47 self
48 }
49
50 pub fn with_row_addr(mut self) -> Self {
51 self.needs_row_addrs = true;
52 self
53 }
54}
55
56/// A trait that describes what criteria is needed to train an index
57///
58/// The training process has two steps. First, the parameters are given to the
59/// plugin and it creates a TrainingRequest. Then, the caller prepares the training
60/// data and calls train_index.
61///
62/// The call to train_index will include the training request. This allows the plugin
63/// to stash any deserialized parameter info in the request and fetch it later during
64/// training by downcasting to the appropriate type.
65pub trait TrainingRequest: std::any::Any + Send + Sync {
66 fn as_any(&self) -> &dyn std::any::Any;
67 fn criteria(&self) -> &TrainingCriteria;
68}
69
70/// A default training request impl for indexes that don't need any parameters
71pub(crate) struct DefaultTrainingRequest {
72 criteria: TrainingCriteria,
73}
74
75impl DefaultTrainingRequest {
76 pub fn new(criteria: TrainingCriteria) -> Self {
77 Self { criteria }
78 }
79}
80
81impl TrainingRequest for DefaultTrainingRequest {
82 fn as_any(&self) -> &dyn std::any::Any {
83 self
84 }
85
86 fn criteria(&self) -> &TrainingCriteria {
87 &self.criteria
88 }
89}
90
91/// A trait for scalar index plugins
92#[async_trait]
93pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
94 /// Creates a new training request from the given parameters
95 ///
96 /// This training request specifies the criteria that the data must satisfy to train the index.
97 /// For example, does the index require the input data to be sorted?
98 fn new_training_request(&self, params: &str, field: &Field)
99 -> Result<Box<dyn TrainingRequest>>;
100
101 /// Train a new index
102 ///
103 /// The provided data must fulfill all the criteria returned by `training_criteria`.
104 /// It is the caller's responsibility to ensure this.
105 ///
106 /// Returns index details that describe the index. These details can potentially be
107 /// useful for planning (although this will currently require inside information on
108 /// the index type) and they will need to be provided when loading the index.
109 ///
110 /// It is the caller's responsibility to store these details somewhere.
111 async fn train_index(
112 &self,
113 data: SendableRecordBatchStream,
114 index_store: &dyn IndexStore,
115 request: Box<dyn TrainingRequest>,
116 fragment_ids: Option<Vec<u32>>,
117 ) -> Result<CreatedIndex>;
118
119 /// A short name for the index
120 ///
121 /// This is a friendly name for display purposes and also can be used as an alias for
122 /// the index type URL. If multiple plugins have the same name, then the first one
123 /// found will be used.
124 ///
125 /// By convention this is MixedCase with no spaces. When used as an alias, it will be
126 /// compared case-insensitively.
127 fn name(&self) -> &str;
128
129 /// Returns true if the index returns an exact answer (e.g. not AtMost)
130 fn provides_exact_answer(&self) -> bool;
131
132 /// The version of the index plugin
133 ///
134 /// We assume that indexes are not forwards compatible. If an index was written with a
135 /// newer version than this, it cannot be read
136 fn version(&self) -> u32;
137
138 /// Returns a new query parser for the index
139 ///
140 /// Can return None if this index cannot participate in query optimization
141 fn new_query_parser(
142 &self,
143 index_name: String,
144 index_details: &prost_types::Any,
145 ) -> Option<Box<dyn ScalarQueryParser>>;
146
147 /// Load an index from storage
148 ///
149 /// The index details should match the details that were returned when the index was
150 /// originally trained.
151 async fn load_index(
152 &self,
153 index_store: Arc<dyn IndexStore>,
154 index_details: &prost_types::Any,
155 frag_reuse_index: Option<Arc<FragReuseIndex>>,
156 cache: &LanceCache,
157 ) -> Result<Arc<dyn ScalarIndex>>;
158
159 /// Optional hook that plugins can use if they need to be aware of the registry
160 fn attach_registry(&self, _registry: Arc<IndexPluginRegistry>) {}
161
162 /// Returns a JSON string representation of the provided index details
163 ///
164 /// These details will be user-visible and should be considered part of the public
165 /// API. As a result, efforts should be made to ensure the information is backwards
166 /// compatible and avoid breaking changes.
167 fn details_as_json(&self, _details: &prost_types::Any) -> Result<serde_json::Value> {
168 // Return an empty JSON object as the default implementation
169 Ok(serde_json::json!({}))
170 }
171}