lance_index/
scalar.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Scalar indices for metadata search & filtering
5
6use std::collections::{HashMap, HashSet};
7use std::fmt::Debug;
8use std::{any::Any, ops::Bound, sync::Arc};
9
10use arrow::buffer::{OffsetBuffer, ScalarBuffer};
11use arrow_array::{ListArray, RecordBatch};
12use arrow_schema::{Field, Schema};
13use async_trait::async_trait;
14use datafusion::functions::string::contains::ContainsFunc;
15use datafusion::functions_array::array_has;
16use datafusion::physical_plan::SendableRecordBatchStream;
17use datafusion_common::{scalar::ScalarValue, Column};
18
19use datafusion_expr::expr::ScalarFunction;
20use datafusion_expr::Expr;
21use deepsize::DeepSizeOf;
22use inverted::query::{fill_fts_query_column, FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery};
23use lance_core::utils::mask::RowIdTreeMap;
24use lance_core::{Error, Result};
25use serde::Serialize;
26use snafu::location;
27
28use crate::metrics::MetricsCollector;
29use crate::scalar::registry::TrainingCriteria;
30use crate::{Index, IndexParams, IndexType};
31
32pub mod bitmap;
33pub mod btree;
34pub mod expression;
35pub mod flat;
36pub mod inverted;
37pub mod json;
38pub mod label_list;
39pub mod lance_format;
40pub mod ngram;
41pub mod registry;
42pub mod zonemap;
43
44use crate::frag_reuse::FragReuseIndex;
45pub use inverted::tokenizer::InvertedIndexParams;
46use lance_datafusion::udf::CONTAINS_TOKENS_UDF;
47
/// Index name reported by [`ScalarIndexParams`] through `IndexParams::index_name`
pub const LANCE_SCALAR_INDEX: &str = "__lance_scalar_index";
49
/// Builtin index types supported by the Lance library
///
/// This is primarily for convenience to avoid a bunch of string
/// constants and provide some auto-complete.  This type should not
/// be used in the manifest as plugins cannot add new entries.
///
/// Each variant maps to a lowercase name through [`BuiltinIndexType::as_str`].
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub enum BuiltinIndexType {
    /// Named `"btree"`
    BTree,
    /// Named `"bitmap"`
    Bitmap,
    /// Named `"labellist"`
    LabelList,
    /// Named `"ngram"`
    NGram,
    /// Named `"zonemap"`
    ZoneMap,
    /// Named `"inverted"`
    Inverted,
}
64
65impl BuiltinIndexType {
66    pub fn as_str(&self) -> &str {
67        match self {
68            Self::BTree => "btree",
69            Self::Bitmap => "bitmap",
70            Self::LabelList => "labellist",
71            Self::NGram => "ngram",
72            Self::ZoneMap => "zonemap",
73            Self::Inverted => "inverted",
74        }
75    }
76}
77
78impl TryFrom<IndexType> for BuiltinIndexType {
79    type Error = Error;
80
81    fn try_from(value: IndexType) -> Result<Self> {
82        match value {
83            IndexType::BTree => Ok(Self::BTree),
84            IndexType::Bitmap => Ok(Self::Bitmap),
85            IndexType::LabelList => Ok(Self::LabelList),
86            IndexType::NGram => Ok(Self::NGram),
87            IndexType::ZoneMap => Ok(Self::ZoneMap),
88            IndexType::Inverted => Ok(Self::Inverted),
89            _ => Err(Error::Index {
90                message: "Invalid index type".to_string(),
91                location: location!(),
92            }),
93        }
94    }
95}
96
/// Parameters describing a scalar index to create: the index type and its optional
/// training parameters
#[derive(Debug, Clone, PartialEq)]
pub struct ScalarIndexParams {
    /// The type of index to create
    ///
    /// Builtin indexes are "btree", "ngram", "bitmap", "inverted", "labellist", and "zonemap"
    ///
    /// Plugins may add additional index types.  Index type lookup is case-insensitive.
    pub index_type: String,
    /// The parameters to train the index
    ///
    /// This should be a JSON string.  The contents of the JSON string will be specific to the
    /// index type.  If not set, then default parameters will be used for the index type.
    pub params: Option<String>,
}
111
112impl Default for ScalarIndexParams {
113    fn default() -> Self {
114        Self {
115            index_type: BuiltinIndexType::BTree.as_str().to_string(),
116            params: None,
117        }
118    }
119}
120
121impl ScalarIndexParams {
122    /// Creates a new ScalarIndexParams from one of the builtin index types
123    pub fn for_builtin(index_type: BuiltinIndexType) -> Self {
124        Self {
125            index_type: index_type.as_str().to_string(),
126            params: None,
127        }
128    }
129
130    /// Create a new ScalarIndexParams with the given index type
131    pub fn new(index_type: String) -> Self {
132        Self {
133            index_type,
134            params: None,
135        }
136    }
137
138    /// Set the parameters for the index
139    pub fn with_params<ParamsType: Serialize>(mut self, params: ParamsType) -> Self {
140        self.params = Some(serde_json::to_string(&params).unwrap());
141        self
142    }
143}
144
145impl IndexParams for ScalarIndexParams {
146    fn as_any(&self) -> &dyn std::any::Any {
147        self
148    }
149
150    fn index_name(&self) -> &str {
151        LANCE_SCALAR_INDEX
152    }
153}
154
155impl IndexParams for InvertedIndexParams {
156    fn as_any(&self) -> &dyn std::any::Any {
157        self
158    }
159
160    fn index_name(&self) -> &str {
161        "INVERTED"
162    }
163}
164
/// Trait for storing an index (or parts of an index) into storage
///
/// A writer is obtained from [`IndexStore::new_index_file`]; batches are appended with
/// [`IndexWriter::write_record_batch`] and the file is closed with one of the `finish` methods.
#[async_trait]
pub trait IndexWriter: Send {
    /// Writes a record batch into the file, returning the 0-based index of the batch in the file
    ///
    /// E.g. if this is the third time this is called this method will return 2
    async fn write_record_batch(&mut self, batch: RecordBatch) -> Result<u64>;
    /// Finishes writing the file and closes the file
    async fn finish(&mut self) -> Result<()>;
    /// Finishes writing the file and closes the file with additional metadata
    ///
    /// `metadata` is a set of string key/value pairs to attach to the file.
    async fn finish_with_metadata(&mut self, metadata: HashMap<String, String>) -> Result<()>;
}
177
/// Trait for reading an index (or parts of an index) from storage
///
/// A reader is obtained from [`IndexStore::open_index_file`].
#[async_trait]
pub trait IndexReader: Send + Sync {
    /// Read the n-th record batch from the file
    ///
    /// NOTE(review): `batch_size` presumably sets how many rows make up one batch when
    /// locating the n-th batch -- confirm against the implementations.
    async fn read_record_batch(&self, n: u64, batch_size: u64) -> Result<RecordBatch>;
    /// Read the range of rows from the file.
    /// If projection is Some, only return the columns in the projection,
    /// nested columns like Some(&["x.y"]) are not supported.
    /// If projection is None, return all columns.
    async fn read_range(
        &self,
        range: std::ops::Range<usize>,
        projection: Option<&[&str]>,
    ) -> Result<RecordBatch>;
    /// Return the number of batches in the file for the given `batch_size`
    async fn num_batches(&self, batch_size: u64) -> u32;
    /// Return the number of rows in the file
    fn num_rows(&self) -> usize;
    /// Return the schema of the file
    fn schema(&self) -> &lance_core::datatypes::Schema;
}
199
/// Trait abstracting I/O away from index logic
///
/// Scalar indices are currently serialized as indexable arrow record batches stored in
/// named "files".  The index store is responsible for serializing and deserializing
/// these batches into file data (e.g. as .lance files or .parquet files, etc.)
#[async_trait]
pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf {
    /// Cast the store as Any to allow for downcasting to the concrete store type
    fn as_any(&self) -> &dyn Any;

    /// Suggested I/O parallelism for the store
    fn io_parallelism(&self) -> usize;

    /// Create a new file and return a writer to store data in the file
    ///
    /// `schema` is the arrow schema of the record batches that will be written.
    async fn new_index_file(&self, name: &str, schema: Arc<Schema>)
        -> Result<Box<dyn IndexWriter>>;

    /// Open an existing file for retrieval
    async fn open_index_file(&self, name: &str) -> Result<Arc<dyn IndexReader>>;

    /// Copy the index file `name` from this store to `dest_store`
    ///
    /// This is often useful when remapping or updating
    async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()>;

    /// Rename the index file `name` to `new_name`
    async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()>;

    /// Delete an index file (used in the tmp spill store to keep tmp size down)
    async fn delete_index_file(&self, name: &str) -> Result<()>;
}
230
/// Different scalar indices may support different kinds of queries
///
/// For example, a btree index can support a wide range of queries (e.g. x > 7)
/// while an index based on FTS only supports queries like "x LIKE 'foo'"
///
/// This trait is used when we need an object that can represent any kind of query
///
/// Note: if you are implementing this trait for a query type then you probably also
/// need to implement the [crate::scalar::expression::ScalarQueryParser] trait to
/// create instances of your query at parse time.
pub trait AnyQuery: std::fmt::Debug + Any + Send + Sync {
    /// Cast the query as Any to allow for downcasting
    fn as_any(&self) -> &dyn Any;
    /// Format the query as a string for display purposes
    ///
    /// `col` is the name of the column the query is applied to.
    fn format(&self, col: &str) -> String;
    /// Convert the query to a datafusion expression
    ///
    /// `col` is the name of the column the query is applied to.
    fn to_expr(&self, col: String) -> Expr;
    /// Compare this query to another query
    ///
    /// The implementations in this file downcast `other` through [`AnyQuery::as_any`] and
    /// report `false` when it is a different query type.
    fn dyn_eq(&self, other: &dyn AnyQuery) -> bool;
}
251
252impl PartialEq for dyn AnyQuery {
253    fn eq(&self, other: &Self) -> bool {
254        self.dyn_eq(other)
255    }
256}
/// A full text search query
///
/// Bundles the query tree with the optional search settings that
/// [`FullTextSearchQuery::params`] turns into `FtsSearchParams`.
#[derive(Debug, Clone, PartialEq)]
pub struct FullTextSearchQuery {
    /// The query to run
    pub query: FtsQuery,

    /// The maximum number of results to return
    ///
    /// If None, return all results
    pub limit: Option<i64>,

    /// The wand factor to use for ranking
    ///
    /// If None, the default value of 1.0 is used.
    /// Increasing this value will reduce the recall and improve the performance;
    /// 1.0 is the value that would give the best performance without recall loss
    pub wand_factor: Option<f32>,
}
271
272impl FullTextSearchQuery {
273    /// Create a new terms query
274    pub fn new(query: String) -> Self {
275        let query = MatchQuery::new(query).into();
276        Self {
277            query,
278            limit: None,
279            wand_factor: None,
280        }
281    }
282
283    /// Create a new fuzzy query
284    pub fn new_fuzzy(term: String, max_distance: Option<u32>) -> Self {
285        let query = MatchQuery::new(term).with_fuzziness(max_distance).into();
286        Self {
287            query,
288            limit: None,
289            wand_factor: None,
290        }
291    }
292
293    /// Create a new compound query
294    pub fn new_query(query: FtsQuery) -> Self {
295        Self {
296            query,
297            limit: None,
298            wand_factor: None,
299        }
300    }
301
302    /// Set the column to search over
303    /// This is available for only MatchQuery and PhraseQuery
304    pub fn with_column(mut self, column: String) -> Result<Self> {
305        self.query = fill_fts_query_column(&self.query, &[column], true)?;
306        Ok(self)
307    }
308
309    /// Set the column to search over
310    /// This is available for only MatchQuery
311    pub fn with_columns(mut self, columns: &[String]) -> Result<Self> {
312        self.query = fill_fts_query_column(&self.query, columns, true)?;
313        Ok(self)
314    }
315
316    /// limit the number of results to return
317    /// if None, return all results
318    pub fn limit(mut self, limit: Option<i64>) -> Self {
319        self.limit = limit;
320        self
321    }
322
323    pub fn wand_factor(mut self, wand_factor: Option<f32>) -> Self {
324        self.wand_factor = wand_factor;
325        self
326    }
327
328    pub fn columns(&self) -> HashSet<String> {
329        self.query.columns()
330    }
331
332    pub fn params(&self) -> FtsSearchParams {
333        let params = FtsSearchParams::new()
334            .with_limit(self.limit.map(|limit| limit as usize))
335            .with_wand_factor(self.wand_factor.unwrap_or(1.0));
336        match self.query {
337            FtsQuery::Phrase(ref query) => params.with_phrase_slop(Some(query.slop)),
338            _ => params,
339        }
340    }
341}
342
/// A query that a basic scalar index (e.g. btree / bitmap) can satisfy
///
/// This is a subset of expression operators that is often referred to as the
/// "sargable" operators
///
/// Note that negation is not included.  Negation should be applied later.  For
/// example, to invert an equality query (e.g. all rows where the value is not 7)
/// you can grab all rows where the value = 7 and then do an inverted take (or use
/// a block list instead of an allow list for prefiltering)
#[derive(Debug, Clone, PartialEq)]
pub enum SargableQuery {
    /// Retrieve all row ids where the value lies between the given lower and upper bounds
    ///
    /// The first bound is the lower bound and the second the upper bound; each may be
    /// inclusive, exclusive, or unbounded.
    Range(Bound<ScalarValue>, Bound<ScalarValue>),
    /// Retrieve all row ids where the value is in the given set of values
    IsIn(Vec<ScalarValue>),
    /// Retrieve all row ids where the value is exactly the given value
    Equals(ScalarValue),
    /// Retrieve all row ids where the value matches the given full text search query
    FullTextSearch(FullTextSearchQuery),
    /// Retrieve all row ids where the value is null
    IsNull(),
}
365
366impl AnyQuery for SargableQuery {
367    fn as_any(&self) -> &dyn Any {
368        self
369    }
370
371    fn format(&self, col: &str) -> String {
372        match self {
373            Self::Range(lower, upper) => match (lower, upper) {
374                (Bound::Unbounded, Bound::Unbounded) => "true".to_string(),
375                (Bound::Unbounded, Bound::Included(rhs)) => format!("{} <= {}", col, rhs),
376                (Bound::Unbounded, Bound::Excluded(rhs)) => format!("{} < {}", col, rhs),
377                (Bound::Included(lhs), Bound::Unbounded) => format!("{} >= {}", col, lhs),
378                (Bound::Included(lhs), Bound::Included(rhs)) => {
379                    format!("{} >= {} && {} <= {}", col, lhs, col, rhs)
380                }
381                (Bound::Included(lhs), Bound::Excluded(rhs)) => {
382                    format!("{} >= {} && {} < {}", col, lhs, col, rhs)
383                }
384                (Bound::Excluded(lhs), Bound::Unbounded) => format!("{} > {}", col, lhs),
385                (Bound::Excluded(lhs), Bound::Included(rhs)) => {
386                    format!("{} > {} && {} <= {}", col, lhs, col, rhs)
387                }
388                (Bound::Excluded(lhs), Bound::Excluded(rhs)) => {
389                    format!("{} > {} && {} < {}", col, lhs, col, rhs)
390                }
391            },
392            Self::IsIn(values) => {
393                format!(
394                    "{} IN [{}]",
395                    col,
396                    values
397                        .iter()
398                        .map(|val| val.to_string())
399                        .collect::<Vec<_>>()
400                        .join(",")
401                )
402            }
403            Self::FullTextSearch(query) => {
404                format!("fts({})", query.query)
405            }
406            Self::IsNull() => {
407                format!("{} IS NULL", col)
408            }
409            Self::Equals(val) => {
410                format!("{} = {}", col, val)
411            }
412        }
413    }
414
415    fn to_expr(&self, col: String) -> Expr {
416        let col_expr = Expr::Column(Column::new_unqualified(col));
417        match self {
418            Self::Range(lower, upper) => match (lower, upper) {
419                (Bound::Unbounded, Bound::Unbounded) => {
420                    Expr::Literal(ScalarValue::Boolean(Some(true)), None)
421                }
422                (Bound::Unbounded, Bound::Included(rhs)) => {
423                    col_expr.lt_eq(Expr::Literal(rhs.clone(), None))
424                }
425                (Bound::Unbounded, Bound::Excluded(rhs)) => {
426                    col_expr.lt(Expr::Literal(rhs.clone(), None))
427                }
428                (Bound::Included(lhs), Bound::Unbounded) => {
429                    col_expr.gt_eq(Expr::Literal(lhs.clone(), None))
430                }
431                (Bound::Included(lhs), Bound::Included(rhs)) => col_expr.between(
432                    Expr::Literal(lhs.clone(), None),
433                    Expr::Literal(rhs.clone(), None),
434                ),
435                (Bound::Included(lhs), Bound::Excluded(rhs)) => col_expr
436                    .clone()
437                    .gt_eq(Expr::Literal(lhs.clone(), None))
438                    .and(col_expr.lt(Expr::Literal(rhs.clone(), None))),
439                (Bound::Excluded(lhs), Bound::Unbounded) => {
440                    col_expr.gt(Expr::Literal(lhs.clone(), None))
441                }
442                (Bound::Excluded(lhs), Bound::Included(rhs)) => col_expr
443                    .clone()
444                    .gt(Expr::Literal(lhs.clone(), None))
445                    .and(col_expr.lt_eq(Expr::Literal(rhs.clone(), None))),
446                (Bound::Excluded(lhs), Bound::Excluded(rhs)) => col_expr
447                    .clone()
448                    .gt(Expr::Literal(lhs.clone(), None))
449                    .and(col_expr.lt(Expr::Literal(rhs.clone(), None))),
450            },
451            Self::IsIn(values) => col_expr.in_list(
452                values
453                    .iter()
454                    .map(|val| Expr::Literal(val.clone(), None))
455                    .collect::<Vec<_>>(),
456                false,
457            ),
458            Self::FullTextSearch(query) => col_expr.like(Expr::Literal(
459                ScalarValue::Utf8(Some(query.query.to_string())),
460                None,
461            )),
462            Self::IsNull() => col_expr.is_null(),
463            Self::Equals(value) => col_expr.eq(Expr::Literal(value.clone(), None)),
464        }
465    }
466
467    fn dyn_eq(&self, other: &dyn AnyQuery) -> bool {
468        match other.as_any().downcast_ref::<Self>() {
469            Some(o) => self == o,
470            None => false,
471        }
472    }
473}
474
/// A query that a LabelListIndex can satisfy
///
/// Converted to the datafusion `array_has_all` / `array_has_any` functions by
/// `AnyQuery::to_expr`.
#[derive(Debug, Clone, PartialEq)]
pub enum LabelListQuery {
    /// Retrieve all row ids where every label is in the list of values for the row
    HasAllLabels(Vec<ScalarValue>),
    /// Retrieve all row ids where at least one of the given labels is in the list of values for the row
    HasAnyLabel(Vec<ScalarValue>),
}
483
484impl AnyQuery for LabelListQuery {
485    fn as_any(&self) -> &dyn Any {
486        self
487    }
488
489    fn format(&self, col: &str) -> String {
490        format!("{}", self.to_expr(col.to_string()))
491    }
492
493    fn to_expr(&self, col: String) -> Expr {
494        match self {
495            Self::HasAllLabels(labels) => {
496                let labels_arr = ScalarValue::iter_to_array(labels.iter().cloned()).unwrap();
497                let offsets_buffer =
498                    OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
499                let labels_list = ListArray::try_new(
500                    Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
501                    offsets_buffer,
502                    labels_arr,
503                    None,
504                )
505                .unwrap();
506                let labels_arr = Arc::new(labels_list);
507                Expr::ScalarFunction(ScalarFunction {
508                    func: Arc::new(array_has::ArrayHasAll::new().into()),
509                    args: vec![
510                        Expr::Column(Column::new_unqualified(col)),
511                        Expr::Literal(ScalarValue::List(labels_arr), None),
512                    ],
513                })
514            }
515            Self::HasAnyLabel(labels) => {
516                let labels_arr = ScalarValue::iter_to_array(labels.iter().cloned()).unwrap();
517                let offsets_buffer =
518                    OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
519                let labels_list = ListArray::try_new(
520                    Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
521                    offsets_buffer,
522                    labels_arr,
523                    None,
524                )
525                .unwrap();
526                let labels_arr = Arc::new(labels_list);
527                Expr::ScalarFunction(ScalarFunction {
528                    func: Arc::new(array_has::ArrayHasAny::new().into()),
529                    args: vec![
530                        Expr::Column(Column::new_unqualified(col)),
531                        Expr::Literal(ScalarValue::List(labels_arr), None),
532                    ],
533                })
534            }
535        }
536    }
537
538    fn dyn_eq(&self, other: &dyn AnyQuery) -> bool {
539        match other.as_any().downcast_ref::<Self>() {
540            Some(o) => self == o,
541            None => false,
542        }
543    }
544}
545
/// A query that a NGramIndex can satisfy
#[derive(Debug, Clone, PartialEq)]
pub enum TextQuery {
    /// Retrieve all row ids where the text contains the given string
    StringContains(String),
    // TODO: In the future we should be able to do case-insensitive contains
    // as well as partial matches (e.g. LIKE 'foo%') and potentially even
    // some regular expressions
}
555
556impl AnyQuery for TextQuery {
557    fn as_any(&self) -> &dyn Any {
558        self
559    }
560
561    fn format(&self, col: &str) -> String {
562        format!("{}", self.to_expr(col.to_string()))
563    }
564
565    fn to_expr(&self, col: String) -> Expr {
566        match self {
567            Self::StringContains(substr) => Expr::ScalarFunction(ScalarFunction {
568                func: Arc::new(ContainsFunc::new().into()),
569                args: vec![
570                    Expr::Column(Column::new_unqualified(col)),
571                    Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None),
572                ],
573            }),
574        }
575    }
576
577    fn dyn_eq(&self, other: &dyn AnyQuery) -> bool {
578        match other.as_any().downcast_ref::<Self>() {
579            Some(o) => self == o,
580            None => false,
581        }
582    }
583}
584
/// A query that an InvertedIndex can satisfy
#[derive(Debug, Clone, PartialEq)]
pub enum TokenQuery {
    /// Retrieve all row ids where the text contains all tokens parsed from the given string.
    /// The tokens are separated by punctuation and white space.
    TokensContains(String),
}
592
593impl AnyQuery for TokenQuery {
594    fn as_any(&self) -> &dyn Any {
595        self
596    }
597
598    fn format(&self, col: &str) -> String {
599        format!("{}", self.to_expr(col.to_string()))
600    }
601
602    fn to_expr(&self, col: String) -> Expr {
603        match self {
604            Self::TokensContains(substr) => Expr::ScalarFunction(ScalarFunction {
605                func: Arc::new(CONTAINS_TOKENS_UDF.clone()),
606                args: vec![
607                    Expr::Column(Column::new_unqualified(col)),
608                    Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None),
609                ],
610            }),
611        }
612    }
613
614    fn dyn_eq(&self, other: &dyn AnyQuery) -> bool {
615        match other.as_any().downcast_ref::<Self>() {
616            Some(o) => self == o,
617            None => false,
618        }
619    }
620}
621
/// The result of a search operation against a scalar index
///
/// Each variant carries a set of row ids; the variant states how that set relates to
/// the true set of matching rows.
#[derive(Debug, PartialEq)]
pub enum SearchResult {
    /// The exact row ids that satisfy the query
    Exact(RowIdTreeMap),
    /// Any row id satisfying the query will be in this set but not every
    /// row id in this set will satisfy the query, a further recheck step
    /// is needed
    AtMost(RowIdTreeMap),
    /// All of the given row ids satisfy the query but there may be more
    ///
    /// No scalar index actually returns this today but it can arise from
    /// boolean operations (e.g. NOT(AtMost(x)) == AtLeast(NOT(x)))
    AtLeast(RowIdTreeMap),
}
637
638impl SearchResult {
639    pub fn row_ids(&self) -> &RowIdTreeMap {
640        match self {
641            Self::Exact(row_ids) => row_ids,
642            Self::AtMost(row_ids) => row_ids,
643            Self::AtLeast(row_ids) => row_ids,
644        }
645    }
646
647    pub fn is_exact(&self) -> bool {
648        matches!(self, Self::Exact(_))
649    }
650}
651
/// Brief information about an index that was created
///
/// Returned by [`ScalarIndex::remap`] and [`ScalarIndex::update`].
pub struct CreatedIndex {
    /// The details of the index that was created, as a protobuf `Any` message
    ///
    /// These should be stored somewhere as they will be needed to
    /// load the index later.
    pub index_details: prost_types::Any,
    /// The version of the index that was created
    ///
    /// This can be used to determine if a reader is able to load the index.
    pub index_version: u32,
}
664
/// The criteria that specifies how to update an index
///
/// Returned by [`ScalarIndex::update_criteria`].
pub struct UpdateCriteria {
    /// If true, then we need to read the old data to update the index
    ///
    /// This should be avoided if possible but is left in for some legacy paths
    pub requires_old_data: bool,
    /// The criteria required for data (both old and new)
    pub data_criteria: TrainingCriteria,
}
674
675impl UpdateCriteria {
676    pub fn requires_old_data(data_criteria: TrainingCriteria) -> Self {
677        Self {
678            requires_old_data: true,
679            data_criteria,
680        }
681    }
682
683    pub fn only_new_data(data_criteria: TrainingCriteria) -> Self {
684        Self {
685            requires_old_data: false,
686            data_criteria,
687        }
688    }
689}
690
/// A trait for a scalar index, a structure that can determine row ids that satisfy scalar queries
#[async_trait]
pub trait ScalarIndex: Send + Sync + std::fmt::Debug + Index + DeepSizeOf {
    /// Search the scalar index
    ///
    /// Returns all row ids that satisfy the query, these row ids are not necessarily ordered
    ///
    /// The [`SearchResult`] variant states whether the returned set is exact or only a
    /// bound on the matching rows.
    async fn search(
        &self,
        query: &dyn AnyQuery,
        metrics: &dyn MetricsCollector,
    ) -> Result<SearchResult>;

    /// Returns true if the remap operation is supported
    fn can_remap(&self) -> bool;

    /// Remap the row ids, creating a new remapped version of this index in `dest_store`
    ///
    /// NOTE(review): `mapping` presumably goes from old row id to new row id, with `None`
    /// for rows that no longer exist -- confirm against the implementations.
    async fn remap(
        &self,
        mapping: &HashMap<u64, Option<u64>>,
        dest_store: &dyn IndexStore,
    ) -> Result<CreatedIndex>;

    /// Add the new data into the index, creating an updated version of the index in `dest_store`
    async fn update(
        &self,
        new_data: SendableRecordBatchStream,
        dest_store: &dyn IndexStore,
    ) -> Result<CreatedIndex>;

    /// Returns the criteria that will be used to update the index
    fn update_criteria(&self) -> UpdateCriteria;
}