Skip to main content

VectorIndex

Trait VectorIndex 

Source
pub trait VectorIndex:
    Send
    + Sync
    + Debug
    + Index {
Show 23 methods // Required methods fn search<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, query: &'life1 Query, pre_filter: Arc<dyn PreFilter>, metrics: &'life2 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<RecordBatch>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait; fn find_partitions( &self, query: &Query, ) -> Result<(UInt32Array, Float32Array)>; fn total_partitions(&self) -> usize; fn search_in_partition<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, partition_id: usize, query: &'life1 Query, pre_filter: Arc<dyn PreFilter>, metrics: &'life2 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<RecordBatch>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait; fn is_loadable(&self) -> bool; fn use_residual(&self) -> bool; fn load<'life0, 'async_trait>( &'life0 self, reader: Arc<dyn Reader>, offset: usize, length: usize, ) -> Pin<Box<dyn Future<Output = Result<Box<dyn VectorIndex>>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait; fn to_batch_stream<'life0, 'async_trait>( &'life0 self, with_vector: bool, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait; fn num_rows(&self) -> u64; fn row_ids(&self) -> Box<dyn Iterator<Item = &u64> + '_>; fn remap<'life0, 'life1, 'async_trait>( &'life0 mut self, mapping: &'life1 HashMap<u64, Option<u64>>, ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait; fn metric_type(&self) -> DistanceType; fn ivf_model(&self) -> &IvfModel; fn quantizer(&self) -> Quantizer; fn partition_size(&self, part_id: usize) -> usize; fn sub_index_type(&self) -> (SubIndexType, QuantizationType); // Provided methods fn prepare_partition_search<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, _partition_id: usize, _query: &'life1 Query, _pre_filter: Arc<dyn PreFilter>, _metrics: &'life2 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<PreparedPartitionSearchHandle>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait { ... } fn search_prepared_partition( &self, _prepared: PreparedPartitionSearchHandle, _metrics: &dyn MetricsCollector, ) -> Result<RecordBatch> { ... } fn supports_prepared_partition_search(&self) -> bool { ... } fn auto_query_parallelism(&self, _cpu_pool_size: usize) -> usize { ... } fn search_partitions<'async_trait>( self: Arc<Self>, query: Query, partitions: Arc<UInt32Array>, q_c_dists: Arc<Float32Array>, start_idx: usize, end_idx: usize, pre_filter: Arc<dyn PreFilter>, control: Option<Arc<dyn PartitionSearchControl>>, metrics: Arc<dyn MetricsCollector>, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>> where Self: 'static + 'async_trait { ... } fn load_partition<'life0, 'async_trait>( &'life0 self, reader: Arc<dyn Reader>, offset: usize, length: usize, _partition_id: usize, ) -> Pin<Box<dyn Future<Output = Result<Box<dyn VectorIndex>>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait { ... } fn partition_reader<'life0, 'life1, 'async_trait>( &'life0 self, _partition_id: usize, _with_vector: bool, _metrics: &'life1 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait { ... }
}
Expand description

Vector Index for (Approximate) Nearest Neighbor (ANN) Search.

Vector indices are often built as a chain of indices. For example, IVF -> PQ or IVF -> HNSW -> SQ.

We use one trait for both the top-level and the sub-indices. Typically the top-level search is a partition-aware search and all sub-indices are whole-index searches.

Required Methods§

Source

fn search<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, query: &'life1 Query, pre_filter: Arc<dyn PreFilter>, metrics: &'life2 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<RecordBatch>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait,

Search entire index for k nearest neighbors.

It returns a RecordBatch with Schema of:

use arrow_schema::{Schema, Field, DataType};

Schema::new(vec![
  Field::new("_rowid", DataType::UInt64, true),
  Field::new("_distance", DataType::Float32, true),
]);

The pre_filter argument is used to filter out row ids that we know are not relevant to the query. For example, it removes deleted rows or rows that do not match a user-provided filter.

Source

fn find_partitions(&self, query: &Query) -> Result<(UInt32Array, Float32Array)>

Find partitions that may contain nearest neighbors.

If maximum_nprobes is set then this method will return the partitions that are most likely to contain the nearest neighbors (e.g. the closest partitions to the query vector).

Return the partition ids and the distances between the query and the centroids, the results should be in sorted order from closest to farthest.

Source

fn total_partitions(&self) -> usize

Get the total number of partitions in the index.

Source

fn search_in_partition<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, partition_id: usize, query: &'life1 Query, pre_filter: Arc<dyn PreFilter>, metrics: &'life2 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<RecordBatch>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait,

Search a single partition for nearest neighbors.

This method should return the same results as VectorIndex::search method except that it will only search a single partition.

Source

fn is_loadable(&self) -> bool

If the index is loadable by IVF, so it can be a sub-index that is loaded on demand by IVF.

Source

fn use_residual(&self) -> bool

Use residual vector to search.

Source

fn load<'life0, 'async_trait>( &'life0 self, reader: Arc<dyn Reader>, offset: usize, length: usize, ) -> Pin<Box<dyn Future<Output = Result<Box<dyn VectorIndex>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Load the index from the reader on-demand.

Source

fn to_batch_stream<'life0, 'async_trait>( &'life0 self, with_vector: bool, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Source

fn num_rows(&self) -> u64

Source

fn row_ids(&self) -> Box<dyn Iterator<Item = &u64> + '_>

Return the IDs of rows in the index.

Source

fn remap<'life0, 'life1, 'async_trait>( &'life0 mut self, mapping: &'life1 HashMap<u64, Option<u64>>, ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

Remap the index according to mapping

Each item in mapping describes an old row id -> new row id pair. If old row id -> None then that row id has been deleted and can be removed from the index.

If an old row id is not in the mapping then it should be left alone.

Source

fn metric_type(&self) -> DistanceType

The metric type of this vector index.

Source

fn ivf_model(&self) -> &IvfModel

Source

fn quantizer(&self) -> Quantizer

Source

fn partition_size(&self, part_id: usize) -> usize

Source

fn sub_index_type(&self) -> (SubIndexType, QuantizationType)

the index type of this vector index.

Provided Methods§

Asynchronously prepare a single-partition search so the CPU-heavy portion can be executed separately.

Source

fn search_prepared_partition( &self, _prepared: PreparedPartitionSearchHandle, _metrics: &dyn MetricsCollector, ) -> Result<RecordBatch>

Execute the synchronous portion of a previously prepared partition search.

Return true if the index supports splitting partition search into async prepare and sync execute phases.

Source

fn auto_query_parallelism(&self, _cpu_pool_size: usize) -> usize

Choose partition search concurrency for query_parallelism = 0.

The default keeps the single-worker sequential path. Index implementations can override this when their sub-index search work does not benefit from the sequential fast path.

Source

fn search_partitions<'async_trait>( self: Arc<Self>, query: Query, partitions: Arc<UInt32Array>, q_c_dists: Arc<Float32Array>, start_idx: usize, end_idx: usize, pre_filter: Arc<dyn PreFilter>, control: Option<Arc<dyn PartitionSearchControl>>, metrics: Arc<dyn MetricsCollector>, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>>
where Self: 'static + 'async_trait,

Search a range of partitions and return a stream of per-partition result batches.

The default implementation searches each partition sequentially with VectorIndex::search_in_partition. Implementations can override this to use a more efficient execution strategy.

Source

fn load_partition<'life0, 'async_trait>( &'life0 self, reader: Arc<dyn Reader>, offset: usize, length: usize, _partition_id: usize, ) -> Pin<Box<dyn Future<Output = Result<Box<dyn VectorIndex>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Load the partition from the reader on-demand.

Source

fn partition_reader<'life0, 'life1, 'async_trait>( &'life0 self, _partition_id: usize, _with_vector: bool, _metrics: &'life1 dyn MetricsCollector, ) -> Pin<Box<dyn Future<Output = Result<SendableRecordBatchStream>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait,

Dyn Compatibility§

This trait is dyn compatible.

In older versions of Rust, dyn compatibility was called "object safety".

Implementors§

Source§

impl<Q: Quantization + Send + Sync + 'static> VectorIndex for HNSWIndex<Q>