use std::collections::BTreeSet;
use std::mem::size_of;
use roaring::RoaringBitmap;
use rustc_hash::FxHashMap;
#[path = "vector_index/build.rs"]
mod build;
#[path = "vector_index/config.rs"]
mod config;
#[path = "vector_index/hnsw.rs"]
mod hnsw;
#[path = "vector_index/ivf.rs"]
mod ivf;
#[path = "vector_index/ivf_adapter.rs"]
mod ivf_adapter;
#[path = "vector_index/memory.rs"]
mod memory;
#[path = "vector_index/rebuild.rs"]
mod rebuild;
#[path = "vector_index/search_hit.rs"]
mod search_hit;
#[path = "vector_index/turbo_quant.rs"]
mod turbo_quant;
#[path = "vector_index/turbo_quant_adapter.rs"]
mod turbo_quant_adapter;
use selene_core::{
DbString, HnswIndexConfig, IvfIndexConfig, LabelSet, PropertyMap, Value, VectorMetric,
VectorValue,
};
use serde::{Deserialize, Serialize};
use crate::error::{GraphError, GraphResult};
use crate::graph::VectorIndexEntry;
pub(crate) use build::{
build_vector_index_lenient_with_configs, build_vector_index_with_configs,
maintain_vector_indexes_strict, rebuild_vector_indexes, rebuild_vector_indexes_strict,
};
pub(crate) use config::MAX_IVF_TARGET_CENTROIDS;
use config::{hnsw_config_for_kind, ivf_config_for_kind};
pub(crate) use hnsw::HnswSearchScratch;
use hnsw::HnswVectorIndex;
use ivf::IvfVectorIndex;
pub use memory::{
IVF_REBUILD_MIN_PENDING_RETRAIN_ENTRIES, IVF_REBUILD_PENDING_RETRAIN_BASIS_POINTS,
VectorIndexMemoryUsage,
};
pub use rebuild::{
VectorIndexMaintenancePolicy, VectorIndexRebuildEntry, VectorIndexRebuildReport,
};
pub(crate) use search_hit::VectorIndexSearchHit;
use search_hit::{hnsw_hits, ivf_hits};
use turbo_quant::TurboQuantVectorIndex;
type VectorIndexMap = FxHashMap<(DbString, DbString), VectorIndexEntry>;
#[derive(
Clone,
Copy,
Debug,
Deserialize,
Eq,
Hash,
PartialEq,
rkyv::Archive,
rkyv::Deserialize,
rkyv::Serialize,
Serialize,
)]
pub enum VectorIndexKind {
Flat,
HnswSquaredEuclidean,
HnswCosine,
HnswNegativeInnerProduct,
IvfSquaredEuclidean,
IvfCosine,
IvfNegativeInnerProduct,
TurboQuantCosine,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct VectorIndexConfig {
pub hnsw: Option<HnswIndexConfig>,
pub ivf: Option<IvfIndexConfig>,
}
impl VectorIndexConfig {
#[must_use]
pub const fn new(hnsw: Option<HnswIndexConfig>, ivf: Option<IvfIndexConfig>) -> Self {
Self { hnsw, ivf }
}
#[must_use]
pub const fn hnsw(config: HnswIndexConfig) -> Self {
Self {
hnsw: Some(config),
ivf: None,
}
}
#[must_use]
pub const fn ivf(config: IvfIndexConfig) -> Self {
Self {
hnsw: None,
ivf: Some(config),
}
}
}
impl VectorIndexKind {
#[must_use]
pub const fn hnsw_metric(self) -> Option<VectorMetric> {
match self {
Self::Flat => None,
Self::HnswSquaredEuclidean => Some(VectorMetric::SquaredEuclidean),
Self::HnswCosine => Some(VectorMetric::Cosine),
Self::HnswNegativeInnerProduct => Some(VectorMetric::NegativeInnerProduct),
Self::IvfSquaredEuclidean
| Self::IvfCosine
| Self::IvfNegativeInnerProduct
| Self::TurboQuantCosine => None,
}
}
#[must_use]
pub const fn ivf_metric(self) -> Option<VectorMetric> {
match self {
Self::Flat
| Self::HnswSquaredEuclidean
| Self::HnswCosine
| Self::HnswNegativeInnerProduct
| Self::TurboQuantCosine => None,
Self::IvfSquaredEuclidean => Some(VectorMetric::SquaredEuclidean),
Self::IvfCosine => Some(VectorMetric::Cosine),
Self::IvfNegativeInnerProduct => Some(VectorMetric::NegativeInnerProduct),
}
}
#[must_use]
pub const fn ann_metric(self) -> Option<VectorMetric> {
match self {
Self::Flat => None,
Self::HnswSquaredEuclidean | Self::IvfSquaredEuclidean => {
Some(VectorMetric::SquaredEuclidean)
}
Self::HnswCosine | Self::IvfCosine => Some(VectorMetric::Cosine),
Self::HnswNegativeInnerProduct | Self::IvfNegativeInnerProduct => {
Some(VectorMetric::NegativeInnerProduct)
}
Self::TurboQuantCosine => Some(VectorMetric::Cosine),
}
}
}
#[derive(Clone, Debug)]
pub struct VectorIndex {
kind: VectorIndexKind,
dimension: u32,
hnsw_config: Option<HnswIndexConfig>,
ivf_config: Option<IvfIndexConfig>,
rows: RoaringBitmap,
hnsw: Option<HnswVectorIndex>,
ivf: Option<IvfVectorIndex>,
turbo_quant: Option<TurboQuantVectorIndex>,
}
impl VectorIndex {
pub fn new(kind: VectorIndexKind, dimension: u32) -> GraphResult<Self> {
Self::new_with_configs(kind, dimension, None, None)
}
pub fn new_with_hnsw_config(
kind: VectorIndexKind,
dimension: u32,
hnsw_config: Option<HnswIndexConfig>,
) -> GraphResult<Self> {
Self::new_with_configs(kind, dimension, hnsw_config, None)
}
pub fn new_with_configs(
kind: VectorIndexKind,
dimension: u32,
hnsw_config: Option<HnswIndexConfig>,
ivf_config: Option<IvfIndexConfig>,
) -> GraphResult<Self> {
ensure_dimension(dimension)?;
let hnsw_config = hnsw_config_for_kind(kind, hnsw_config)?;
let ivf_config = ivf_config_for_kind(kind, ivf_config)?;
let hnsw = kind.hnsw_metric().map(|metric| {
HnswVectorIndex::with_config(metric, hnsw_config.expect("HNSW kind stores config"))
});
let ivf = kind
.ivf_metric()
.map(|metric| IvfVectorIndex::with_config(metric, ivf_config));
let turbo_quant = match kind {
VectorIndexKind::TurboQuantCosine => Some(TurboQuantVectorIndex::new(dimension)?),
_ => None,
};
Ok(Self {
kind,
dimension,
hnsw_config,
ivf_config,
rows: RoaringBitmap::new(),
hnsw,
ivf,
turbo_quant,
})
}
#[must_use]
pub const fn kind(&self) -> VectorIndexKind {
self.kind
}
#[must_use]
pub const fn dimension(&self) -> u32 {
self.dimension
}
#[must_use]
pub const fn hnsw_config(&self) -> Option<HnswIndexConfig> {
self.hnsw_config
}
#[must_use]
pub const fn ivf_config(&self) -> Option<IvfIndexConfig> {
self.ivf_config
}
#[must_use]
pub fn cardinality(&self) -> u64 {
self.rows.len()
}
#[must_use]
pub const fn rows(&self) -> &RoaringBitmap {
&self.rows
}
#[must_use]
pub const fn is_hnsw(&self) -> bool {
self.kind.hnsw_metric().is_some()
}
#[must_use]
pub const fn is_ivf(&self) -> bool {
self.kind.ivf_metric().is_some()
}
#[must_use]
pub const fn is_turbo_quant(&self) -> bool {
matches!(self.kind, VectorIndexKind::TurboQuantCosine)
}
#[must_use]
pub const fn hnsw_metric(&self) -> Option<VectorMetric> {
self.kind.hnsw_metric()
}
#[must_use]
pub const fn ann_metric(&self) -> Option<VectorMetric> {
self.kind.ann_metric()
}
#[must_use]
pub fn memory_usage(&self) -> VectorIndexMemoryUsage {
let row_bitmap_bytes = roaring_heap_bytes(&self.rows);
let row_bitmap_serialized_bytes = self.rows.serialized_size();
let hnsw = self
.hnsw
.as_ref()
.map(HnswVectorIndex::memory_usage)
.unwrap_or_default();
let ivf = self
.ivf
.as_ref()
.map(IvfVectorIndex::memory_usage)
.unwrap_or_default();
let turbo_quant = self
.turbo_quant
.as_ref()
.map(TurboQuantVectorIndex::memory_usage)
.unwrap_or_default();
let estimated_index_bytes = size_of::<Self>()
.saturating_add(row_bitmap_bytes)
.saturating_add(hnsw.estimated_heap_bytes)
.saturating_add(ivf.estimated_heap_bytes)
.saturating_add(turbo_quant.estimated_heap_bytes);
VectorIndexMemoryUsage {
indexed_rows: self.cardinality(),
row_bitmap_bytes,
row_bitmap_serialized_bytes,
hnsw_index_bytes: hnsw.estimated_heap_bytes,
hnsw_referenced_vector_bytes: hnsw.referenced_vector_bytes,
hnsw_entries: hnsw.entries,
hnsw_live_entries: hnsw.live_entries,
hnsw_deleted_entries: hnsw.deleted_entries,
hnsw_link_count: hnsw.link_count,
hnsw_level_zero_link_count: hnsw.level_zero_link_count,
hnsw_upper_layer_link_count: hnsw.upper_layer_link_count,
hnsw_max_layer_count: hnsw.max_layer_count,
hnsw_max_links_per_layer: hnsw.max_links_per_layer,
hnsw_average_links_per_entry_basis_points: hnsw.average_links_per_entry_basis_points,
ivf_index_bytes: ivf.estimated_heap_bytes,
ivf_referenced_vector_bytes: ivf.referenced_vector_bytes,
ivf_entries: ivf.entries,
ivf_live_entries: ivf.live_entries,
ivf_deleted_entries: ivf.deleted_entries,
ivf_centroids: ivf.centroids,
ivf_list_count: ivf.list_count,
ivf_non_empty_list_count: ivf.non_empty_list_count,
ivf_max_list_len: ivf.max_list_len,
ivf_average_list_len_basis_points: ivf.average_list_len_basis_points,
ivf_assigned_entries: ivf.assigned_entries,
ivf_pending_retrain_entries: ivf.pending_retrain_entries,
turbo_quant_index_bytes: turbo_quant.estimated_heap_bytes,
turbo_quant_referenced_vector_bytes: turbo_quant.referenced_vector_bytes,
turbo_quant_entries: turbo_quant.entries,
turbo_quant_live_entries: turbo_quant.live_entries,
turbo_quant_deleted_entries: turbo_quant.deleted_entries,
turbo_quant_code_bytes: turbo_quant.code_bytes,
turbo_quant_codebook_bytes: turbo_quant.codebook_bytes,
turbo_quant_calibration_bytes: turbo_quant.calibration_bytes,
estimated_index_bytes,
estimated_reachable_bytes: estimated_index_bytes
.saturating_add(hnsw.referenced_vector_bytes)
.saturating_add(ivf.referenced_vector_bytes)
.saturating_add(turbo_quant.referenced_vector_bytes),
}
}
pub(crate) fn insert_value(&mut self, row: u32, vector: &VectorValue) -> GraphResult<()> {
let mut scratch = HnswSearchScratch::default();
self.insert_value_with_scratch(row, vector, &mut scratch)
}
pub(crate) fn insert_value_with_scratch(
&mut self,
row: u32,
vector: &VectorValue,
scratch: &mut HnswSearchScratch,
) -> GraphResult<()> {
self.rows.insert(row);
if let Some(hnsw) = &mut self.hnsw {
hnsw.insert_with_scratch(row, vector.clone(), scratch)?;
}
if let Some(ivf) = &mut self.ivf {
ivf.insert(row, vector.clone())?;
}
if let Some(turbo_quant) = &mut self.turbo_quant {
turbo_quant.insert(row, vector)?;
}
Ok(())
}
pub(crate) fn finish_bulk_load(&mut self) -> GraphResult<()> {
if let Some(hnsw) = &mut self.hnsw {
hnsw.finish_bulk_load();
}
if let Some(ivf) = &mut self.ivf {
ivf.finish_bulk_load()?;
}
if let Some(turbo_quant) = &mut self.turbo_quant {
turbo_quant.finish_bulk_load()?;
}
Ok(())
}
pub(crate) fn remove_row(&mut self, row: u32) {
self.rows.remove(row);
if let Some(hnsw) = &mut self.hnsw {
hnsw.remove(row);
}
if let Some(ivf) = &mut self.ivf {
ivf.remove(row);
}
if let Some(turbo_quant) = &mut self.turbo_quant {
turbo_quant.remove(row);
}
}
pub(crate) fn rows_eq(&self, reference: &Self) -> bool {
self.kind == reference.kind
&& self.dimension == reference.dimension
&& self.hnsw_config == reference.hnsw_config
&& self.ivf_config == reference.ivf_config
&& self.rows == reference.rows
}
pub(crate) fn ann_search(
&self,
query: &VectorValue,
k: usize,
search_width: usize,
) -> Option<selene_core::CoreResult<Vec<VectorIndexSearchHit>>> {
if let Some(hnsw) = &self.hnsw {
return Some(hnsw.search(query, k, search_width).map(hnsw_hits));
}
if let Some(ivf) = &self.ivf {
return Some(ivf.search(query, k, search_width).map(ivf_hits));
}
None
}
pub(crate) fn ann_search_with_scratch(
&self,
query: &VectorValue,
k: usize,
search_width: usize,
scratch: &mut HnswSearchScratch,
) -> Option<selene_core::CoreResult<Vec<VectorIndexSearchHit>>> {
if let Some(hnsw) = &self.hnsw {
return Some(
hnsw.search_with_scratch(query, k, search_width, scratch)
.map(hnsw_hits),
);
}
if let Some(ivf) = &self.ivf {
return Some(ivf.search(query, k, search_width).map(ivf_hits));
}
None
}
}
fn roaring_heap_bytes(rows: &RoaringBitmap) -> usize {
let statistics = rows.statistics();
u64_to_usize_saturating(
statistics
.n_bytes_array_containers
.saturating_add(statistics.n_bytes_run_containers)
.saturating_add(statistics.n_bytes_bitset_containers),
)
}
#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
pub enum VectorIndexValueError {
#[error("kind mismatch: observed {observed}")]
KindMismatch {
observed: &'static str,
},
#[error("dimension mismatch: expected {expected}, observed {observed}")]
DimensionMismatch {
expected: u32,
observed: usize,
},
#[error("metric rejection: {observed}")]
MetricRejected {
observed: String,
},
}
impl VectorIndexValueError {
fn observed(&self) -> String {
match self {
Self::KindMismatch { observed } => (*observed).to_owned(),
Self::DimensionMismatch { observed, .. } => format!("VECTOR<{observed}>"),
Self::MetricRejected { observed } => observed.clone(),
}
}
}
pub(crate) fn apply_node_create(
indexes: &mut VectorIndexMap,
labels: &LabelSet,
props: &PropertyMap,
row: u32,
) -> GraphResult<()> {
for label in labels.iter() {
for (property, value) in props.iter() {
if is_null(value) {
continue;
}
insert_commit(indexes, label.clone(), property.clone(), value, row)?;
}
}
Ok(())
}
pub(crate) fn apply_node_delete(
indexes: &mut VectorIndexMap,
labels: &LabelSet,
props: &PropertyMap,
row: u32,
) -> GraphResult<()> {
for label in labels.iter() {
for (property, value) in props.iter() {
if is_null(value) {
continue;
}
remove_commit(indexes, label.clone(), property.clone(), value, row)?;
}
}
Ok(())
}
pub(crate) fn apply_node_update(
indexes: &mut VectorIndexMap,
old_labels: &LabelSet,
old_props: &PropertyMap,
new_labels: &LabelSet,
new_props: &PropertyMap,
row: u32,
) -> GraphResult<()> {
let candidates = candidate_keys(indexes, old_labels, old_props, new_labels, new_props);
for (label, property) in candidates {
match (
indexable_value(old_labels, old_props, &label, &property),
indexable_value(new_labels, new_props, &label, &property),
) {
(Some(old_value), Some(new_value)) => {
replace_commit(
indexes,
label.clone(),
property.clone(),
old_value,
new_value,
row,
)?;
}
(Some(value), None) => {
remove_commit(indexes, label.clone(), property.clone(), value, row)?;
}
(None, Some(value)) => {
insert_commit(indexes, label.clone(), property.clone(), value, row)?;
}
(None, None) => {}
}
}
Ok(())
}
fn candidate_keys(
indexes: &VectorIndexMap,
old_labels: &LabelSet,
old_props: &PropertyMap,
new_labels: &LabelSet,
new_props: &PropertyMap,
) -> BTreeSet<(DbString, DbString)> {
if indexes.is_empty() {
return BTreeSet::new();
}
let mut labels: BTreeSet<DbString> = BTreeSet::new();
labels.extend(old_labels.iter().cloned());
labels.extend(new_labels.iter().cloned());
let mut properties: BTreeSet<DbString> = BTreeSet::new();
properties.extend(old_props.keys().cloned());
properties.extend(new_props.keys().cloned());
let mut candidates = BTreeSet::new();
for label in &labels {
for property in &properties {
let key = (label.clone(), property.clone());
if indexes.contains_key(&key) {
candidates.insert(key);
}
}
}
candidates
}
fn indexable_value<'a>(
labels: &LabelSet,
props: &'a PropertyMap,
label: &DbString,
property: &DbString,
) -> Option<&'a Value> {
if !labels.contains(label) {
return None;
}
props.get(property).filter(|value| !is_null(value))
}
fn insert_commit(
indexes: &mut VectorIndexMap,
label: DbString,
property: DbString,
value: &Value,
row: u32,
) -> GraphResult<()> {
if let Some(entry) = indexes.get_mut(&(label.clone(), property.clone())) {
let vector = admit(value, entry.kind(), entry.dimension())
.map_err(|err| index_rejection(label, property, entry.dimension(), err))?;
std::sync::Arc::make_mut(&mut entry.index).insert_value(row, vector)?;
}
Ok(())
}
fn remove_commit(
indexes: &mut VectorIndexMap,
label: DbString,
property: DbString,
value: &Value,
row: u32,
) -> GraphResult<()> {
if let Some(entry) = indexes.get_mut(&(label.clone(), property.clone())) {
admit(value, entry.kind(), entry.dimension())
.map_err(|err| index_rejection(label, property, entry.dimension(), err))?;
std::sync::Arc::make_mut(&mut entry.index).remove_row(row);
}
Ok(())
}
fn replace_commit(
indexes: &mut VectorIndexMap,
label: DbString,
property: DbString,
old_value: &Value,
new_value: &Value,
row: u32,
) -> GraphResult<()> {
if let Some(entry) = indexes.get_mut(&(label.clone(), property.clone())) {
admit(old_value, entry.kind(), entry.dimension()).map_err(|err| {
index_rejection(label.clone(), property.clone(), entry.dimension(), err)
})?;
let vector = admit(new_value, entry.kind(), entry.dimension())
.map_err(|err| index_rejection(label, property, entry.dimension(), err))?;
std::sync::Arc::make_mut(&mut entry.index).insert_value(row, vector)?;
}
Ok(())
}
fn admit(
value: &Value,
kind: VectorIndexKind,
expected_dimension: u32,
) -> Result<&VectorValue, VectorIndexValueError> {
let Value::Vector(vector) = value else {
return Err(VectorIndexValueError::KindMismatch {
observed: value_kind_name(value),
});
};
if vector.dimension() != expected_dimension as usize {
return Err(VectorIndexValueError::DimensionMismatch {
expected: expected_dimension,
observed: vector.dimension(),
});
}
if let Some(metric) = kind.ann_metric() {
metric
.distance(vector, vector)
.map_err(|err| VectorIndexValueError::MetricRejected {
observed: err.to_string(),
})?;
}
Ok(vector)
}
fn ensure_dimension(dimension: u32) -> GraphResult<()> {
if dimension == 0 || dimension as usize > selene_core::MAX_VECTOR_DIMENSION {
Err(GraphError::VectorIndexInvalidDimension { dimension })
} else {
Ok(())
}
}
fn u64_to_usize_saturating(value: u64) -> usize {
usize::try_from(value).unwrap_or(usize::MAX)
}
fn index_rejection(
label: DbString,
property: DbString,
expected_dimension: u32,
err: VectorIndexValueError,
) -> GraphError {
GraphError::VectorIndexValueRejected {
label,
property,
expected_dimension,
observed: err.observed(),
}
}
fn warn_rejected(
op: &'static str,
label: DbString,
property: DbString,
row: u32,
err: &VectorIndexValueError,
) {
tracing::warn!(
op,
%label,
%property,
row,
error = %err,
"skipped vector-index update for value that does not match the registered vector index"
);
}
const fn is_null(value: &Value) -> bool {
matches!(value, Value::Null)
}
const fn value_kind_name(value: &Value) -> &'static str {
match value {
Value::Null => "Null",
Value::Bool(_) => "Bool",
Value::Int(_) => "Int",
Value::Uint(_) => "Uint",
Value::Int128(_) => "Int128",
Value::Uint128(_) => "Uint128",
Value::Float(_) => "Float",
Value::Float32(_) => "Float32",
Value::Decimal(_) => "Decimal",
Value::String(_) => "String",
Value::Bytes(_) => "Bytes",
Value::List(_) => "List",
Value::Record(_) => "Record",
Value::RecordTyped(_) => "RecordTyped",
Value::Path(_) => "Path",
Value::NodeRef(_) => "NodeRef",
Value::EdgeRef(_) => "EdgeRef",
Value::GraphRef(_) => "GraphRef",
Value::TableRef(_) => "TableRef",
Value::ZonedDateTime(_) => "ZonedDateTime",
Value::LocalDateTime(_) => "LocalDateTime",
Value::Date(_) => "Date",
Value::ZonedTime(_) => "ZonedTime",
Value::LocalTime(_) => "LocalTime",
Value::Duration(_) => "Duration",
Value::Extended { .. } => "Extended",
Value::Uuid(_) => "Uuid",
Value::Vector(_) => "Vector",
Value::Json(_) => "Json",
_ => "Unknown",
}
}
#[cfg(test)]
#[path = "vector_index/config_tests.rs"]
mod config_tests;
#[cfg(test)]
#[path = "vector_index/tests.rs"]
mod tests;