Skip to main content

lance_index/vector/
hnsw.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! HNSW graph implementation.
5//!
6//! Hierarchical Navigable Small World (HNSW).
7//!
8
9use arrow_schema::{DataType, Field};
10use deepsize::DeepSizeOf;
11use itertools::Itertools;
12use serde::{Deserialize, Serialize};
13
14use self::builder::HnswBuildParams;
15use super::graph::OrderedNode;
16use super::storage::VectorStore;
17
18pub mod builder;
19pub mod index;
20
21pub use builder::HNSW;
22pub use index::HNSWIndex;
23
24const HNSW_TYPE: &str = "HNSW";
25const VECTOR_ID_COL: &str = "__vector_id";
26const POINTER_COL: &str = "__pointer";
27
28use std::sync::LazyLock;
29
30/// POINTER field.
31///
32pub static POINTER_FIELD: LazyLock<Field> =
33    LazyLock::new(|| Field::new(POINTER_COL, DataType::UInt32, true));
34
35/// Id of the vector in the `VectorStorage`.
36pub static VECTOR_ID_FIELD: LazyLock<Field> =
37    LazyLock::new(|| Field::new(VECTOR_ID_COL, DataType::UInt32, true));
38
39#[derive(Debug, Clone, Serialize, Deserialize, DeepSizeOf)]
40pub struct HnswMetadata {
41    pub entry_point: u32,
42    pub params: HnswBuildParams,
43    pub level_offsets: Vec<usize>,
44}
45
46impl Default for HnswMetadata {
47    fn default() -> Self {
48        let params = HnswBuildParams::default();
49        let level_offsets = vec![0; params.max_level as usize];
50        Self {
51            entry_point: 0,
52            params,
53            level_offsets,
54        }
55    }
56}
57
58/// Algorithm 4 in the HNSW paper.
59///
60/// # NOTE
61/// The results are not ordered.
62fn select_neighbors_heuristic(
63    storage: &impl VectorStore,
64    candidates: &[OrderedNode],
65    k: usize,
66) -> Vec<OrderedNode> {
67    if candidates.len() <= k {
68        return candidates.iter().cloned().collect_vec();
69    }
70    let mut candidates = candidates.to_vec();
71    candidates.sort_unstable();
72
73    let mut results: Vec<OrderedNode> = Vec::with_capacity(k);
74    for u in candidates.iter() {
75        if results.len() >= k {
76            break;
77        }
78
79        if results.is_empty() || storage.prefers_candidate(u, &results) {
80            results.push(u.clone());
81        }
82    }
83    results
84}