Skip to main content

claw_vector/index/
selector.rs

// index/selector.rs — auto-selecting index that migrates from FlatIndex to HnswIndex
// once the collection grows past HNSW_THRESHOLD (1,000 vectors).
3use std::path::Path;
4
5use tracing::instrument;
6
7use crate::{
8    config::VectorConfig,
9    error::{VectorError, VectorResult},
10    index::{flat::FlatIndex, hnsw::HnswIndex},
11    types::DistanceMetric,
12};
13
/// Number of live vectors a flat index may hold before the selector migrates to HNSW.
///
/// The comparison is strict: migration is triggered once the flat index holds
/// *more than* this many elements, so a collection of exactly `HNSW_THRESHOLD`
/// vectors is still served by the brute-force index.
pub const HNSW_THRESHOLD: usize = 1_000;
16
17/// Transparently routes between a [`FlatIndex`] and a [`HnswIndex`].
18pub enum IndexSelector {
19    /// Brute-force index for small collections.
20    Flat(FlatIndex),
21    /// Approximate NN index for larger collections.
22    Hnsw(Box<HnswIndex>),
23}
24
25impl IndexSelector {
26    /// Create a new selector (always starts as Flat).
27    pub fn new(dimensions: usize, distance: DistanceMetric, _config: &VectorConfig) -> Self {
28        IndexSelector::Flat(FlatIndex::new(dimensions, distance))
29    }
30
31    /// Insert a single vector, migrating to HNSW if the threshold is crossed.
32    #[instrument(skip(self, vector, config))]
33    pub fn insert(
34        &mut self,
35        id: usize,
36        vector: Vec<f32>,
37        config: &VectorConfig,
38    ) -> VectorResult<()> {
39        match self {
40            IndexSelector::Flat(flat) => {
41                flat.insert(id, vector)?;
42                if flat.len() > HNSW_THRESHOLD {
43                    self.migrate_to_hnsw(config)?;
44                }
45            }
46            IndexSelector::Hnsw(hnsw) => hnsw.insert(id, &vector)?,
47        }
48        Ok(())
49    }
50
51    /// Insert a batch of vectors, migrating to HNSW if the threshold is crossed.
52    #[instrument(skip(self, items, config))]
53    pub fn insert_batch(
54        &mut self,
55        items: Vec<(usize, Vec<f32>)>,
56        config: &VectorConfig,
57    ) -> VectorResult<()> {
58        match self {
59            IndexSelector::Flat(flat) => {
60                flat.insert_batch(items)?;
61                if flat.len() > HNSW_THRESHOLD {
62                    self.migrate_to_hnsw(config)?;
63                }
64            }
65            IndexSelector::Hnsw(hnsw) => hnsw.insert_batch(&items)?,
66        }
67        Ok(())
68    }
69
70    /// Search for `top_k` nearest neighbours of `query`.
71    #[instrument(skip(self, query))]
72    pub fn search(
73        &self,
74        query: &[f32],
75        top_k: usize,
76        ef_search: usize,
77    ) -> VectorResult<Vec<(usize, f32)>> {
78        match self {
79            IndexSelector::Flat(flat) => flat.search(query, top_k),
80            IndexSelector::Hnsw(hnsw) => hnsw.search(query, top_k, ef_search),
81        }
82    }
83
84    /// Delete a vector by id. Returns `true` if the id was present.
85    #[instrument(skip(self))]
86    pub fn delete(&mut self, id: usize) -> VectorResult<bool> {
87        match self {
88            IndexSelector::Flat(flat) => flat.delete(id),
89            IndexSelector::Hnsw(hnsw) => {
90                hnsw.delete(id)?;
91                Ok(true)
92            }
93        }
94    }
95
96    /// Return the number of live elements.
97    pub fn len(&self) -> usize {
98        match self {
99            IndexSelector::Flat(f) => f.len(),
100            IndexSelector::Hnsw(h) => h.len(),
101        }
102    }
103
104    /// Return `true` if the selector contains no live elements.
105    pub fn is_empty(&self) -> bool {
106        self.len() == 0
107    }
108
109    /// Return `true` if the selector is backed by HNSW.
110    pub fn is_hnsw(&self) -> bool {
111        matches!(self, IndexSelector::Hnsw(_))
112    }
113
114    /// Migrate from FlatIndex to HnswIndex, replacing `self`.
115    #[instrument(skip(self, config))]
116    pub fn migrate_to_hnsw(&mut self, config: &VectorConfig) -> VectorResult<()> {
117        let hnsw = match self {
118            IndexSelector::Flat(flat) => {
119                tracing::info!(elements = flat.len(), "migrating flat index to HNSW");
120                flat.to_hnsw(config)?
121            }
122            IndexSelector::Hnsw(_) => return Ok(()),
123        };
124        *self = IndexSelector::Hnsw(Box::new(hnsw));
125        Ok(())
126    }
127
128    /// Persist the index under `<dir>/<collection>/`.
129    #[instrument(skip(self))]
130    pub fn save(&self, dir: &Path, workspace_id: &str, collection: &str) -> VectorResult<()> {
131        let col_dir = dir.join(workspace_id).join(collection);
132        std::fs::create_dir_all(&col_dir)?;
133        let kind = if self.is_hnsw() { "hnsw" } else { "flat" };
134        std::fs::write(
135            col_dir.join("index.meta.json"),
136            serde_json::to_string(&serde_json::json!({ "index_type": kind }))?,
137        )?;
138        match self {
139            IndexSelector::Flat(flat) => {
140                std::fs::write(
141                    col_dir.join("flat.json"),
142                    serde_json::to_string(&flat.all_vectors()?)?,
143                )?;
144            }
145            IndexSelector::Hnsw(hnsw) => hnsw.save(&col_dir, collection)?,
146        }
147        Ok(())
148    }
149
150    /// Reload a previously saved index from `<dir>/<collection>/`.
151    #[instrument(skip(config))]
152    pub fn load(
153        dir: &Path,
154        workspace_id: &str,
155        collection: &str,
156        config: &VectorConfig,
157        distance: DistanceMetric,
158        dimensions: usize,
159    ) -> VectorResult<Self> {
160        let col_dir = dir.join(workspace_id).join(collection);
161        let meta: serde_json::Value =
162            serde_json::from_reader(std::fs::File::open(col_dir.join("index.meta.json"))?)?;
163        match meta["index_type"]
164            .as_str()
165            .ok_or_else(|| VectorError::Index("missing index_type".into()))?
166        {
167            "flat" => {
168                let vecs: Vec<(usize, Vec<f32>)> =
169                    serde_json::from_str(&std::fs::read_to_string(col_dir.join("flat.json"))?)?;
170                let flat = FlatIndex::new(dimensions, distance);
171                flat.insert_batch(vecs)?;
172                Ok(IndexSelector::Flat(flat))
173            }
174            "hnsw" => Ok(IndexSelector::Hnsw(Box::new(HnswIndex::load(
175                &col_dir, collection, config, distance,
176            )?))),
177            other => Err(VectorError::Index(format!("unknown index_type '{other}'"))),
178        }
179    }
180}