Skip to main content

vector/serde/
centroid_chunk.rs

1//! CentroidChunk value encoding/decoding.
2//!
3//! Stores a chunk of cluster centroids for SPANN-style approximate nearest neighbor search.
4//!
5//! ## Background: SPANN Index Architecture
6//!
7//! SPANN (Scalable Partition-based Approximate Nearest Neighbor) is a disk-based indexing
8//! strategy that enables billion-scale vector search with bounded memory. The key insight
9//! is to keep only cluster centroids in memory while storing the actual vectors on disk
10//! in posting lists.
11//!
12//! The index works by clustering vectors into groups, where each group has a representative
13//! **centroid** (typically the cluster's mean vector). During search:
14//!
15//! 1. Find the k nearest centroids to the query vector using an in-memory index (HNSW)
16//! 2. Load the posting lists for those centroids from disk
17//! 3. Compute exact distances for vectors in those posting lists
18//! 4. Return the top results
19//!
20//! This approach trades some recall for massive memory savings—instead of keeping all
21//! vectors in memory, only 0.1-1% (the centroids) need to be loaded.
22//!
23//! ## Centroid Chunking
24//!
25//! Centroids are split across multiple `CentroidChunk` records rather than stored
26//! individually. This chunking strategy:
27//!
28//! - **Reduces key count**: Storing one centroid per key would explode the number of
29//!   keys fetched on startup, increasing latency.
30//! - **Enables block-level caching**: SlateDB can cache entire chunks efficiently.
31//! - **Supports partial reads**: Only touched chunks need to be read from disk.
32//!
33//! All centroid chunks are fully scanned on startup (`scan(version|0x02|*)`) to load
34//! the in-memory HNSW graph. The `chunk_id` exists solely to give each chunk a distinct
35//! record key.
36//!
37//! ## Centroid IDs
38//!
39//! Each centroid carries an explicit `centroid_id` (u64) stored alongside its vector.
40//! These IDs:
41//!
42//! - Start at 1 (0 is reserved for the deleted vectors bitmap in PostingList)
43//! - Are never reassigned once issued
44//! - Remain stable when chunks are rewritten or reordered
45//! - Are referenced by PostingList keys to map centroids to their assigned vectors
46//!
47//! ## Sizing Guidelines
48//!
49//! From RFC 0001:
50//! - `CHUNK_TARGET` centroids per chunk (default 4096, ~25 MB per chunk at 1536 dims)
51//! - Centroid ratio typically 0.1-1% of total vectors
52//! - Example: 10M vectors → 10K-100K centroids → 3-25 chunks
53//! - Higher ratios improve recall at the cost of memory
54
55use super::{Encode, EncodingError, decode_fixed_element_array, encode_fixed_element_array};
56use bytes::{Bytes, BytesMut};
57
58/// A single centroid entry with its ID and vector.
59///
60/// Each centroid represents a cluster in the SPANN index. The centroid vector
61/// is typically the mean of all vectors assigned to that cluster. During search,
62/// the query vector is compared against centroids to find relevant clusters,
63/// then the posting lists for those clusters are loaded to find candidate vectors.
64///
65/// See `PostingList` for the mapping from `centroid_id` to vector IDs.
66#[derive(Debug, Clone, PartialEq)]
67pub struct CentroidEntry {
68    /// Stable identifier for this centroid (starts at 1, never reused).
69    ///
70    /// This ID is used as the key suffix in `PostingListKey` to look up the
71    /// vector IDs assigned to this cluster. ID 0 is reserved for the deleted
72    /// vectors bitmap.
73    pub centroid_id: u64,
74    /// Centroid vector with `dimensions` elements (from `CollectionMeta`).
75    ///
76    /// This is the representative vector for the cluster, typically computed
77    /// as the mean of assigned vectors. The dimensionality must match all
78    /// other vectors in the collection.
79    pub vector: Vec<f32>,
80}
81
82impl CentroidEntry {
83    pub fn new(centroid_id: u64, vector: Vec<f32>) -> Self {
84        Self {
85            centroid_id,
86            vector,
87        }
88    }
89
90    /// Returns the dimensionality of the centroid vector.
91    pub fn dimensions(&self) -> usize {
92        self.vector.len()
93    }
94}
95
96impl Encode for CentroidEntry {
97    fn encode(&self, buf: &mut BytesMut) {
98        buf.extend_from_slice(&self.centroid_id.to_le_bytes());
99        encode_fixed_element_array(&self.vector, buf);
100    }
101}
102
103/// Decode a centroid entry with known dimensionality.
104///
105/// Note: This is a standalone function rather than a `Decode` trait impl because
106/// decoding requires the `dimensions` parameter from `CollectionMeta`. The `Decode`
107/// trait signature doesn't allow passing external context.
108pub fn decode_centroid_entry(
109    buf: &mut &[u8],
110    dimensions: usize,
111) -> Result<CentroidEntry, EncodingError> {
112    if buf.len() < 8 {
113        return Err(EncodingError {
114            message: "Buffer too short for centroid_id".to_string(),
115        });
116    }
117
118    let centroid_id = u64::from_le_bytes([
119        buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7],
120    ]);
121    *buf = &buf[8..];
122
123    let vector_size = dimensions * 4;
124    if buf.len() < vector_size {
125        return Err(EncodingError {
126            message: format!(
127                "Buffer too short for centroid vector: need {} bytes, have {}",
128                vector_size,
129                buf.len()
130            ),
131        });
132    }
133
134    let vector_buf = &buf[..vector_size];
135    *buf = &buf[vector_size..];
136
137    let mut vector_slice = vector_buf;
138    let vector: Vec<f32> = decode_fixed_element_array(&mut vector_slice, 4)?;
139
140    Ok(CentroidEntry {
141        centroid_id,
142        vector,
143    })
144}
145
146/// CentroidChunk value storing a chunk of cluster centroids.
147///
148/// Multiple chunks together form the complete centroid index for a collection.
149/// On startup, all chunks are scanned and loaded into an in-memory HNSW graph
150/// for fast nearest-centroid search during queries.
151///
152/// ## Value Layout (little-endian)
153///
154/// ```text
155/// ┌───────────────────────────────────────────────────────────────┐
156/// │  count:   u16 (number of entries, <= CHUNK_TARGET)            │
157/// │  entries: Array<CentroidEntry>                                │
158/// │                                                               │
159/// │  CentroidEntry                                                │
160/// │  ┌──────────────────────────────────────────────────────────┐ │
161/// │  │  centroid_id: u64 (stable identifier)                    │ │
162/// │  │  vector:      FixedElementArray<f32>                     │ │
163/// │  │               (dimensions elements from CollectionMeta)  │ │
164/// │  └──────────────────────────────────────────────────────────┘ │
165/// └───────────────────────────────────────────────────────────────┘
166/// ```
167///
168/// ## Sizing
169///
170/// - `CHUNK_TARGET` (default 4096) centroids per chunk
171/// - At 1536 dimensions: ~25 MB per chunk (4096 × 1536 × 4 bytes + overhead)
172/// - Dimensionality is obtained from `CollectionMeta` at decode time
173#[derive(Debug, Clone, PartialEq)]
174pub struct CentroidChunkValue {
175    /// Centroid entries in this chunk (up to `CHUNK_TARGET` entries).
176    pub entries: Vec<CentroidEntry>,
177}
178
179impl CentroidChunkValue {
180    pub fn new(entries: Vec<CentroidEntry>) -> Self {
181        Self { entries }
182    }
183
184    /// Returns the number of centroids in this chunk.
185    pub fn len(&self) -> usize {
186        self.entries.len()
187    }
188
189    /// Returns true if this chunk is empty.
190    pub fn is_empty(&self) -> bool {
191        self.entries.is_empty()
192    }
193
194    /// Encode the chunk with known dimensionality.
195    pub fn encode_to_bytes(&self, dimensions: usize) -> Bytes {
196        let mut buf = BytesMut::new();
197
198        // Count prefix
199        let count = self.entries.len();
200        if count > u16::MAX as usize {
201            panic!("Too many centroids in chunk: {}", count);
202        }
203        buf.extend_from_slice(&(count as u16).to_le_bytes());
204
205        // Entries
206        for entry in &self.entries {
207            debug_assert_eq!(
208                entry.dimensions(),
209                dimensions,
210                "Centroid dimension mismatch"
211            );
212            entry.encode(&mut buf);
213        }
214
215        buf.freeze()
216    }
217
218    /// Decode the chunk with known dimensionality.
219    pub fn decode_from_bytes(buf: &[u8], dimensions: usize) -> Result<Self, EncodingError> {
220        if buf.len() < 2 {
221            return Err(EncodingError {
222                message: "Buffer too short for centroid chunk count".to_string(),
223            });
224        }
225
226        let count = u16::from_le_bytes([buf[0], buf[1]]) as usize;
227        let mut slice = &buf[2..];
228
229        let mut entries = Vec::with_capacity(count);
230        for _ in 0..count {
231            entries.push(decode_centroid_entry(&mut slice, dimensions)?);
232        }
233
234        Ok(CentroidChunkValue { entries })
235    }
236
237    /// Find a centroid by ID.
238    pub fn get_centroid(&self, centroid_id: u64) -> Option<&CentroidEntry> {
239        self.entries.iter().find(|e| e.centroid_id == centroid_id)
240    }
241}
242
243#[cfg(test)]
244mod tests {
245    use super::*;
246
247    #[test]
248    fn should_encode_and_decode_centroid_chunk() {
249        // given
250        let dimensions = 4;
251        let value = CentroidChunkValue::new(vec![
252            CentroidEntry::new(1, vec![1.0, 2.0, 3.0, 4.0]),
253            CentroidEntry::new(2, vec![5.0, 6.0, 7.0, 8.0]),
254        ]);
255
256        // when
257        let encoded = value.encode_to_bytes(dimensions);
258        let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
259
260        // then
261        assert_eq!(decoded, value);
262    }
263
264    #[test]
265    fn should_handle_empty_chunk() {
266        // given
267        let dimensions = 128;
268        let value = CentroidChunkValue::new(vec![]);
269
270        // when
271        let encoded = value.encode_to_bytes(dimensions);
272        let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
273
274        // then
275        assert!(decoded.is_empty());
276    }
277
278    #[test]
279    fn should_handle_high_dimensional_centroids() {
280        // given
281        let dimensions = 1536;
282        let vector: Vec<f32> = (0..dimensions).map(|i| i as f32).collect();
283        let value = CentroidChunkValue::new(vec![CentroidEntry::new(42, vector.clone())]);
284
285        // when
286        let encoded = value.encode_to_bytes(dimensions);
287        let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
288
289        // then
290        assert_eq!(decoded.entries.len(), 1);
291        assert_eq!(decoded.entries[0].centroid_id, 42);
292        assert_eq!(decoded.entries[0].vector, vector);
293    }
294
295    #[test]
296    fn should_find_centroid_by_id() {
297        // given
298        let value = CentroidChunkValue::new(vec![
299            CentroidEntry::new(10, vec![1.0, 2.0]),
300            CentroidEntry::new(20, vec![3.0, 4.0]),
301            CentroidEntry::new(30, vec![5.0, 6.0]),
302        ]);
303
304        // when / then
305        assert_eq!(value.get_centroid(20).unwrap().centroid_id, 20);
306        assert!(value.get_centroid(99).is_none());
307    }
308
309    #[test]
310    fn should_preserve_centroid_ids() {
311        // given
312        let dimensions = 2;
313        let value = CentroidChunkValue::new(vec![
314            CentroidEntry::new(100, vec![1.0, 2.0]),
315            CentroidEntry::new(200, vec![3.0, 4.0]),
316            CentroidEntry::new(u64::MAX, vec![5.0, 6.0]),
317        ]);
318
319        // when
320        let encoded = value.encode_to_bytes(dimensions);
321        let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
322
323        // then
324        assert_eq!(decoded.entries[0].centroid_id, 100);
325        assert_eq!(decoded.entries[1].centroid_id, 200);
326        assert_eq!(decoded.entries[2].centroid_id, u64::MAX);
327    }
328}