vector/serde/centroid_chunk.rs
1//! CentroidChunk value encoding/decoding.
2//!
3//! Stores a chunk of cluster centroids for SPANN-style approximate nearest neighbor search.
4//!
5//! ## Background: SPANN Index Architecture
6//!
7//! SPANN (Scalable Partition-based Approximate Nearest Neighbor) is a disk-based indexing
8//! strategy that enables billion-scale vector search with bounded memory. The key insight
9//! is to keep only cluster centroids in memory while storing the actual vectors on disk
10//! in posting lists.
11//!
12//! The index works by clustering vectors into groups, where each group has a representative
13//! **centroid** (typically the cluster's mean vector). During search:
14//!
15//! 1. Find the k nearest centroids to the query vector using an in-memory index (HNSW)
16//! 2. Load the posting lists for those centroids from disk
17//! 3. Compute exact distances for vectors in those posting lists
18//! 4. Return the top results
19//!
20//! This approach trades some recall for massive memory savings—instead of keeping all
21//! vectors in memory, only 0.1-1% (the centroids) need to be loaded.
22//!
23//! ## Centroid Chunking
24//!
25//! Centroids are split across multiple `CentroidChunk` records rather than stored
26//! individually. This chunking strategy:
27//!
28//! - **Reduces key count**: Storing one centroid per key would explode the number of
29//! keys fetched on startup, increasing latency.
30//! - **Enables block-level caching**: SlateDB can cache entire chunks efficiently.
31//! - **Supports partial reads**: Only touched chunks need to be read from disk.
32//!
33//! All centroid chunks are fully scanned on startup (`scan(version|0x02|*)`) to load
34//! the in-memory HNSW graph. The `chunk_id` exists solely to give each chunk a distinct
35//! record key.
36//!
37//! ## Centroid IDs
38//!
39//! Each centroid carries an explicit `centroid_id` (u64) stored alongside its vector.
40//! These IDs:
41//!
42//! - Start at 1 (0 is reserved for the deleted vectors bitmap in PostingList)
43//! - Are never reassigned once issued
44//! - Remain stable when chunks are rewritten or reordered
45//! - Are referenced by PostingList keys to map centroids to their assigned vectors
46//!
47//! ## Sizing Guidelines
48//!
49//! From RFC 0001:
50//! - `CHUNK_TARGET` centroids per chunk (default 4096, ~25 MB per chunk at 1536 dims)
51//! - Centroid ratio typically 0.1-1% of total vectors
52//! - Example: 10M vectors → 10K-100K centroids → 3-25 chunks
53//! - Higher ratios improve recall at the cost of memory
54
55use super::{Encode, EncodingError, decode_fixed_element_array, encode_fixed_element_array};
56use bytes::{Bytes, BytesMut};
57
58/// A single centroid entry with its ID and vector.
59///
60/// Each centroid represents a cluster in the SPANN index. The centroid vector
61/// is typically the mean of all vectors assigned to that cluster. During search,
62/// the query vector is compared against centroids to find relevant clusters,
63/// then the posting lists for those clusters are loaded to find candidate vectors.
64///
65/// See `PostingList` for the mapping from `centroid_id` to vector IDs.
66#[derive(Debug, Clone, PartialEq)]
67pub struct CentroidEntry {
68 /// Stable identifier for this centroid (starts at 1, never reused).
69 ///
70 /// This ID is used as the key suffix in `PostingListKey` to look up the
71 /// vector IDs assigned to this cluster. ID 0 is reserved for the deleted
72 /// vectors bitmap.
73 pub centroid_id: u64,
74 /// Centroid vector with `dimensions` elements (from `CollectionMeta`).
75 ///
76 /// This is the representative vector for the cluster, typically computed
77 /// as the mean of assigned vectors. The dimensionality must match all
78 /// other vectors in the collection.
79 pub vector: Vec<f32>,
80}
81
82impl CentroidEntry {
83 pub fn new(centroid_id: u64, vector: Vec<f32>) -> Self {
84 Self {
85 centroid_id,
86 vector,
87 }
88 }
89
90 /// Returns the dimensionality of the centroid vector.
91 pub fn dimensions(&self) -> usize {
92 self.vector.len()
93 }
94}
95
96impl Encode for CentroidEntry {
97 fn encode(&self, buf: &mut BytesMut) {
98 buf.extend_from_slice(&self.centroid_id.to_le_bytes());
99 encode_fixed_element_array(&self.vector, buf);
100 }
101}
102
103/// Decode a centroid entry with known dimensionality.
104///
105/// Note: This is a standalone function rather than a `Decode` trait impl because
106/// decoding requires the `dimensions` parameter from `CollectionMeta`. The `Decode`
107/// trait signature doesn't allow passing external context.
108pub fn decode_centroid_entry(
109 buf: &mut &[u8],
110 dimensions: usize,
111) -> Result<CentroidEntry, EncodingError> {
112 if buf.len() < 8 {
113 return Err(EncodingError {
114 message: "Buffer too short for centroid_id".to_string(),
115 });
116 }
117
118 let centroid_id = u64::from_le_bytes([
119 buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7],
120 ]);
121 *buf = &buf[8..];
122
123 let vector_size = dimensions * 4;
124 if buf.len() < vector_size {
125 return Err(EncodingError {
126 message: format!(
127 "Buffer too short for centroid vector: need {} bytes, have {}",
128 vector_size,
129 buf.len()
130 ),
131 });
132 }
133
134 let vector_buf = &buf[..vector_size];
135 *buf = &buf[vector_size..];
136
137 let mut vector_slice = vector_buf;
138 let vector: Vec<f32> = decode_fixed_element_array(&mut vector_slice, 4)?;
139
140 Ok(CentroidEntry {
141 centroid_id,
142 vector,
143 })
144}
145
146/// CentroidChunk value storing a chunk of cluster centroids.
147///
148/// Multiple chunks together form the complete centroid index for a collection.
149/// On startup, all chunks are scanned and loaded into an in-memory HNSW graph
150/// for fast nearest-centroid search during queries.
151///
152/// ## Value Layout (little-endian)
153///
154/// ```text
155/// ┌───────────────────────────────────────────────────────────────┐
156/// │ count: u16 (number of entries, <= CHUNK_TARGET) │
157/// │ entries: Array<CentroidEntry> │
158/// │ │
159/// │ CentroidEntry │
160/// │ ┌──────────────────────────────────────────────────────────┐ │
161/// │ │ centroid_id: u64 (stable identifier) │ │
162/// │ │ vector: FixedElementArray<f32> │ │
163/// │ │ (dimensions elements from CollectionMeta) │ │
164/// │ └──────────────────────────────────────────────────────────┘ │
165/// └───────────────────────────────────────────────────────────────┘
166/// ```
167///
168/// ## Sizing
169///
170/// - `CHUNK_TARGET` (default 4096) centroids per chunk
171/// - At 1536 dimensions: ~25 MB per chunk (4096 × 1536 × 4 bytes + overhead)
172/// - Dimensionality is obtained from `CollectionMeta` at decode time
173#[derive(Debug, Clone, PartialEq)]
174pub struct CentroidChunkValue {
175 /// Centroid entries in this chunk (up to `CHUNK_TARGET` entries).
176 pub entries: Vec<CentroidEntry>,
177}
178
179impl CentroidChunkValue {
180 pub fn new(entries: Vec<CentroidEntry>) -> Self {
181 Self { entries }
182 }
183
184 /// Returns the number of centroids in this chunk.
185 pub fn len(&self) -> usize {
186 self.entries.len()
187 }
188
189 /// Returns true if this chunk is empty.
190 pub fn is_empty(&self) -> bool {
191 self.entries.is_empty()
192 }
193
194 /// Encode the chunk with known dimensionality.
195 pub fn encode_to_bytes(&self, dimensions: usize) -> Bytes {
196 let mut buf = BytesMut::new();
197
198 // Count prefix
199 let count = self.entries.len();
200 if count > u16::MAX as usize {
201 panic!("Too many centroids in chunk: {}", count);
202 }
203 buf.extend_from_slice(&(count as u16).to_le_bytes());
204
205 // Entries
206 for entry in &self.entries {
207 debug_assert_eq!(
208 entry.dimensions(),
209 dimensions,
210 "Centroid dimension mismatch"
211 );
212 entry.encode(&mut buf);
213 }
214
215 buf.freeze()
216 }
217
218 /// Decode the chunk with known dimensionality.
219 pub fn decode_from_bytes(buf: &[u8], dimensions: usize) -> Result<Self, EncodingError> {
220 if buf.len() < 2 {
221 return Err(EncodingError {
222 message: "Buffer too short for centroid chunk count".to_string(),
223 });
224 }
225
226 let count = u16::from_le_bytes([buf[0], buf[1]]) as usize;
227 let mut slice = &buf[2..];
228
229 let mut entries = Vec::with_capacity(count);
230 for _ in 0..count {
231 entries.push(decode_centroid_entry(&mut slice, dimensions)?);
232 }
233
234 Ok(CentroidChunkValue { entries })
235 }
236
237 /// Find a centroid by ID.
238 pub fn get_centroid(&self, centroid_id: u64) -> Option<&CentroidEntry> {
239 self.entries.iter().find(|e| e.centroid_id == centroid_id)
240 }
241}
242
243#[cfg(test)]
244mod tests {
245 use super::*;
246
247 #[test]
248 fn should_encode_and_decode_centroid_chunk() {
249 // given
250 let dimensions = 4;
251 let value = CentroidChunkValue::new(vec![
252 CentroidEntry::new(1, vec![1.0, 2.0, 3.0, 4.0]),
253 CentroidEntry::new(2, vec![5.0, 6.0, 7.0, 8.0]),
254 ]);
255
256 // when
257 let encoded = value.encode_to_bytes(dimensions);
258 let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
259
260 // then
261 assert_eq!(decoded, value);
262 }
263
264 #[test]
265 fn should_handle_empty_chunk() {
266 // given
267 let dimensions = 128;
268 let value = CentroidChunkValue::new(vec![]);
269
270 // when
271 let encoded = value.encode_to_bytes(dimensions);
272 let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
273
274 // then
275 assert!(decoded.is_empty());
276 }
277
278 #[test]
279 fn should_handle_high_dimensional_centroids() {
280 // given
281 let dimensions = 1536;
282 let vector: Vec<f32> = (0..dimensions).map(|i| i as f32).collect();
283 let value = CentroidChunkValue::new(vec![CentroidEntry::new(42, vector.clone())]);
284
285 // when
286 let encoded = value.encode_to_bytes(dimensions);
287 let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
288
289 // then
290 assert_eq!(decoded.entries.len(), 1);
291 assert_eq!(decoded.entries[0].centroid_id, 42);
292 assert_eq!(decoded.entries[0].vector, vector);
293 }
294
295 #[test]
296 fn should_find_centroid_by_id() {
297 // given
298 let value = CentroidChunkValue::new(vec![
299 CentroidEntry::new(10, vec![1.0, 2.0]),
300 CentroidEntry::new(20, vec![3.0, 4.0]),
301 CentroidEntry::new(30, vec![5.0, 6.0]),
302 ]);
303
304 // when / then
305 assert_eq!(value.get_centroid(20).unwrap().centroid_id, 20);
306 assert!(value.get_centroid(99).is_none());
307 }
308
309 #[test]
310 fn should_preserve_centroid_ids() {
311 // given
312 let dimensions = 2;
313 let value = CentroidChunkValue::new(vec![
314 CentroidEntry::new(100, vec![1.0, 2.0]),
315 CentroidEntry::new(200, vec![3.0, 4.0]),
316 CentroidEntry::new(u64::MAX, vec![5.0, 6.0]),
317 ]);
318
319 // when
320 let encoded = value.encode_to_bytes(dimensions);
321 let decoded = CentroidChunkValue::decode_from_bytes(&encoded, dimensions).unwrap();
322
323 // then
324 assert_eq!(decoded.entries[0].centroid_id, 100);
325 assert_eq!(decoded.entries[1].centroid_id, 200);
326 assert_eq!(decoded.entries[2].centroid_id, u64::MAX);
327 }
328}