Skip to main content

citadel_vector/
segment.rs

//! The ANNSEG body format: a storage-agnostic byte encoding of everything a
//! built [`AnnIndex`] holds, including a copy of the f32 vectors in
//! PRISM-internal slot order (the final section) for the fast cold-load path
//! ([`SegmentParts::into_index_embedded`]). The sealed-load path
//! ([`SegmentParts::into_index`]) does not use that copy: it rehydrates the
//! vectors at load time from the table rows themselves - the rows are the
//! source of truth, and the rehydration scan doubles as the staleness proof (it
//! computes the content fingerprint the storage layer compares against its
//! header).
//!
//! Layout: a fixed sequence of REQUIRED sections, each
//! `[tag u8][len u64 LE][payload][blake3(payload) 32B]`. Per-section hashes
//! refuse corruption at the section that broke; the storage layer additionally
//! hashes the whole body. All integers little-endian. Any layout change bumps
//! the storage header's `format_version` - this module never reads old
//! formats silently.
//!
//! `PointStore.vectors` order is PRISM-INTERNAL (cell-reordered): loaders must
//! place each scanned row's vector at `inverse(id_map)[row_id]`, never in scan
//! order - a scan-order fill silently corrupts every f32 rerank.
17
18use rustc_hash::FxHashMap;
19
20use crate::ann::AnnIndex;
21use crate::prism::{
22    BinaryStore, Cell, Graph, Metric, PartitionTree, PointStore, PrismConfig, PrismIndex, SQ8Store,
23};
24
/// Reasons a segment body is refused by [`decode`] or by
/// [`SegmentParts::into_index`].
#[derive(Debug, thiserror::Error)]
pub enum SegmentError {
    /// The input ended (or a length prefix overran it) while reading the
    /// named part.
    #[error("segment truncated in {0}")]
    Truncated(&'static str),
    /// A section started with a tag other than the one expected at that
    /// position in the fixed section sequence.
    #[error("segment section tag mismatch: expected {expected}, got {got}")]
    BadTag { expected: u8, got: u8 },
    /// The named section's payload does not match its stored BLAKE3 hash.
    #[error("segment section {0} failed its BLAKE3 check (corrupt)")]
    SectionHash(&'static str),
    /// The metric tag byte is not one of the known encodings.
    #[error("segment metric tag {0} unknown")]
    BadMetric(u8),
    /// A vector buffer's length differs from `n * dim`.
    #[error("rehydrated vectors length {got} != n*dim {expected}")]
    VectorLen { expected: usize, got: usize },
    /// The caller-reported count of filled vector slots differs from the
    /// segment's point count.
    #[error("rehydration filled {got} of {expected} vector slots")]
    RehydrationIncomplete { expected: usize, got: usize },
    /// Decoded fields disagree with each other; the message names the check
    /// that failed.
    #[error("segment internal inconsistency: {0}")]
    Inconsistent(&'static str),
}
42
// Section tags. `encode` writes the sections in ascending tag order and
// `decode` expects them in that same order; each tag is the first byte of its
// section's frame.
const TAG_GRAPH: u8 = 1;
const TAG_LOCAL_GRAPH: u8 = 2;
const TAG_SQ8: u8 = 3;
const TAG_BINARY: u8 = 4;
const TAG_TREE: u8 = 5;
const TAG_IDS: u8 = 6;
const TAG_ATTRS: u8 = 7;
const TAG_VECTORS: u8 = 8;
51
52/// BLAKE3 of the canonical little-endian encoding of EVERY [`PrismConfig`]
53/// field, domain-separated. The storage header pins this; a binary whose
54/// active config differs must refuse the segment (the graph was built for a
55/// different search geometry). The domain string carries the search-geometry
56/// version: bump it whenever build or search semantics change shape.
57pub fn prism_config_hash(cfg: &PrismConfig) -> [u8; 32] {
58    let mut h = blake3::Hasher::new();
59    h.update(b"citadel-annseg-config-v2");
60    for v in [
61        cfg.m_local as u64,
62        cfg.m_greedy as u64,
63        cfg.m_random as u64,
64        cfg.t as u64,
65        cfg.beam_width as u64,
66        cfg.binary_rerank as u64,
67    ] {
68        h.update(&v.to_le_bytes());
69    }
70    for v in [
71        cfg.alpha,
72        cfg.vamana_alpha,
73        cfg.sigma_high,
74        cfg.sigma_low,
75        cfg.beta,
76        cfg.epsilon,
77    ] {
78        h.update(&v.to_le_bytes());
79    }
80    h.update(&[metric_tag(cfg.metric)]);
81    *h.finalize().as_bytes()
82}
83
84pub fn metric_tag(m: Metric) -> u8 {
85    match m {
86        Metric::L2 => 0,
87        Metric::InnerProduct => 1,
88        Metric::Cosine => 2,
89    }
90}
91
92fn metric_from_tag(t: u8) -> Result<Metric, SegmentError> {
93    Ok(match t {
94        0 => Metric::L2,
95        1 => Metric::InnerProduct,
96        2 => Metric::Cosine,
97        other => return Err(SegmentError::BadMetric(other)),
98    })
99}
100
101/// Encode everything but the vectors. The output is the segment BODY; the
102/// storage layer wraps it in its header (fingerprint, config hash, counts).
103pub fn encode(index: &AnnIndex) -> Vec<u8> {
104    let p = index.prism();
105    let mut out = Vec::new();
106
107    section(&mut out, TAG_GRAPH, |b| encode_graph(b, &p.graph));
108    section(&mut out, TAG_LOCAL_GRAPH, |b| {
109        encode_graph(b, &p.local_graph)
110    });
111    section(&mut out, TAG_SQ8, |b| {
112        push_u64(b, p.sq8.dim() as u64);
113        push_slice_u8(b, p.sq8.codes());
114        push_slice_f32(b, p.sq8.mins());
115        push_slice_f32(b, p.sq8.scales());
116    });
117    section(&mut out, TAG_BINARY, |b| {
118        push_u64(b, p.binary.code_words() as u64);
119        push_u64(b, p.binary.block_size() as u64);
120        push_slice_u64(b, p.binary.codes());
121        push_slice_f32(b, p.binary.signs());
122    });
123    section(&mut out, TAG_TREE, |b| {
124        push_u64(b, p.tree.k as u64);
125        push_u64(b, p.tree.split_order.len() as u64);
126        for &s in &p.tree.split_order {
127            push_u64(b, s as u64);
128        }
129        push_u64(b, p.tree.cells.len() as u64);
130        for cell in &p.tree.cells {
131            push_slice_u32(b, &cell.values);
132            push_slice_u32(b, &cell.point_ids);
133        }
134    });
135    section(&mut out, TAG_IDS, |b| {
136        push_u64(b, index.snapshot_max);
137        b.push(metric_tag(index.metric));
138        b.extend_from_slice(&index.dim.to_le_bytes());
139        push_u64(b, u64::from(p.global_medoid));
140        push_slice_u32(b, &p.medoids);
141        push_slice_u32(b, &p.point_cell);
142        push_slice_u32(b, &p.original_ids);
143        push_slice_u64(b, index.id_map());
144    });
145    section(&mut out, TAG_ATTRS, |b| {
146        push_u64(b, p.store.attrs.len() as u64);
147        push_u64(b, p.store.len as u64);
148        for col in &p.store.attrs {
149            push_slice_u32(b, col);
150        }
151    });
152    // The f32 vectors in PRISM slot order, so a cold load is a bulk read, not a rescan.
153    section(&mut out, TAG_VECTORS, |b| {
154        push_u64(b, p.store.dim as u64);
155        push_slice_f32(b, &p.store.vectors);
156    });
157    out
158}
159
160/// Everything a segment carries; vectors arrive separately via
161/// [`SegmentParts::into_index`].
162pub struct SegmentParts {
163    graph: Graph,
164    local_graph: Graph,
165    sq8: SQ8Store,
166    binary: BinaryStore,
167    tree: PartitionTree,
168    snapshot_max: u64,
169    metric: Metric,
170    dim: u16,
171    global_medoid: u32,
172    medoids: Vec<u32>,
173    point_cell: Vec<u32>,
174    original_ids: Vec<u32>,
175    id_map: Vec<u64>,
176    attrs: Vec<Vec<u32>>,
177    vectors: Vec<f32>,
178    n: usize,
179}
180
181impl SegmentParts {
182    pub fn n(&self) -> usize {
183        self.n
184    }
185
186    pub fn dim(&self) -> u16 {
187        self.dim
188    }
189
190    pub fn metric(&self) -> Metric {
191        self.metric
192    }
193
194    pub fn id_map(&self) -> &[u64] {
195        &self.id_map
196    }
197
198    /// `row_id -> PRISM-internal slot`: the PERMUTATION the rehydration loader uses.
199    pub fn internal_of_row(&self) -> FxHashMap<u64, u32> {
200        self.id_map
201            .iter()
202            .enumerate()
203            .map(|(internal, &row)| (row, internal as u32))
204            .collect()
205    }
206
207    /// Assemble the index from vectors ALREADY in PRISM-internal slot order.
208    fn build(self, vectors: Vec<f32>) -> AnnIndex {
209        let store = PointStore::from_parts(vectors, self.dim as usize, self.attrs);
210        let prism = PrismIndex {
211            store,
212            tree: self.tree,
213            graph: self.graph,
214            local_graph: self.local_graph,
215            medoids: self.medoids,
216            global_medoid: self.global_medoid,
217            point_cell: self.point_cell,
218            original_ids: self.original_ids,
219            sq8: self.sq8,
220            binary: self.binary,
221            config: AnnIndex::active_config(self.metric),
222        };
223        AnnIndex::from_parts(prism, self.id_map, self.snapshot_max, self.metric, self.dim)
224    }
225
226    /// Build the index from externally-rehydrated vectors (id_map order); the sealed-load path.
227    pub fn into_index(
228        self,
229        mut vectors: Vec<f32>,
230        filled: usize,
231    ) -> Result<AnnIndex, SegmentError> {
232        if filled != self.n {
233            return Err(SegmentError::RehydrationIncomplete {
234                expected: self.n,
235                got: filled,
236            });
237        }
238        if vectors.len() != self.n * self.dim as usize {
239            return Err(SegmentError::VectorLen {
240                expected: self.n * self.dim as usize,
241                got: vectors.len(),
242            });
243        }
244        if self.metric == Metric::Cosine {
245            crate::prism::distance::normalize_rows(&mut vectors, self.dim as usize);
246        }
247        Ok(self.build(vectors))
248    }
249
250    /// Build the index from the segment's embedded build-form vectors - the fast cold-load path.
251    pub fn into_index_embedded(mut self) -> AnnIndex {
252        let vectors = std::mem::take(&mut self.vectors);
253        self.build(vectors)
254    }
255}
256
257/// Decode a segment body. Every section's BLAKE3 must verify; any mismatch is
258/// a corruption refusal, never a partial result.
259pub fn decode(bytes: &[u8]) -> Result<SegmentParts, SegmentError> {
260    let mut r = Reader { buf: bytes, at: 0 };
261
262    let g = r.section(TAG_GRAPH, "graph")?;
263    let graph = decode_graph(&mut Reader { buf: g, at: 0 }, "graph")?;
264    let lg = r.section(TAG_LOCAL_GRAPH, "local_graph")?;
265    let local_graph = decode_graph(&mut Reader { buf: lg, at: 0 }, "local_graph")?;
266
267    let s = r.section(TAG_SQ8, "sq8")?;
268    let mut sr = Reader { buf: s, at: 0 };
269    let sq8_dim = sr.u64("sq8")? as usize;
270    let codes = sr.slice_u8("sq8")?.to_vec();
271    let mins = sr.slice_f32("sq8")?;
272    let scales = sr.slice_f32("sq8")?;
273    let sq8 = SQ8Store::from_parts(codes, mins, scales, sq8_dim);
274
275    let b = r.section(TAG_BINARY, "binary")?;
276    let mut br = Reader { buf: b, at: 0 };
277    let code_words = br.u64("binary")? as usize;
278    let block_size = br.u64("binary")? as usize;
279    let bcodes = br.slice_u64("binary")?;
280    let signs = br.slice_f32("binary")?;
281    let binary = BinaryStore::from_parts(bcodes, code_words, signs, block_size);
282
283    let t = r.section(TAG_TREE, "tree")?;
284    let mut tr = Reader { buf: t, at: 0 };
285    let k = tr.u64("tree")? as usize;
286    let so_len = tr.u64("tree")? as usize;
287    let mut split_order = Vec::with_capacity(so_len);
288    for _ in 0..so_len {
289        split_order.push(tr.u64("tree")? as usize);
290    }
291    let cells_len = tr.u64("tree")? as usize;
292    let mut cells = Vec::with_capacity(cells_len);
293    for _ in 0..cells_len {
294        let values = tr.slice_u32("tree")?;
295        let point_ids = tr.slice_u32("tree")?;
296        cells.push(Cell { values, point_ids });
297    }
298    let tree = PartitionTree {
299        cells,
300        split_order,
301        k,
302    };
303
304    let i = r.section(TAG_IDS, "ids")?;
305    let mut ir = Reader { buf: i, at: 0 };
306    let snapshot_max = ir.u64("ids")?;
307    let metric = metric_from_tag(ir.u8("ids")?)?;
308    let dim = ir.u16("ids")?;
309    let global_medoid = ir.u64("ids")? as u32;
310    let medoids = ir.slice_u32("ids")?;
311    let point_cell = ir.slice_u32("ids")?;
312    let original_ids = ir.slice_u32("ids")?;
313    let id_map = ir.slice_u64("ids")?;
314
315    let a = r.section(TAG_ATTRS, "attrs")?;
316    let mut ar = Reader { buf: a, at: 0 };
317    let attr_k = ar.u64("attrs")? as usize;
318    let n = ar.u64("attrs")? as usize;
319    let mut attrs = Vec::with_capacity(attr_k);
320    for _ in 0..attr_k {
321        let col = ar.slice_u32("attrs")?;
322        if col.len() != n {
323            return Err(SegmentError::Inconsistent("attr column length != n"));
324        }
325        attrs.push(col);
326    }
327
328    let v = r.section(TAG_VECTORS, "vectors")?;
329    let mut vr = Reader { buf: v, at: 0 };
330    let vdim = vr.u64("vectors")? as usize;
331    let vectors = vr.slice_f32("vectors")?;
332    if vdim != dim as usize || vectors.len() != n * dim as usize {
333        return Err(SegmentError::VectorLen {
334            expected: n * dim as usize,
335            got: vectors.len(),
336        });
337    }
338
339    if id_map.len() != n || original_ids.len() != n || point_cell.len() != n {
340        return Err(SegmentError::Inconsistent("id arrays disagree on n"));
341    }
342    Ok(SegmentParts {
343        graph,
344        local_graph,
345        sq8,
346        binary,
347        tree,
348        snapshot_max,
349        metric,
350        dim,
351        global_medoid,
352        medoids,
353        point_cell,
354        original_ids,
355        id_map,
356        attrs,
357        vectors,
358        n,
359    })
360}
361
362fn encode_graph(b: &mut Vec<u8>, g: &Graph) {
363    push_u64(b, g.n as u64);
364    push_slice_u32(b, &g.offsets);
365    push_slice_u32(b, &g.neighbors);
366}
367
368fn decode_graph(r: &mut Reader<'_>, what: &'static str) -> Result<Graph, SegmentError> {
369    let n = r.u64(what)? as usize;
370    let offsets = r.slice_u32(what)?;
371    let neighbors = r.slice_u32(what)?;
372    if offsets.len() != n + 1 {
373        return Err(SegmentError::Inconsistent("graph offsets length != n+1"));
374    }
375    Ok(Graph {
376        offsets,
377        neighbors,
378        n,
379    })
380}
381
382fn section(out: &mut Vec<u8>, tag: u8, fill: impl FnOnce(&mut Vec<u8>)) {
383    let mut payload = Vec::new();
384    fill(&mut payload);
385    out.push(tag);
386    push_u64(out, payload.len() as u64);
387    let hash = blake3::hash(&payload);
388    out.extend_from_slice(&payload);
389    out.extend_from_slice(hash.as_bytes());
390}
391
/// Append `v` as 8 little-endian bytes.
fn push_u64(b: &mut Vec<u8>, v: u64) {
    let le = v.to_le_bytes();
    b.extend_from_slice(&le);
}

/// Append a byte slice: u64 element count, then the raw bytes.
fn push_slice_u8(b: &mut Vec<u8>, s: &[u8]) {
    let count = s.len() as u64;
    push_u64(b, count);
    b.extend_from_slice(s);
}

/// Append a `u32` slice: u64 element count, then each element little-endian.
fn push_slice_u32(b: &mut Vec<u8>, s: &[u32]) {
    push_u64(b, s.len() as u64);
    b.reserve(std::mem::size_of_val(s));
    s.iter()
        .for_each(|elem| b.extend_from_slice(&elem.to_le_bytes()));
}

/// Append a `u64` slice: u64 element count, then each element little-endian.
fn push_slice_u64(b: &mut Vec<u8>, s: &[u64]) {
    push_u64(b, s.len() as u64);
    b.reserve(std::mem::size_of_val(s));
    s.iter()
        .for_each(|elem| b.extend_from_slice(&elem.to_le_bytes()));
}

/// Append an `f32` slice: u64 element count, then each element's
/// little-endian bit pattern.
fn push_slice_f32(b: &mut Vec<u8>, s: &[f32]) {
    push_u64(b, s.len() as u64);
    b.reserve(std::mem::size_of_val(s));
    s.iter()
        .for_each(|elem| b.extend_from_slice(&elem.to_le_bytes()));
}
421
422struct Reader<'a> {
423    buf: &'a [u8],
424    at: usize,
425}
426
427impl<'a> Reader<'a> {
428    fn take(&mut self, n: usize, what: &'static str) -> Result<&'a [u8], SegmentError> {
429        let end = self
430            .at
431            .checked_add(n)
432            .filter(|&e| e <= self.buf.len())
433            .ok_or(SegmentError::Truncated(what))?;
434        let s = &self.buf[self.at..end];
435        self.at = end;
436        Ok(s)
437    }
438
439    fn u8(&mut self, what: &'static str) -> Result<u8, SegmentError> {
440        Ok(self.take(1, what)?[0])
441    }
442
443    fn u16(&mut self, what: &'static str) -> Result<u16, SegmentError> {
444        Ok(u16::from_le_bytes(self.take(2, what)?.try_into().unwrap()))
445    }
446
447    fn u64(&mut self, what: &'static str) -> Result<u64, SegmentError> {
448        Ok(u64::from_le_bytes(self.take(8, what)?.try_into().unwrap()))
449    }
450
451    /// One framed section: tag + length + payload + verified BLAKE3.
452    fn section(&mut self, tag: u8, what: &'static str) -> Result<&'a [u8], SegmentError> {
453        let got = self.u8(what)?;
454        if got != tag {
455            return Err(SegmentError::BadTag { expected: tag, got });
456        }
457        let len = self.u64(what)? as usize;
458        let payload = self.take(len, what)?;
459        let hash: [u8; 32] = self.take(32, what)?.try_into().unwrap();
460        if *blake3::hash(payload).as_bytes() != hash {
461            return Err(SegmentError::SectionHash(what));
462        }
463        Ok(payload)
464    }
465
466    fn slice_u8(&mut self, what: &'static str) -> Result<&'a [u8], SegmentError> {
467        let len = self.u64(what)? as usize;
468        self.take(len, what)
469    }
470
471    fn slice_u32(&mut self, what: &'static str) -> Result<Vec<u32>, SegmentError> {
472        let len = self.u64(what)? as usize;
473        let raw = self.take(
474            len.checked_mul(4).ok_or(SegmentError::Truncated(what))?,
475            what,
476        )?;
477        Ok(raw
478            .chunks_exact(4)
479            .map(|c| u32::from_le_bytes(c.try_into().unwrap()))
480            .collect())
481    }
482
483    fn slice_u64(&mut self, what: &'static str) -> Result<Vec<u64>, SegmentError> {
484        let len = self.u64(what)? as usize;
485        let raw = self.take(
486            len.checked_mul(8).ok_or(SegmentError::Truncated(what))?,
487            what,
488        )?;
489        Ok(raw
490            .chunks_exact(8)
491            .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
492            .collect())
493    }
494
495    fn slice_f32(&mut self, what: &'static str) -> Result<Vec<f32>, SegmentError> {
496        let len = self.u64(what)? as usize;
497        let raw = self.take(
498            len.checked_mul(4).ok_or(SegmentError::Truncated(what))?,
499            what,
500        )?;
501        Ok(raw
502            .chunks_exact(4)
503            .map(|c| f32::from_le_bytes(c.try_into().unwrap()))
504            .collect())
505    }
506}
507
#[cfg(test)]
mod tests {
    use super::*;

    /// Deterministic fixture rows: two attribute cells and NON-monotonic row
    /// ids (so id_map order != insertion order). RAW vectors, exactly what a
    /// table scan would yield.
    fn fixture_rows() -> Vec<(u64, Vec<f32>, Vec<u32>)> {
        (0..200u64)
            .map(|i| {
                // Reverse-ish ids: external order differs from internal.
                let id = 1000 - i * 3;
                let v: Vec<f32> = (0..8).map(|d| ((i * 7 + d) % 23) as f32 * 0.5).collect();
                (id, v, vec![(i % 2) as u32])
            })
            .collect()
    }

    fn build_fixture() -> AnnIndex {
        AnnIndex::build_with_attrs(fixture_rows(), 1, Metric::Cosine, 8).expect("build fixture")
    }

    /// Rehydrate exactly as the storage loader will: RAW row vectors placed by
    /// the id_map PERMUTATION (the index re-applies any build normalization).
    fn rehydrate(rows: &[(u64, Vec<f32>, Vec<u32>)], parts: &SegmentParts) -> (Vec<f32>, usize) {
        let inv = parts.internal_of_row();
        let dim = parts.dim() as usize;
        let mut vectors = vec![0.0f32; parts.n() * dim];
        let mut filled = 0;
        for (row, v, _) in rows {
            let slot = inv[row] as usize;
            vectors[slot * dim..(slot + 1) * dim].copy_from_slice(v);
            filled += 1;
        }
        (vectors, filled)
    }

    #[test]
    fn roundtrip_preserves_filtered_search_results_exactly() {
        // Attribute-filtered search exercises the persisted tree + attrs +
        // dicts machinery, not just the graph.
        let index = build_fixture();
        let parts = decode(&encode(&index)).expect("decode");
        let (vectors, filled) = rehydrate(&fixture_rows(), &parts);
        let loaded = parts.into_index(vectors, filled).expect("into_index");
        let query: Vec<f32> = (0..8).map(|d| d as f32 * 0.7).collect();
        for code in [0u32, 1] {
            let filter = crate::prism::Filter::new(vec![(0, vec![code])]);
            let a = index.search_filtered(&query, 8, 64, &filter);
            let b = loaded.search_filtered(&query, 8, 64, &filter);
            assert_eq!(a, b, "filtered (attr0={code}) results identical");
            assert!(!a.is_empty(), "filter {code} matches half the fixture");
        }
    }

    #[test]
    fn roundtrip_holds_for_every_metric() {
        for metric in [Metric::L2, Metric::InnerProduct, Metric::Cosine] {
            let rows: Vec<(u64, Vec<f32>, Vec<u32>)> = (0..60u64)
                .map(|i| {
                    let v: Vec<f32> = (0..4).map(|d| ((i + d) % 13) as f32 - 6.0).collect();
                    (i * 2 + 1, v, vec![0])
                })
                .collect();
            let index = AnnIndex::build_with_attrs(rows.clone(), 1, metric, 4).expect("build");
            let parts = decode(&encode(&index)).expect("decode");
            assert_eq!(parts.metric(), metric, "metric tag survives");
            let (vectors, filled) = rehydrate(&rows, &parts);
            let loaded = parts.into_index(vectors, filled).expect("into_index");
            let q = [1.0f32, -2.0, 3.0, 0.5];
            assert_eq!(index.search(&q, 5), loaded.search(&q, 5), "{metric:?}");
        }
    }

    #[test]
    fn single_row_index_roundtrips() {
        let rows = vec![(42u64, vec![1.0f32, 2.0], vec![0u32])];
        let index =
            AnnIndex::build_with_attrs(rows.clone(), 1, Metric::L2, 2).expect("build single");
        let parts = decode(&encode(&index)).expect("decode");
        assert_eq!(parts.n(), 1);
        let (vectors, filled) = rehydrate(&rows, &parts);
        let loaded = parts.into_index(vectors, filled).expect("into_index");
        assert_eq!(loaded.search(&[1.0, 2.0], 1), vec![(42, 0.0)]);
    }

    #[test]
    fn truncation_at_every_byte_boundary_is_refused() {
        // Cutting the segment ANYWHERE must produce an error, never a panic or
        // a silently partial decode.
        let index = AnnIndex::build_with_attrs(
            (0..12u64)
                .map(|i| (i, vec![i as f32, 1.0], vec![0]))
                .collect(),
            1,
            Metric::L2,
            2,
        )
        .expect("build");
        let bytes = encode(&index);
        for cut in 0..bytes.len() {
            assert!(
                decode(&bytes[..cut]).is_err(),
                "truncation at {cut}/{} must refuse",
                bytes.len()
            );
        }
    }

    #[test]
    fn out_of_order_section_tag_is_refused() {
        // The section sequence is fixed: a frame that opens with the wrong
        // tag is refused before its length or payload is even looked at.
        let index = build_fixture();
        let mut bytes = encode(&index);
        bytes[0] = TAG_LOCAL_GRAPH;
        assert!(matches!(
            decode(&bytes),
            Err(SegmentError::BadTag {
                expected: TAG_GRAPH,
                got: TAG_LOCAL_GRAPH
            })
        ));
    }

    #[test]
    fn internal_of_row_is_a_complete_bijection() {
        let index = build_fixture();
        let parts = decode(&encode(&index)).expect("decode");
        let map = parts.internal_of_row();
        assert_eq!(map.len(), parts.n(), "every row maps");
        let mut slots: Vec<u32> = map.values().copied().collect();
        slots.sort_unstable();
        let expected: Vec<u32> = (0..parts.n() as u32).collect();
        assert_eq!(slots, expected, "slots form a permutation of 0..n");
    }

    #[test]
    fn wrong_vector_length_is_refused() {
        let index = build_fixture();
        let parts = decode(&encode(&index)).expect("decode");
        let n = parts.n();
        // Derive the width from the segment rather than hard-coding the
        // fixture's dimensionality, so the test tracks fixture changes.
        let dim = parts.dim() as usize;
        let too_short = vec![0.0f32; (n - 1) * dim];
        assert!(matches!(
            parts.into_index(too_short, n),
            Err(SegmentError::VectorLen { .. })
        ));
    }

    #[test]
    fn roundtrip_preserves_search_results_exactly() {
        let index = build_fixture();
        let bytes = encode(&index);
        let parts = decode(&bytes).expect("decode");
        let (vectors, filled) = rehydrate(&fixture_rows(), &parts);
        let loaded = parts.into_index(vectors, filled).expect("into_index");

        let query: Vec<f32> = (0..8).map(|d| d as f32 * 0.3).collect();
        let a = index.search(&query, 10);
        let b = loaded.search(&query, 10);
        assert_eq!(a, b, "loaded index must answer EXACTLY like the original");
        assert_eq!(index.snapshot_max, loaded.snapshot_max);
        assert_eq!(index.id_map(), loaded.id_map());
    }

    #[test]
    fn embedded_load_answers_like_the_original() {
        // into_index_embedded (the fast path) must rebuild a search-identical index.
        let index = build_fixture();
        let parts = decode(&encode(&index)).expect("decode");
        let loaded = parts.into_index_embedded();
        let query: Vec<f32> = (0..8).map(|d| d as f32 * 0.3).collect();
        assert_eq!(
            index.search(&query, 10),
            loaded.search(&query, 10),
            "embedded-vector load must answer EXACTLY like the original"
        );
        assert_eq!(index.snapshot_max, loaded.snapshot_max);
        assert_eq!(index.id_map(), loaded.id_map());
    }

    #[test]
    fn every_section_corruption_is_refused() {
        let index = build_fixture();
        let bytes = encode(&index);
        // Flip one byte inside each section's payload region and expect a
        // refusal each time (walk the framing to find payload offsets).
        let mut at = 0usize;
        let mut payload_spots = Vec::new();
        while at < bytes.len() {
            let len = u64::from_le_bytes(bytes[at + 1..at + 9].try_into().unwrap()) as usize;
            payload_spots.push(at + 9 + len / 2);
            at += 1 + 8 + len + 32;
        }
        assert_eq!(payload_spots.len(), 8, "all eight sections present");
        for spot in payload_spots {
            let mut corrupt = bytes.clone();
            corrupt[spot] ^= 0xFF;
            assert!(
                matches!(decode(&corrupt), Err(SegmentError::SectionHash(_))),
                "corruption at {spot} must be refused"
            );
        }
    }

    #[test]
    fn incomplete_rehydration_is_refused() {
        let index = build_fixture();
        let parts = decode(&encode(&index)).expect("decode");
        let dim = parts.dim() as usize;
        let n = parts.n();
        let vectors = vec![0.0f32; n * dim];
        assert!(matches!(
            parts.into_index(vectors, n - 1),
            Err(SegmentError::RehydrationIncomplete { .. })
        ));
    }

    #[test]
    fn config_hash_is_sensitive_to_every_field() {
        let base = AnnIndex::active_config(Metric::Cosine);
        let h0 = prism_config_hash(&base);
        let variants: Vec<PrismConfig> = vec![
            PrismConfig {
                m_local: base.m_local + 1,
                ..base.clone()
            },
            PrismConfig {
                m_greedy: base.m_greedy + 1,
                ..base.clone()
            },
            PrismConfig {
                m_random: base.m_random + 2,
                ..base.clone()
            },
            PrismConfig {
                t: base.t + 1,
                ..base.clone()
            },
            PrismConfig {
                alpha: base.alpha + 0.5,
                ..base.clone()
            },
            PrismConfig {
                vamana_alpha: base.vamana_alpha + 0.5,
                ..base.clone()
            },
            PrismConfig {
                beam_width: base.beam_width + 1,
                ..base.clone()
            },
            PrismConfig {
                metric: Metric::L2,
                ..base.clone()
            },
            PrismConfig {
                sigma_high: base.sigma_high + 0.25,
                ..base.clone()
            },
            PrismConfig {
                sigma_low: base.sigma_low + 0.25,
                ..base.clone()
            },
            PrismConfig {
                beta: base.beta + 0.5,
                ..base.clone()
            },
            PrismConfig {
                epsilon: base.epsilon + 0.5,
                ..base.clone()
            },
            PrismConfig {
                binary_rerank: base.binary_rerank + 1,
                ..base.clone()
            },
        ];
        for (i, v) in variants.iter().enumerate() {
            assert_ne!(
                prism_config_hash(v),
                h0,
                "config field {i} must perturb the hash"
            );
        }
    }
}
776}