Skip to main content

nodedb_vector/collection/
lifecycle_compact.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Compact and snapshot operations for `VectorCollection`.
4
5use nodedb_types::Surrogate;
6
7use super::lifecycle::VectorCollection;
8
9impl VectorCollection {
10    /// Compact sealed segments by removing tombstoned nodes.
11    ///
12    /// Rewrites `surrogate_map` and `multi_doc_map` for every sealed
13    /// segment so that global ids continue to resolve to the correct
14    /// surrogate after local-id renumbering.
15    pub fn compact(&mut self) -> usize {
16        let mut total_removed = 0;
17        for seg in &mut self.sealed {
18            let base_id = seg.base_id;
19            let (removed, id_map) = seg.index.compact_with_map();
20            total_removed += removed;
21            if removed == 0 {
22                continue;
23            }
24
25            let segment_end = base_id as u64 + id_map.len() as u64;
26            let global_keys: Vec<u32> = self
27                .surrogate_map
28                .keys()
29                .copied()
30                .filter(|&k| (k as u64) >= base_id as u64 && (k as u64) < segment_end)
31                .collect();
32            // Two-phase: remove old entries first, then insert new ones
33            // so we don't clobber a freshly-remapped entry with a later
34            // tombstone removal.
35            let mut new_entries: Vec<(u32, Surrogate)> = Vec::with_capacity(global_keys.len());
36            for old_global in &global_keys {
37                let surrogate = self.surrogate_map.remove(old_global);
38                let old_local = (old_global - base_id) as usize;
39                let new_local = id_map[old_local];
40                if new_local != u32::MAX
41                    && let Some(s) = surrogate
42                {
43                    new_entries.push((base_id + new_local, s));
44                } else if let Some(s) = surrogate {
45                    // Tombstoned — drop reverse mapping too.
46                    self.surrogate_to_local.remove(&s);
47                }
48            }
49            for (k, s) in new_entries {
50                self.surrogate_map.insert(k, s);
51                self.surrogate_to_local.insert(s, k);
52            }
53
54            // Rewrite multi_doc_map entries for this segment.
55            for ids in self.multi_doc_map.values_mut() {
56                ids.retain_mut(|vid| {
57                    let v = *vid;
58                    if (v as u64) >= base_id as u64 && (v as u64) < segment_end {
59                        let old_local = (v - base_id) as usize;
60                        let new_local = id_map[old_local];
61                        if new_local == u32::MAX {
62                            false
63                        } else {
64                            *vid = base_id + new_local;
65                            true
66                        }
67                    } else {
68                        true
69                    }
70                });
71            }
72        }
73        total_removed
74    }
75
76    /// Export all live vectors for snapshot.
77    pub fn export_snapshot(&self) -> Vec<(u32, Vec<f32>, Option<Surrogate>)> {
78        let mut result = Vec::new();
79
80        for i in 0..self.growing.len() as u32 {
81            let vid = self.growing_base_id + i;
82            if let Some(data) = self.growing.get_vector(i) {
83                let surrogate = self.surrogate_map.get(&vid).copied();
84                result.push((vid, data.to_vec(), surrogate));
85            }
86        }
87
88        for seg in &self.sealed {
89            let vectors = seg.index.export_vectors();
90            for (i, vec_data) in vectors.into_iter().enumerate() {
91                let vid = seg.base_id + i as u32;
92                let surrogate = self.surrogate_map.get(&vid).copied();
93                result.push((vid, vec_data, surrogate));
94            }
95        }
96
97        for seg in &self.building {
98            for i in 0..seg.flat.len() as u32 {
99                let vid = seg.base_id + i;
100                if let Some(data) = seg.flat.get_vector(i) {
101                    let surrogate = self.surrogate_map.get(&vid).copied();
102                    result.push((vid, data.to_vec(), surrogate));
103                }
104            }
105        }
106
107        result
108    }
109}