Skip to main content

luci/
deletion.rs

1//! Deletion bitmaps: roaring bitmap tracking of deleted documents.
2//!
3//! Segments are immutable once written. Deletions are tracked externally
4//! via per-segment roaring bitmaps. The `FilteredScorer` wraps a `Scorer`
5//! and skips deleted doc IDs during iteration.
6//!
7//! See [[architecture-segment-layout#Deletion Bitmap]] and [[roaring-bitmaps]].
8
9use std::collections::HashMap;
10
11use crate::core::{DocId, NO_MORE_DOCS, Scorer, SegmentId, TwoPhaseIterator};
12use roaring::RoaringBitmap;
13
14/// Tracks deleted documents per segment.
15#[derive(Clone, Debug, Default)]
16pub struct DeletionMap {
17    bitmaps: HashMap<SegmentId, RoaringBitmap>,
18}
19
20impl DeletionMap {
21    pub fn new() -> Self {
22        Self::default()
23    }
24
25    /// Mark a document as deleted in a segment.
26    pub fn mark_deleted(&mut self, segment_id: SegmentId, doc_id: DocId) {
27        self.bitmaps
28            .entry(segment_id)
29            .or_default()
30            .insert(doc_id.as_u32());
31    }
32
33    /// Check if a document is deleted.
34    pub fn is_deleted(&self, segment_id: SegmentId, doc_id: DocId) -> bool {
35        self.bitmaps
36            .get(&segment_id)
37            .is_some_and(|bm| bm.contains(doc_id.as_u32()))
38    }
39
40    /// Get the deletion bitmap for a segment (if any deletions exist).
41    pub fn bitmap(&self, segment_id: SegmentId) -> Option<&RoaringBitmap> {
42        self.bitmaps.get(&segment_id)
43    }
44
45    /// Number of deleted documents in a segment.
46    pub fn deleted_count(&self, segment_id: SegmentId) -> u64 {
47        self.bitmaps.get(&segment_id).map_or(0, |bm| bm.len())
48    }
49
50    /// Serialize to bytes for persistence.
51    pub fn to_bytes(&self) -> Vec<u8> {
52        let mut buf = Vec::new();
53        let count = self.bitmaps.len() as u32;
54        buf.extend_from_slice(&count.to_le_bytes());
55
56        for (&segment_id, bitmap) in &self.bitmaps {
57            buf.extend_from_slice(&segment_id.as_u64().to_le_bytes());
58            let mut bm_buf = Vec::new();
59            bitmap.serialize_into(&mut bm_buf).unwrap();
60            buf.extend_from_slice(&(bm_buf.len() as u32).to_le_bytes());
61            buf.extend_from_slice(&bm_buf);
62        }
63
64        buf
65    }
66
67    /// Deserialize from bytes.
68    pub fn from_bytes(data: &[u8]) -> crate::core::Result<Self> {
69        if data.len() < 4 {
70            return Ok(Self::new());
71        }
72
73        let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
74        let mut pos = 4;
75        let mut bitmaps = HashMap::with_capacity(count);
76
77        for _ in 0..count {
78            let seg_id = SegmentId::new(u64::from_le_bytes(data[pos..pos + 8].try_into().unwrap()));
79            pos += 8;
80            let bm_len = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
81            pos += 4;
82            let bitmap =
83                RoaringBitmap::deserialize_from(&data[pos..pos + bm_len]).map_err(|e| {
84                    crate::core::LuciError::IndexCorrupted(format!(
85                        "failed to deserialize deletion bitmap: {e}"
86                    ))
87                })?;
88            pos += bm_len;
89            bitmaps.insert(seg_id, bitmap);
90        }
91
92        Ok(Self { bitmaps })
93    }
94}
95
96/// A scorer that wraps another scorer and skips deleted doc IDs.
97pub struct FilteredScorer<S: Scorer> {
98    inner: S,
99    deleted: RoaringBitmap,
100}
101
102impl<S: Scorer> FilteredScorer<S> {
103    pub fn new(inner: S, deleted: RoaringBitmap) -> Self {
104        // Advance past any initially deleted doc
105        let mut s = Self { inner, deleted };
106        s.skip_deleted();
107        s
108    }
109
110    fn skip_deleted(&mut self) {
111        while self.inner.doc_id() != NO_MORE_DOCS
112            && self.deleted.contains(self.inner.doc_id().as_u32())
113        {
114            self.inner.next();
115        }
116    }
117}
118
119impl<S: Scorer> Scorer for FilteredScorer<S> {
120    fn doc_id(&self) -> DocId {
121        self.inner.doc_id()
122    }
123
124    fn next(&mut self) -> DocId {
125        self.inner.next();
126        self.skip_deleted();
127        self.inner.doc_id()
128    }
129
130    fn advance(&mut self, target: DocId) -> DocId {
131        self.inner.advance(target);
132        self.skip_deleted();
133        self.inner.doc_id()
134    }
135
136    fn score(&mut self) -> f32 {
137        self.inner.score()
138    }
139
140    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
141        self.inner.two_phase()
142    }
143
144    fn max_score(&self) -> f32 {
145        self.inner.max_score()
146    }
147}
148
149#[cfg(test)]
150mod tests {
151    use super::*;
152
153    #[test]
154    fn mark_and_check_deleted() {
155        let mut dm = DeletionMap::new();
156        let seg = SegmentId::new(1);
157
158        assert!(!dm.is_deleted(seg, DocId::new(0)));
159        dm.mark_deleted(seg, DocId::new(0));
160        assert!(dm.is_deleted(seg, DocId::new(0)));
161        assert!(!dm.is_deleted(seg, DocId::new(1)));
162    }
163
164    #[test]
165    fn deleted_count() {
166        let mut dm = DeletionMap::new();
167        let seg = SegmentId::new(1);
168
169        assert_eq!(dm.deleted_count(seg), 0);
170        dm.mark_deleted(seg, DocId::new(0));
171        dm.mark_deleted(seg, DocId::new(5));
172        dm.mark_deleted(seg, DocId::new(10));
173        assert_eq!(dm.deleted_count(seg), 3);
174    }
175
176    #[test]
177    fn multiple_segments() {
178        let mut dm = DeletionMap::new();
179        dm.mark_deleted(SegmentId::new(1), DocId::new(0));
180        dm.mark_deleted(SegmentId::new(2), DocId::new(5));
181
182        assert!(dm.is_deleted(SegmentId::new(1), DocId::new(0)));
183        assert!(!dm.is_deleted(SegmentId::new(1), DocId::new(5)));
184        assert!(!dm.is_deleted(SegmentId::new(2), DocId::new(0)));
185        assert!(dm.is_deleted(SegmentId::new(2), DocId::new(5)));
186    }
187
188    #[test]
189    fn serialization_round_trip() {
190        let mut dm = DeletionMap::new();
191        dm.mark_deleted(SegmentId::new(1), DocId::new(0));
192        dm.mark_deleted(SegmentId::new(1), DocId::new(100));
193        dm.mark_deleted(SegmentId::new(2), DocId::new(50));
194
195        let bytes = dm.to_bytes();
196        let dm2 = DeletionMap::from_bytes(&bytes).unwrap();
197
198        assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(0)));
199        assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(100)));
200        assert!(!dm2.is_deleted(SegmentId::new(1), DocId::new(50)));
201        assert!(dm2.is_deleted(SegmentId::new(2), DocId::new(50)));
202    }
203
204    #[test]
205    fn empty_serialization() {
206        let dm = DeletionMap::new();
207        let bytes = dm.to_bytes();
208        let dm2 = DeletionMap::from_bytes(&bytes).unwrap();
209        assert_eq!(dm2.deleted_count(SegmentId::new(1)), 0);
210    }
211
212    // --- FilteredScorer tests ---
213
214    /// Simple test scorer for FilteredScorer tests.
215    struct VecScorer {
216        docs: Vec<(DocId, f32)>,
217        pos: usize,
218    }
219
220    impl VecScorer {
221        fn new(docs: Vec<(DocId, f32)>) -> Self {
222            Self { docs, pos: 0 }
223        }
224
225        fn current(&self) -> (DocId, f32) {
226            if self.pos < self.docs.len() {
227                self.docs[self.pos]
228            } else {
229                (NO_MORE_DOCS, 0.0)
230            }
231        }
232    }
233
234    impl Scorer for VecScorer {
235        fn doc_id(&self) -> DocId {
236            self.current().0
237        }
238
239        fn next(&mut self) -> DocId {
240            if self.pos < self.docs.len() {
241                self.pos += 1;
242            }
243            self.current().0
244        }
245
246        fn advance(&mut self, target: DocId) -> DocId {
247            while self.pos < self.docs.len() && self.docs[self.pos].0 < target {
248                self.pos += 1;
249            }
250            self.current().0
251        }
252
253        fn score(&mut self) -> f32 {
254            self.current().1
255        }
256
257        fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
258            None
259        }
260    }
261
262    #[test]
263    fn filtered_scorer_skips_deleted() {
264        let scorer = VecScorer::new(vec![
265            (DocId::new(0), 1.0),
266            (DocId::new(1), 2.0),
267            (DocId::new(2), 3.0),
268            (DocId::new(3), 4.0),
269        ]);
270        let mut deleted = RoaringBitmap::new();
271        deleted.insert(0);
272        deleted.insert(2);
273
274        let mut filtered = FilteredScorer::new(scorer, deleted);
275
276        // Should skip doc 0, start at doc 1
277        assert_eq!(filtered.doc_id(), DocId::new(1));
278        assert_eq!(filtered.score(), 2.0);
279
280        // Next should skip doc 2, land on doc 3
281        assert_eq!(filtered.next(), DocId::new(3));
282        assert_eq!(filtered.score(), 4.0);
283
284        assert_eq!(filtered.next(), NO_MORE_DOCS);
285    }
286
287    #[test]
288    fn filtered_scorer_advance_past_deleted() {
289        let scorer = VecScorer::new(vec![
290            (DocId::new(0), 1.0),
291            (DocId::new(5), 2.0),
292            (DocId::new(10), 3.0),
293            (DocId::new(15), 4.0),
294        ]);
295        let mut deleted = RoaringBitmap::new();
296        deleted.insert(5);
297        deleted.insert(10);
298
299        let mut filtered = FilteredScorer::new(scorer, deleted);
300
301        assert_eq!(filtered.doc_id(), DocId::new(0));
302        // Advance to 5 should skip 5 and 10, land on 15
303        assert_eq!(filtered.advance(DocId::new(5)), DocId::new(15));
304    }
305
306    #[test]
307    fn filtered_scorer_all_deleted() {
308        let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
309        let mut deleted = RoaringBitmap::new();
310        deleted.insert(0);
311        deleted.insert(1);
312
313        let filtered = FilteredScorer::new(scorer, deleted);
314        assert_eq!(filtered.doc_id(), NO_MORE_DOCS);
315    }
316
317    #[test]
318    fn filtered_scorer_none_deleted() {
319        let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
320        let deleted = RoaringBitmap::new();
321
322        let mut filtered = FilteredScorer::new(scorer, deleted);
323        assert_eq!(filtered.doc_id(), DocId::new(0));
324        assert_eq!(filtered.next(), DocId::new(1));
325        assert_eq!(filtered.next(), NO_MORE_DOCS);
326    }
327}