lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Deletion bitmaps: roaring bitmap tracking of deleted documents.
//!
//! Segments are immutable once written. Deletions are tracked externally
//! via per-segment roaring bitmaps. The `FilteredScorer` wraps a `Scorer`
//! and skips deleted doc IDs during iteration.
//!
//! See [[architecture-segment-layout#Deletion Bitmap]] and [[roaring-bitmaps]].

use std::collections::HashMap;

use crate::core::{DocId, NO_MORE_DOCS, Scorer, SegmentId, TwoPhaseIterator};
use roaring::RoaringBitmap;

/// Tracks deleted documents per segment.
#[derive(Clone, Debug, Default)]
pub struct DeletionMap {
    bitmaps: HashMap<SegmentId, RoaringBitmap>,
}

impl DeletionMap {
    pub fn new() -> Self {
        Self::default()
    }

    /// Mark a document as deleted in a segment.
    pub fn mark_deleted(&mut self, segment_id: SegmentId, doc_id: DocId) {
        self.bitmaps
            .entry(segment_id)
            .or_default()
            .insert(doc_id.as_u32());
    }

    /// Check if a document is deleted.
    pub fn is_deleted(&self, segment_id: SegmentId, doc_id: DocId) -> bool {
        self.bitmaps
            .get(&segment_id)
            .is_some_and(|bm| bm.contains(doc_id.as_u32()))
    }

    /// Get the deletion bitmap for a segment (if any deletions exist).
    pub fn bitmap(&self, segment_id: SegmentId) -> Option<&RoaringBitmap> {
        self.bitmaps.get(&segment_id)
    }

    /// Number of deleted documents in a segment.
    pub fn deleted_count(&self, segment_id: SegmentId) -> u64 {
        self.bitmaps.get(&segment_id).map_or(0, |bm| bm.len())
    }

    /// Serialize to bytes for persistence.
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut buf = Vec::new();
        let count = self.bitmaps.len() as u32;
        buf.extend_from_slice(&count.to_le_bytes());

        for (&segment_id, bitmap) in &self.bitmaps {
            buf.extend_from_slice(&segment_id.as_u64().to_le_bytes());
            let mut bm_buf = Vec::new();
            bitmap.serialize_into(&mut bm_buf).unwrap();
            buf.extend_from_slice(&(bm_buf.len() as u32).to_le_bytes());
            buf.extend_from_slice(&bm_buf);
        }

        buf
    }

    /// Deserialize from bytes.
    pub fn from_bytes(data: &[u8]) -> crate::core::Result<Self> {
        if data.len() < 4 {
            return Ok(Self::new());
        }

        let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
        let mut pos = 4;
        let mut bitmaps = HashMap::with_capacity(count);

        for _ in 0..count {
            let seg_id = SegmentId::new(u64::from_le_bytes(data[pos..pos + 8].try_into().unwrap()));
            pos += 8;
            let bm_len = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
            pos += 4;
            let bitmap =
                RoaringBitmap::deserialize_from(&data[pos..pos + bm_len]).map_err(|e| {
                    crate::core::LuciError::IndexCorrupted(format!(
                        "failed to deserialize deletion bitmap: {e}"
                    ))
                })?;
            pos += bm_len;
            bitmaps.insert(seg_id, bitmap);
        }

        Ok(Self { bitmaps })
    }
}

/// A scorer that wraps another scorer and skips deleted doc IDs.
pub struct FilteredScorer<S: Scorer> {
    inner: S,
    deleted: RoaringBitmap,
}

impl<S: Scorer> FilteredScorer<S> {
    pub fn new(inner: S, deleted: RoaringBitmap) -> Self {
        // Advance past any initially deleted doc
        let mut s = Self { inner, deleted };
        s.skip_deleted();
        s
    }

    fn skip_deleted(&mut self) {
        while self.inner.doc_id() != NO_MORE_DOCS
            && self.deleted.contains(self.inner.doc_id().as_u32())
        {
            self.inner.next();
        }
    }
}

impl<S: Scorer> Scorer for FilteredScorer<S> {
    fn doc_id(&self) -> DocId {
        self.inner.doc_id()
    }

    fn next(&mut self) -> DocId {
        self.inner.next();
        self.skip_deleted();
        self.inner.doc_id()
    }

    fn advance(&mut self, target: DocId) -> DocId {
        self.inner.advance(target);
        self.skip_deleted();
        self.inner.doc_id()
    }

    fn score(&mut self) -> f32 {
        self.inner.score()
    }

    fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
        self.inner.two_phase()
    }

    fn max_score(&self) -> f32 {
        self.inner.max_score()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn mark_and_check_deleted() {
        let mut dm = DeletionMap::new();
        let seg = SegmentId::new(1);

        assert!(!dm.is_deleted(seg, DocId::new(0)));
        dm.mark_deleted(seg, DocId::new(0));
        assert!(dm.is_deleted(seg, DocId::new(0)));
        assert!(!dm.is_deleted(seg, DocId::new(1)));
    }

    #[test]
    fn deleted_count() {
        let mut dm = DeletionMap::new();
        let seg = SegmentId::new(1);

        assert_eq!(dm.deleted_count(seg), 0);
        dm.mark_deleted(seg, DocId::new(0));
        dm.mark_deleted(seg, DocId::new(5));
        dm.mark_deleted(seg, DocId::new(10));
        assert_eq!(dm.deleted_count(seg), 3);
    }

    #[test]
    fn multiple_segments() {
        let mut dm = DeletionMap::new();
        dm.mark_deleted(SegmentId::new(1), DocId::new(0));
        dm.mark_deleted(SegmentId::new(2), DocId::new(5));

        assert!(dm.is_deleted(SegmentId::new(1), DocId::new(0)));
        assert!(!dm.is_deleted(SegmentId::new(1), DocId::new(5)));
        assert!(!dm.is_deleted(SegmentId::new(2), DocId::new(0)));
        assert!(dm.is_deleted(SegmentId::new(2), DocId::new(5)));
    }

    #[test]
    fn serialization_round_trip() {
        let mut dm = DeletionMap::new();
        dm.mark_deleted(SegmentId::new(1), DocId::new(0));
        dm.mark_deleted(SegmentId::new(1), DocId::new(100));
        dm.mark_deleted(SegmentId::new(2), DocId::new(50));

        let bytes = dm.to_bytes();
        let dm2 = DeletionMap::from_bytes(&bytes).unwrap();

        assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(0)));
        assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(100)));
        assert!(!dm2.is_deleted(SegmentId::new(1), DocId::new(50)));
        assert!(dm2.is_deleted(SegmentId::new(2), DocId::new(50)));
    }

    #[test]
    fn empty_serialization() {
        let dm = DeletionMap::new();
        let bytes = dm.to_bytes();
        let dm2 = DeletionMap::from_bytes(&bytes).unwrap();
        assert_eq!(dm2.deleted_count(SegmentId::new(1)), 0);
    }

    // --- FilteredScorer tests ---

    /// Simple test scorer for FilteredScorer tests.
    struct VecScorer {
        docs: Vec<(DocId, f32)>,
        pos: usize,
    }

    impl VecScorer {
        fn new(docs: Vec<(DocId, f32)>) -> Self {
            Self { docs, pos: 0 }
        }

        fn current(&self) -> (DocId, f32) {
            if self.pos < self.docs.len() {
                self.docs[self.pos]
            } else {
                (NO_MORE_DOCS, 0.0)
            }
        }
    }

    impl Scorer for VecScorer {
        fn doc_id(&self) -> DocId {
            self.current().0
        }

        fn next(&mut self) -> DocId {
            if self.pos < self.docs.len() {
                self.pos += 1;
            }
            self.current().0
        }

        fn advance(&mut self, target: DocId) -> DocId {
            while self.pos < self.docs.len() && self.docs[self.pos].0 < target {
                self.pos += 1;
            }
            self.current().0
        }

        fn score(&mut self) -> f32 {
            self.current().1
        }

        fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
            None
        }
    }

    #[test]
    fn filtered_scorer_skips_deleted() {
        let scorer = VecScorer::new(vec![
            (DocId::new(0), 1.0),
            (DocId::new(1), 2.0),
            (DocId::new(2), 3.0),
            (DocId::new(3), 4.0),
        ]);
        let mut deleted = RoaringBitmap::new();
        deleted.insert(0);
        deleted.insert(2);

        let mut filtered = FilteredScorer::new(scorer, deleted);

        // Should skip doc 0, start at doc 1
        assert_eq!(filtered.doc_id(), DocId::new(1));
        assert_eq!(filtered.score(), 2.0);

        // Next should skip doc 2, land on doc 3
        assert_eq!(filtered.next(), DocId::new(3));
        assert_eq!(filtered.score(), 4.0);

        assert_eq!(filtered.next(), NO_MORE_DOCS);
    }

    #[test]
    fn filtered_scorer_advance_past_deleted() {
        let scorer = VecScorer::new(vec![
            (DocId::new(0), 1.0),
            (DocId::new(5), 2.0),
            (DocId::new(10), 3.0),
            (DocId::new(15), 4.0),
        ]);
        let mut deleted = RoaringBitmap::new();
        deleted.insert(5);
        deleted.insert(10);

        let mut filtered = FilteredScorer::new(scorer, deleted);

        assert_eq!(filtered.doc_id(), DocId::new(0));
        // Advance to 5 should skip 5 and 10, land on 15
        assert_eq!(filtered.advance(DocId::new(5)), DocId::new(15));
    }

    #[test]
    fn filtered_scorer_all_deleted() {
        let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
        let mut deleted = RoaringBitmap::new();
        deleted.insert(0);
        deleted.insert(1);

        let filtered = FilteredScorer::new(scorer, deleted);
        assert_eq!(filtered.doc_id(), NO_MORE_DOCS);
    }

    #[test]
    fn filtered_scorer_none_deleted() {
        let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
        let deleted = RoaringBitmap::new();

        let mut filtered = FilteredScorer::new(scorer, deleted);
        assert_eq!(filtered.doc_id(), DocId::new(0));
        assert_eq!(filtered.next(), DocId::new(1));
        assert_eq!(filtered.next(), NO_MORE_DOCS);
    }
}