use std::collections::HashMap;
use crate::core::{DocId, NO_MORE_DOCS, Scorer, SegmentId, TwoPhaseIterator};
use roaring::RoaringBitmap;
#[derive(Clone, Debug, Default)]
pub struct DeletionMap {
bitmaps: HashMap<SegmentId, RoaringBitmap>,
}
impl DeletionMap {
pub fn new() -> Self {
Self::default()
}
pub fn mark_deleted(&mut self, segment_id: SegmentId, doc_id: DocId) {
self.bitmaps
.entry(segment_id)
.or_default()
.insert(doc_id.as_u32());
}
pub fn is_deleted(&self, segment_id: SegmentId, doc_id: DocId) -> bool {
self.bitmaps
.get(&segment_id)
.is_some_and(|bm| bm.contains(doc_id.as_u32()))
}
pub fn bitmap(&self, segment_id: SegmentId) -> Option<&RoaringBitmap> {
self.bitmaps.get(&segment_id)
}
pub fn deleted_count(&self, segment_id: SegmentId) -> u64 {
self.bitmaps.get(&segment_id).map_or(0, |bm| bm.len())
}
pub fn to_bytes(&self) -> Vec<u8> {
let mut buf = Vec::new();
let count = self.bitmaps.len() as u32;
buf.extend_from_slice(&count.to_le_bytes());
for (&segment_id, bitmap) in &self.bitmaps {
buf.extend_from_slice(&segment_id.as_u64().to_le_bytes());
let mut bm_buf = Vec::new();
bitmap.serialize_into(&mut bm_buf).unwrap();
buf.extend_from_slice(&(bm_buf.len() as u32).to_le_bytes());
buf.extend_from_slice(&bm_buf);
}
buf
}
pub fn from_bytes(data: &[u8]) -> crate::core::Result<Self> {
if data.len() < 4 {
return Ok(Self::new());
}
let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize;
let mut pos = 4;
let mut bitmaps = HashMap::with_capacity(count);
for _ in 0..count {
let seg_id = SegmentId::new(u64::from_le_bytes(data[pos..pos + 8].try_into().unwrap()));
pos += 8;
let bm_len = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
pos += 4;
let bitmap =
RoaringBitmap::deserialize_from(&data[pos..pos + bm_len]).map_err(|e| {
crate::core::LuciError::IndexCorrupted(format!(
"failed to deserialize deletion bitmap: {e}"
))
})?;
pos += bm_len;
bitmaps.insert(seg_id, bitmap);
}
Ok(Self { bitmaps })
}
}
pub struct FilteredScorer<S: Scorer> {
inner: S,
deleted: RoaringBitmap,
}
impl<S: Scorer> FilteredScorer<S> {
pub fn new(inner: S, deleted: RoaringBitmap) -> Self {
let mut s = Self { inner, deleted };
s.skip_deleted();
s
}
fn skip_deleted(&mut self) {
while self.inner.doc_id() != NO_MORE_DOCS
&& self.deleted.contains(self.inner.doc_id().as_u32())
{
self.inner.next();
}
}
}
impl<S: Scorer> Scorer for FilteredScorer<S> {
fn doc_id(&self) -> DocId {
self.inner.doc_id()
}
fn next(&mut self) -> DocId {
self.inner.next();
self.skip_deleted();
self.inner.doc_id()
}
fn advance(&mut self, target: DocId) -> DocId {
self.inner.advance(target);
self.skip_deleted();
self.inner.doc_id()
}
fn score(&mut self) -> f32 {
self.inner.score()
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
self.inner.two_phase()
}
fn max_score(&self) -> f32 {
self.inner.max_score()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn mark_and_check_deleted() {
let mut dm = DeletionMap::new();
let seg = SegmentId::new(1);
assert!(!dm.is_deleted(seg, DocId::new(0)));
dm.mark_deleted(seg, DocId::new(0));
assert!(dm.is_deleted(seg, DocId::new(0)));
assert!(!dm.is_deleted(seg, DocId::new(1)));
}
#[test]
fn deleted_count() {
let mut dm = DeletionMap::new();
let seg = SegmentId::new(1);
assert_eq!(dm.deleted_count(seg), 0);
dm.mark_deleted(seg, DocId::new(0));
dm.mark_deleted(seg, DocId::new(5));
dm.mark_deleted(seg, DocId::new(10));
assert_eq!(dm.deleted_count(seg), 3);
}
#[test]
fn multiple_segments() {
let mut dm = DeletionMap::new();
dm.mark_deleted(SegmentId::new(1), DocId::new(0));
dm.mark_deleted(SegmentId::new(2), DocId::new(5));
assert!(dm.is_deleted(SegmentId::new(1), DocId::new(0)));
assert!(!dm.is_deleted(SegmentId::new(1), DocId::new(5)));
assert!(!dm.is_deleted(SegmentId::new(2), DocId::new(0)));
assert!(dm.is_deleted(SegmentId::new(2), DocId::new(5)));
}
#[test]
fn serialization_round_trip() {
let mut dm = DeletionMap::new();
dm.mark_deleted(SegmentId::new(1), DocId::new(0));
dm.mark_deleted(SegmentId::new(1), DocId::new(100));
dm.mark_deleted(SegmentId::new(2), DocId::new(50));
let bytes = dm.to_bytes();
let dm2 = DeletionMap::from_bytes(&bytes).unwrap();
assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(0)));
assert!(dm2.is_deleted(SegmentId::new(1), DocId::new(100)));
assert!(!dm2.is_deleted(SegmentId::new(1), DocId::new(50)));
assert!(dm2.is_deleted(SegmentId::new(2), DocId::new(50)));
}
#[test]
fn empty_serialization() {
let dm = DeletionMap::new();
let bytes = dm.to_bytes();
let dm2 = DeletionMap::from_bytes(&bytes).unwrap();
assert_eq!(dm2.deleted_count(SegmentId::new(1)), 0);
}
struct VecScorer {
docs: Vec<(DocId, f32)>,
pos: usize,
}
impl VecScorer {
fn new(docs: Vec<(DocId, f32)>) -> Self {
Self { docs, pos: 0 }
}
fn current(&self) -> (DocId, f32) {
if self.pos < self.docs.len() {
self.docs[self.pos]
} else {
(NO_MORE_DOCS, 0.0)
}
}
}
impl Scorer for VecScorer {
fn doc_id(&self) -> DocId {
self.current().0
}
fn next(&mut self) -> DocId {
if self.pos < self.docs.len() {
self.pos += 1;
}
self.current().0
}
fn advance(&mut self, target: DocId) -> DocId {
while self.pos < self.docs.len() && self.docs[self.pos].0 < target {
self.pos += 1;
}
self.current().0
}
fn score(&mut self) -> f32 {
self.current().1
}
fn two_phase(&mut self) -> Option<&mut dyn TwoPhaseIterator> {
None
}
}
#[test]
fn filtered_scorer_skips_deleted() {
let scorer = VecScorer::new(vec![
(DocId::new(0), 1.0),
(DocId::new(1), 2.0),
(DocId::new(2), 3.0),
(DocId::new(3), 4.0),
]);
let mut deleted = RoaringBitmap::new();
deleted.insert(0);
deleted.insert(2);
let mut filtered = FilteredScorer::new(scorer, deleted);
assert_eq!(filtered.doc_id(), DocId::new(1));
assert_eq!(filtered.score(), 2.0);
assert_eq!(filtered.next(), DocId::new(3));
assert_eq!(filtered.score(), 4.0);
assert_eq!(filtered.next(), NO_MORE_DOCS);
}
#[test]
fn filtered_scorer_advance_past_deleted() {
let scorer = VecScorer::new(vec![
(DocId::new(0), 1.0),
(DocId::new(5), 2.0),
(DocId::new(10), 3.0),
(DocId::new(15), 4.0),
]);
let mut deleted = RoaringBitmap::new();
deleted.insert(5);
deleted.insert(10);
let mut filtered = FilteredScorer::new(scorer, deleted);
assert_eq!(filtered.doc_id(), DocId::new(0));
assert_eq!(filtered.advance(DocId::new(5)), DocId::new(15));
}
#[test]
fn filtered_scorer_all_deleted() {
let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
let mut deleted = RoaringBitmap::new();
deleted.insert(0);
deleted.insert(1);
let filtered = FilteredScorer::new(scorer, deleted);
assert_eq!(filtered.doc_id(), NO_MORE_DOCS);
}
#[test]
fn filtered_scorer_none_deleted() {
let scorer = VecScorer::new(vec![(DocId::new(0), 1.0), (DocId::new(1), 2.0)]);
let deleted = RoaringBitmap::new();
let mut filtered = FilteredScorer::new(scorer, deleted);
assert_eq!(filtered.doc_id(), DocId::new(0));
assert_eq!(filtered.next(), DocId::new(1));
assert_eq!(filtered.next(), NO_MORE_DOCS);
}
}