1use std::{collections::HashSet, ops::Range, sync::Arc};
5
6use arrow_array::BooleanArray;
7use deepsize::{Context, DeepSizeOf};
8use roaring::RoaringBitmap;
9
10const BITMAP_THRESDHOLD: usize = 5_000;
12#[derive(Debug, Clone, Default)]
16pub enum DeletionVector {
17 #[default]
18 NoDeletions,
19 Set(HashSet<u32>),
20 Bitmap(RoaringBitmap),
21}
22
23impl DeepSizeOf for DeletionVector {
24 fn deep_size_of_children(&self, context: &mut Context) -> usize {
25 match self {
26 Self::NoDeletions => 0,
27 Self::Set(set) => set.deep_size_of_children(context),
28 Self::Bitmap(bitmap) => bitmap.serialized_size(),
30 }
31 }
32}
33
34impl DeletionVector {
35 pub fn len(&self) -> usize {
36 match self {
37 Self::NoDeletions => 0,
38 Self::Set(set) => set.len(),
39 Self::Bitmap(bitmap) => bitmap.len() as usize,
40 }
41 }
42
43 pub fn is_empty(&self) -> bool {
44 self.len() == 0
45 }
46
47 pub fn contains(&self, i: u32) -> bool {
48 match self {
49 Self::NoDeletions => false,
50 Self::Set(set) => set.contains(&i),
51 Self::Bitmap(bitmap) => bitmap.contains(i),
52 }
53 }
54
55 pub fn contains_range(&self, mut range: Range<u32>) -> bool {
56 match self {
57 Self::NoDeletions => range.is_empty(),
58 Self::Set(set) => range.all(|i| set.contains(&i)),
59 Self::Bitmap(bitmap) => bitmap.contains_range(range),
60 }
61 }
62
63 fn range_cardinality(&self, range: Range<u32>) -> u64 {
64 match self {
65 Self::NoDeletions => 0,
66 Self::Set(set) => range.fold(0, |acc, i| acc + set.contains(&i) as u64),
67 Self::Bitmap(bitmap) => bitmap.range_cardinality(range),
68 }
69 }
70
71 pub fn iter(&self) -> Box<dyn Iterator<Item = u32> + Send + '_> {
72 match self {
73 Self::NoDeletions => Box::new(std::iter::empty()),
74 Self::Set(set) => Box::new(set.iter().copied()),
75 Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
76 }
77 }
78
79 pub fn into_sorted_iter(self) -> Box<dyn Iterator<Item = u32> + Send + 'static> {
80 match self {
81 Self::NoDeletions => Box::new(std::iter::empty()),
82 Self::Set(set) => {
83 let mut values = Vec::from_iter(set);
86 values.sort();
87 Box::new(values.into_iter())
88 }
89 Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
91 }
92 }
93
94 pub fn to_sorted_iter<'a>(&'a self) -> Box<dyn Iterator<Item = u32> + Send + 'a> {
96 match self {
97 Self::NoDeletions => Box::new(std::iter::empty()),
98 Self::Set(_) => self.clone().into_sorted_iter(),
101 Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
102 }
103 }
104
105 pub fn build_predicate(&self, row_addrs: std::slice::Iter<u64>) -> Option<BooleanArray> {
109 match self {
110 Self::Bitmap(bitmap) => Some(
111 row_addrs
112 .map(|&id| !bitmap.contains(id as u32))
113 .collect::<Vec<_>>(),
114 ),
115 Self::Set(set) => Some(
116 row_addrs
117 .map(|&id| !set.contains(&(id as u32)))
118 .collect::<Vec<_>>(),
119 ),
120 Self::NoDeletions => None,
121 }
122 .map(BooleanArray::from)
123 }
124}
125
126pub struct OffsetMapper {
141 dv: Arc<DeletionVector>,
142 left: u32,
143 last_diff: u32,
144}
145
146impl OffsetMapper {
147 pub fn new(dv: Arc<DeletionVector>) -> Self {
148 Self {
149 dv,
150 left: 0,
151 last_diff: 0,
152 }
153 }
154
155 pub fn map_offset(&mut self, offset: u32) -> u32 {
156 let mut mid = offset + self.last_diff;
160 let mut right = offset + self.dv.len() as u32;
161 loop {
162 let deleted_in_range = self.dv.range_cardinality(0..(mid + 1)) as u32;
163 match mid.cmp(&(offset + deleted_in_range)) {
164 std::cmp::Ordering::Equal if !self.dv.contains(mid) => {
165 self.last_diff = mid - offset;
166 return mid;
167 }
168 std::cmp::Ordering::Less => {
169 assert_ne!(self.left, mid + 1);
170 self.left = mid + 1;
171 mid = self.left + (right - self.left) / 2;
172 }
173 std::cmp::Ordering::Greater | std::cmp::Ordering::Equal => {
177 right = mid;
178 mid = self.left + (right - self.left) / 2;
179 }
180 }
181 }
182 }
183}
184
185impl From<&DeletionVector> for RoaringBitmap {
186 fn from(value: &DeletionVector) -> Self {
187 match value {
188 DeletionVector::Bitmap(bitmap) => bitmap.clone(),
189 DeletionVector::Set(set) => Self::from_iter(set.iter()),
190 DeletionVector::NoDeletions => Self::new(),
191 }
192 }
193}
194
195impl PartialEq for DeletionVector {
196 fn eq(&self, other: &Self) -> bool {
197 match (self, other) {
198 (Self::NoDeletions, Self::NoDeletions) => true,
199 (Self::Set(set1), Self::Set(set2)) => set1 == set2,
200 (Self::Bitmap(bitmap1), Self::Bitmap(bitmap2)) => bitmap1 == bitmap2,
201 (Self::Set(set), Self::Bitmap(bitmap)) | (Self::Bitmap(bitmap), Self::Set(set)) => {
202 let set = set.iter().copied().collect::<RoaringBitmap>();
203 set == *bitmap
204 }
205 _ => false,
206 }
207 }
208}
209
210impl Extend<u32> for DeletionVector {
211 fn extend<T: IntoIterator<Item = u32>>(&mut self, iter: T) {
212 let iter = iter.into_iter();
213 *self = match (std::mem::take(self), iter.size_hint()) {
216 (Self::NoDeletions, (_, Some(0))) => Self::NoDeletions,
217 (Self::NoDeletions, (lower, _)) if lower >= BITMAP_THRESDHOLD => {
218 let bitmap = iter.collect::<RoaringBitmap>();
219 Self::Bitmap(bitmap)
220 }
221 (Self::NoDeletions, (_, Some(upper))) if upper < BITMAP_THRESDHOLD => {
222 let set = iter.collect::<HashSet<_>>();
223 Self::Set(set)
224 }
225 (Self::NoDeletions, _) => {
226 let set = iter.collect::<HashSet<_>>();
229 if set.len() > BITMAP_THRESDHOLD {
230 let bitmap = set.into_iter().collect::<RoaringBitmap>();
231 Self::Bitmap(bitmap)
232 } else {
233 Self::Set(set)
234 }
235 }
236 (Self::Set(mut set), _) => {
237 set.extend(iter);
238 if set.len() > BITMAP_THRESDHOLD {
239 let bitmap = set.drain().collect::<RoaringBitmap>();
240 Self::Bitmap(bitmap)
241 } else {
242 Self::Set(set)
243 }
244 }
245 (Self::Bitmap(mut bitmap), _) => {
246 bitmap.extend(iter);
247 Self::Bitmap(bitmap)
248 }
249 };
250 }
251}
252
253impl IntoIterator for DeletionVector {
259 type IntoIter = Box<dyn Iterator<Item = Self::Item> + Send>;
260 type Item = u32;
261
262 fn into_iter(self) -> Self::IntoIter {
263 match self {
264 Self::NoDeletions => Box::new(std::iter::empty()),
265 Self::Set(set) => {
266 let mut sorted = set.into_iter().collect::<Vec<_>>();
269 sorted.sort();
270 Box::new(sorted.into_iter())
271 }
272 Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
273 }
274 }
275}
276
277impl FromIterator<u32> for DeletionVector {
278 fn from_iter<T: IntoIterator<Item = u32>>(iter: T) -> Self {
279 let mut deletion_vector = Self::default();
280 deletion_vector.extend(iter);
281 deletion_vector
282 }
283}
284
285impl From<RoaringBitmap> for DeletionVector {
286 fn from(bitmap: RoaringBitmap) -> Self {
287 if bitmap.is_empty() {
288 Self::NoDeletions
289 } else {
290 Self::Bitmap(bitmap)
291 }
292 }
293}
294
295#[cfg(test)]
296#[cfg_attr(coverage, coverage(off))]
297mod test {
298 use super::*;
299 use deepsize::DeepSizeOf;
300 use rstest::rstest;
301
302 fn set_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
303 DeletionVector::Set(HashSet::from_iter(vals))
304 }
305 fn bitmap_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
306 DeletionVector::Bitmap(RoaringBitmap::from_iter(vals))
307 }
308
309 #[test]
310 fn test_set_bitmap_equality() {
311 assert_eq!(set_dv(0..100), bitmap_dv(0..100));
312 }
313
314 #[test]
315 fn test_threshold_promotes_to_bitmap() {
316 let dv = DeletionVector::from_iter(0..(BITMAP_THRESDHOLD as u32));
317 assert!(matches!(dv, DeletionVector::Bitmap(_)));
318 }
319
320 #[rstest]
321 #[case::middle_deletions(&[3, 5], &[0, 1, 2, 4, 6, 7, 8])]
322 #[case::start_deletions(&[0, 1, 2], &[3, 4, 5, 6, 7, 8, 9])]
323 fn test_map_offsets(#[case] deleted: &[u32], #[case] expected: &[u32]) {
324 let dv = DeletionVector::from_iter(deleted.iter().copied());
325 let mut mapper = OffsetMapper::new(Arc::new(dv));
326 let output: Vec<_> = (0..expected.len() as u32)
327 .map(|o| mapper.map_offset(o))
328 .collect();
329 assert_eq!(output, expected);
330 }
331
332 #[test]
333 fn test_deep_size_of() {
334 assert_eq!(
335 DeletionVector::NoDeletions.deep_size_of(),
336 std::mem::size_of::<DeletionVector>()
337 );
338 assert!(set_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
339 assert!(bitmap_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
340 }
341
342 #[rstest]
343 #[case::no_deletions(DeletionVector::NoDeletions, 0, true)]
344 #[case::set(set_dv([1, 2, 3]), 3, false)]
345 #[case::bitmap(bitmap_dv([1, 2, 3, 4, 5]), 5, false)]
346 fn test_len_is_empty(#[case] dv: DeletionVector, #[case] len: usize, #[case] empty: bool) {
347 assert_eq!(dv.len(), len);
348 assert_eq!(dv.is_empty(), empty);
349 }
350
351 #[rstest]
352 #[case::no_deletions(DeletionVector::NoDeletions, 1, false)]
353 #[case::set_contains(set_dv([1, 2, 3]), 1, true)]
354 #[case::set_missing(set_dv([1, 2, 3]), 0, false)]
355 #[case::bitmap_contains(bitmap_dv([10, 20, 30]), 10, true)]
356 #[case::bitmap_missing(bitmap_dv([10, 20, 30]), 5, false)]
357 fn test_contains(#[case] dv: DeletionVector, #[case] val: u32, #[case] expected: bool) {
358 assert_eq!(dv.contains(val), expected);
359 }
360
361 #[rstest]
362 #[case::no_del_empty_range(DeletionVector::NoDeletions, 0..0, true)]
363 #[case::no_del_non_empty(DeletionVector::NoDeletions, 0..1, false)]
364 #[case::set_full_range(set_dv([1, 2, 3]), 1..4, true)]
365 #[case::set_partial(set_dv([1, 2, 3]), 0..2, false)]
366 #[case::bitmap_full(bitmap_dv([10, 11, 12]), 10..13, true)]
367 #[case::bitmap_partial(bitmap_dv([10, 11, 12]), 9..11, false)]
368 fn test_contains_range(
369 #[case] dv: DeletionVector,
370 #[case] range: std::ops::Range<u32>,
371 #[case] expected: bool,
372 ) {
373 assert_eq!(dv.contains_range(range), expected);
374 }
375
376 #[test]
377 fn test_range_cardinality() {
378 assert_eq!(DeletionVector::NoDeletions.range_cardinality(0..100), 0);
379 let bm = bitmap_dv([5, 10, 15]);
380 assert_eq!(bm.range_cardinality(0..20), 3);
381 assert_eq!(bm.range_cardinality(6..14), 1);
382 }
383
384 #[rstest]
385 #[case::no_deletions(DeletionVector::NoDeletions, vec![])]
386 #[case::set(set_dv([3, 1, 2]), vec![1, 2, 3])]
387 #[case::bitmap(bitmap_dv([30, 10, 20]), vec![10, 20, 30])]
388 fn test_iterators(#[case] dv: DeletionVector, #[case] expected: Vec<u32>) {
389 let mut items: Vec<_> = dv.iter().collect();
391 items.sort();
392 assert_eq!(items, expected);
393
394 assert_eq!(dv.to_sorted_iter().collect::<Vec<_>>(), expected);
396
397 assert_eq!(dv.clone().into_sorted_iter().collect::<Vec<_>>(), expected);
399 assert_eq!(dv.into_iter().collect::<Vec<_>>(), expected);
400 }
401
402 #[test]
403 fn test_build_predicate() {
404 let addrs = [0u64, 1, 2, 3, 4];
405 assert!(
406 DeletionVector::NoDeletions
407 .build_predicate(addrs.iter())
408 .is_none()
409 );
410
411 let pred = set_dv([1, 3]).build_predicate(addrs.iter()).unwrap();
412 assert_eq!(
413 pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
414 [true, false, true, false, true]
415 );
416
417 let pred = bitmap_dv([0, 2, 4]).build_predicate(addrs.iter()).unwrap();
418 assert_eq!(
419 pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
420 [false, true, false, true, false]
421 );
422 }
423
424 #[rstest]
425 #[case::no_deletions(DeletionVector::NoDeletions, 0)]
426 #[case::set(set_dv([1, 2, 3]), 3)]
427 #[case::bitmap(bitmap_dv([10, 20]), 2)]
428 fn test_to_roaring(#[case] dv: DeletionVector, #[case] len: u64) {
429 let bitmap: RoaringBitmap = (&dv).into();
430 assert_eq!(bitmap.len(), len);
431 }
432
433 #[test]
434 fn test_partial_eq() {
435 assert_eq!(DeletionVector::NoDeletions, DeletionVector::NoDeletions);
436 assert_eq!(set_dv([1, 2, 3]), set_dv([1, 2, 3]));
437 assert_eq!(bitmap_dv([1, 2, 3]), bitmap_dv([1, 2, 3]));
438 assert_eq!(set_dv([5, 6, 7]), bitmap_dv([5, 6, 7])); assert_eq!(bitmap_dv([5, 6, 7]), set_dv([5, 6, 7])); assert_ne!(DeletionVector::NoDeletions, set_dv([1]));
441 assert_ne!(DeletionVector::NoDeletions, bitmap_dv([1]));
442 }
443
444 #[test]
445 fn test_extend() {
446 let mut dv = DeletionVector::NoDeletions;
448 dv.extend(std::iter::empty::<u32>());
449 assert!(matches!(dv, DeletionVector::NoDeletions));
450
451 let mut dv = DeletionVector::NoDeletions;
453 dv.extend(std::iter::from_fn({
454 let mut i = 0u32;
455 move || {
456 i += 1;
457 (i <= 10).then_some(i - 1)
458 }
459 }));
460 assert!(matches!(dv, DeletionVector::Set(_)));
461
462 let mut dv = DeletionVector::NoDeletions;
464 dv.extend((0u32..10_000).filter(|_| true));
465 assert!(matches!(dv, DeletionVector::Bitmap(_)));
466
467 let mut dv = set_dv([1, 2, 3]);
469 dv.extend([4, 5, 6]);
470 assert!(matches!(dv, DeletionVector::Set(_)) && dv.len() == 6);
471
472 let mut dv = set_dv([1, 2, 3]);
474 dv.extend(100..(BITMAP_THRESDHOLD as u32 + 100));
475 assert!(matches!(dv, DeletionVector::Bitmap(_)));
476
477 let mut dv = bitmap_dv([1, 2, 3]);
479 dv.extend([4, 5, 6]);
480 assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 6);
481 }
482
483 #[test]
484 fn test_from_roaring() {
485 let dv: DeletionVector = RoaringBitmap::new().into();
486 assert!(matches!(dv, DeletionVector::NoDeletions));
487
488 let dv: DeletionVector = RoaringBitmap::from_iter([1, 2, 3]).into();
489 assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 3);
490 }
491
492 #[test]
493 fn test_map_offset_dense_then_sparse() {
494 let mut deleted = Vec::new();
497 for i in 0..500u32 {
499 if i % 5 != 0 {
500 deleted.push(i);
501 }
502 }
503 for i in 500..1000u32 {
505 if i % 5 == 0 {
506 deleted.push(i);
507 }
508 }
509 let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(deleted));
510 let mut mapper = OffsetMapper::new(Arc::new(dv));
511
512 assert_eq!(mapper.map_offset(0), 0);
514 assert_eq!(mapper.map_offset(1), 5);
515 assert_eq!(mapper.map_offset(99), 495);
516
517 assert_eq!(mapper.map_offset(100), 501);
521 }
522}