1use std::{collections::HashSet, ops::Range, sync::Arc};
5
6use arrow_array::BooleanArray;
7use deepsize::{Context, DeepSizeOf};
8use roaring::RoaringBitmap;
9
10const BITMAP_THRESDHOLD: usize = 5_000;
12#[derive(Debug, Clone, Default)]
16pub enum DeletionVector {
17 #[default]
18 NoDeletions,
19 Set(HashSet<u32>),
20 Bitmap(RoaringBitmap),
21}
22
23impl DeepSizeOf for DeletionVector {
24 fn deep_size_of_children(&self, context: &mut Context) -> usize {
25 match self {
26 Self::NoDeletions => 0,
27 Self::Set(set) => set.deep_size_of_children(context),
28 Self::Bitmap(bitmap) => bitmap.serialized_size(),
30 }
31 }
32}
33
34impl DeletionVector {
35 #[allow(dead_code)] pub fn len(&self) -> usize {
37 match self {
38 Self::NoDeletions => 0,
39 Self::Set(set) => set.len(),
40 Self::Bitmap(bitmap) => bitmap.len() as usize,
41 }
42 }
43
44 pub fn is_empty(&self) -> bool {
45 self.len() == 0
46 }
47
48 pub fn contains(&self, i: u32) -> bool {
49 match self {
50 Self::NoDeletions => false,
51 Self::Set(set) => set.contains(&i),
52 Self::Bitmap(bitmap) => bitmap.contains(i),
53 }
54 }
55
56 pub fn contains_range(&self, mut range: Range<u32>) -> bool {
57 match self {
58 Self::NoDeletions => range.is_empty(),
59 Self::Set(set) => range.all(|i| set.contains(&i)),
60 Self::Bitmap(bitmap) => bitmap.contains_range(range),
61 }
62 }
63
64 fn range_cardinality(&self, range: Range<u32>) -> u64 {
65 match self {
66 Self::NoDeletions => 0,
67 Self::Set(set) => range.fold(0, |acc, i| acc + set.contains(&i) as u64),
68 Self::Bitmap(bitmap) => bitmap.range_cardinality(range),
69 }
70 }
71
72 pub fn iter(&self) -> Box<dyn Iterator<Item = u32> + Send + '_> {
73 match self {
74 Self::NoDeletions => Box::new(std::iter::empty()),
75 Self::Set(set) => Box::new(set.iter().copied()),
76 Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
77 }
78 }
79
80 pub fn into_sorted_iter(self) -> Box<dyn Iterator<Item = u32> + Send + 'static> {
81 match self {
82 Self::NoDeletions => Box::new(std::iter::empty()),
83 Self::Set(set) => {
84 let mut values = Vec::from_iter(set);
87 values.sort();
88 Box::new(values.into_iter())
89 }
90 Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
92 }
93 }
94
95 pub fn to_sorted_iter<'a>(&'a self) -> Box<dyn Iterator<Item = u32> + Send + 'a> {
97 match self {
98 Self::NoDeletions => Box::new(std::iter::empty()),
99 Self::Set(_) => self.clone().into_sorted_iter(),
102 Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
103 }
104 }
105
106 pub fn build_predicate(&self, row_addrs: std::slice::Iter<u64>) -> Option<BooleanArray> {
110 match self {
111 Self::Bitmap(bitmap) => Some(
112 row_addrs
113 .map(|&id| !bitmap.contains(id as u32))
114 .collect::<Vec<_>>(),
115 ),
116 Self::Set(set) => Some(
117 row_addrs
118 .map(|&id| !set.contains(&(id as u32)))
119 .collect::<Vec<_>>(),
120 ),
121 Self::NoDeletions => None,
122 }
123 .map(BooleanArray::from)
124 }
125}
126
127pub struct OffsetMapper {
142 dv: Arc<DeletionVector>,
143 left: u32,
144 last_diff: u32,
145}
146
147impl OffsetMapper {
148 pub fn new(dv: Arc<DeletionVector>) -> Self {
149 Self {
150 dv,
151 left: 0,
152 last_diff: 0,
153 }
154 }
155
156 pub fn map_offset(&mut self, offset: u32) -> u32 {
157 let mut mid = offset + self.last_diff;
161 let mut right = offset + self.dv.len() as u32;
162 loop {
163 let deleted_in_range = self.dv.range_cardinality(0..(mid + 1)) as u32;
164 match mid.cmp(&(offset + deleted_in_range)) {
165 std::cmp::Ordering::Equal if !self.dv.contains(mid) => {
166 self.last_diff = mid - offset;
167 return mid;
168 }
169 std::cmp::Ordering::Less => {
170 assert_ne!(self.left, mid + 1);
171 self.left = mid + 1;
172 mid = self.left + (right - self.left) / 2;
173 }
174 std::cmp::Ordering::Greater | std::cmp::Ordering::Equal => {
178 right = mid;
179 mid = self.left + (right - self.left) / 2;
180 }
181 }
182 }
183 }
184}
185
186impl From<&DeletionVector> for RoaringBitmap {
187 fn from(value: &DeletionVector) -> Self {
188 match value {
189 DeletionVector::Bitmap(bitmap) => bitmap.clone(),
190 DeletionVector::Set(set) => Self::from_iter(set.iter()),
191 DeletionVector::NoDeletions => Self::new(),
192 }
193 }
194}
195
196impl PartialEq for DeletionVector {
197 fn eq(&self, other: &Self) -> bool {
198 match (self, other) {
199 (Self::NoDeletions, Self::NoDeletions) => true,
200 (Self::Set(set1), Self::Set(set2)) => set1 == set2,
201 (Self::Bitmap(bitmap1), Self::Bitmap(bitmap2)) => bitmap1 == bitmap2,
202 (Self::Set(set), Self::Bitmap(bitmap)) | (Self::Bitmap(bitmap), Self::Set(set)) => {
203 let set = set.iter().copied().collect::<RoaringBitmap>();
204 set == *bitmap
205 }
206 _ => false,
207 }
208 }
209}
210
211impl Extend<u32> for DeletionVector {
212 fn extend<T: IntoIterator<Item = u32>>(&mut self, iter: T) {
213 let iter = iter.into_iter();
214 *self = match (std::mem::take(self), iter.size_hint()) {
217 (Self::NoDeletions, (_, Some(0))) => Self::NoDeletions,
218 (Self::NoDeletions, (lower, _)) if lower >= BITMAP_THRESDHOLD => {
219 let bitmap = iter.collect::<RoaringBitmap>();
220 Self::Bitmap(bitmap)
221 }
222 (Self::NoDeletions, (_, Some(upper))) if upper < BITMAP_THRESDHOLD => {
223 let set = iter.collect::<HashSet<_>>();
224 Self::Set(set)
225 }
226 (Self::NoDeletions, _) => {
227 let set = iter.collect::<HashSet<_>>();
230 if set.len() > BITMAP_THRESDHOLD {
231 let bitmap = set.into_iter().collect::<RoaringBitmap>();
232 Self::Bitmap(bitmap)
233 } else {
234 Self::Set(set)
235 }
236 }
237 (Self::Set(mut set), _) => {
238 set.extend(iter);
239 if set.len() > BITMAP_THRESDHOLD {
240 let bitmap = set.drain().collect::<RoaringBitmap>();
241 Self::Bitmap(bitmap)
242 } else {
243 Self::Set(set)
244 }
245 }
246 (Self::Bitmap(mut bitmap), _) => {
247 bitmap.extend(iter);
248 Self::Bitmap(bitmap)
249 }
250 };
251 }
252}
253
254impl IntoIterator for DeletionVector {
260 type IntoIter = Box<dyn Iterator<Item = Self::Item> + Send>;
261 type Item = u32;
262
263 fn into_iter(self) -> Self::IntoIter {
264 match self {
265 Self::NoDeletions => Box::new(std::iter::empty()),
266 Self::Set(set) => {
267 let mut sorted = set.into_iter().collect::<Vec<_>>();
270 sorted.sort();
271 Box::new(sorted.into_iter())
272 }
273 Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
274 }
275 }
276}
277
278impl FromIterator<u32> for DeletionVector {
279 fn from_iter<T: IntoIterator<Item = u32>>(iter: T) -> Self {
280 let mut deletion_vector = Self::default();
281 deletion_vector.extend(iter);
282 deletion_vector
283 }
284}
285
286impl From<RoaringBitmap> for DeletionVector {
287 fn from(bitmap: RoaringBitmap) -> Self {
288 if bitmap.is_empty() {
289 Self::NoDeletions
290 } else {
291 Self::Bitmap(bitmap)
292 }
293 }
294}
295
296#[cfg(test)]
297#[cfg_attr(coverage, coverage(off))]
298mod test {
299 use super::*;
300 use deepsize::DeepSizeOf;
301 use rstest::rstest;
302
303 fn set_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
304 DeletionVector::Set(HashSet::from_iter(vals))
305 }
306 fn bitmap_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
307 DeletionVector::Bitmap(RoaringBitmap::from_iter(vals))
308 }
309
310 #[test]
311 fn test_set_bitmap_equality() {
312 assert_eq!(set_dv(0..100), bitmap_dv(0..100));
313 }
314
315 #[test]
316 fn test_threshold_promotes_to_bitmap() {
317 let dv = DeletionVector::from_iter(0..(BITMAP_THRESDHOLD as u32));
318 assert!(matches!(dv, DeletionVector::Bitmap(_)));
319 }
320
321 #[rstest]
322 #[case::middle_deletions(&[3, 5], &[0, 1, 2, 4, 6, 7, 8])]
323 #[case::start_deletions(&[0, 1, 2], &[3, 4, 5, 6, 7, 8, 9])]
324 fn test_map_offsets(#[case] deleted: &[u32], #[case] expected: &[u32]) {
325 let dv = DeletionVector::from_iter(deleted.iter().copied());
326 let mut mapper = OffsetMapper::new(Arc::new(dv));
327 let output: Vec<_> = (0..expected.len() as u32)
328 .map(|o| mapper.map_offset(o))
329 .collect();
330 assert_eq!(output, expected);
331 }
332
333 #[test]
334 fn test_deep_size_of() {
335 assert_eq!(
336 DeletionVector::NoDeletions.deep_size_of(),
337 std::mem::size_of::<DeletionVector>()
338 );
339 assert!(set_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
340 assert!(bitmap_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
341 }
342
343 #[rstest]
344 #[case::no_deletions(DeletionVector::NoDeletions, 0, true)]
345 #[case::set(set_dv([1, 2, 3]), 3, false)]
346 #[case::bitmap(bitmap_dv([1, 2, 3, 4, 5]), 5, false)]
347 fn test_len_is_empty(#[case] dv: DeletionVector, #[case] len: usize, #[case] empty: bool) {
348 assert_eq!(dv.len(), len);
349 assert_eq!(dv.is_empty(), empty);
350 }
351
352 #[rstest]
353 #[case::no_deletions(DeletionVector::NoDeletions, 1, false)]
354 #[case::set_contains(set_dv([1, 2, 3]), 1, true)]
355 #[case::set_missing(set_dv([1, 2, 3]), 0, false)]
356 #[case::bitmap_contains(bitmap_dv([10, 20, 30]), 10, true)]
357 #[case::bitmap_missing(bitmap_dv([10, 20, 30]), 5, false)]
358 fn test_contains(#[case] dv: DeletionVector, #[case] val: u32, #[case] expected: bool) {
359 assert_eq!(dv.contains(val), expected);
360 }
361
362 #[rstest]
363 #[case::no_del_empty_range(DeletionVector::NoDeletions, 0..0, true)]
364 #[case::no_del_non_empty(DeletionVector::NoDeletions, 0..1, false)]
365 #[case::set_full_range(set_dv([1, 2, 3]), 1..4, true)]
366 #[case::set_partial(set_dv([1, 2, 3]), 0..2, false)]
367 #[case::bitmap_full(bitmap_dv([10, 11, 12]), 10..13, true)]
368 #[case::bitmap_partial(bitmap_dv([10, 11, 12]), 9..11, false)]
369 fn test_contains_range(
370 #[case] dv: DeletionVector,
371 #[case] range: std::ops::Range<u32>,
372 #[case] expected: bool,
373 ) {
374 assert_eq!(dv.contains_range(range), expected);
375 }
376
377 #[test]
378 fn test_range_cardinality() {
379 assert_eq!(DeletionVector::NoDeletions.range_cardinality(0..100), 0);
380 let bm = bitmap_dv([5, 10, 15]);
381 assert_eq!(bm.range_cardinality(0..20), 3);
382 assert_eq!(bm.range_cardinality(6..14), 1);
383 }
384
385 #[rstest]
386 #[case::no_deletions(DeletionVector::NoDeletions, vec![])]
387 #[case::set(set_dv([3, 1, 2]), vec![1, 2, 3])]
388 #[case::bitmap(bitmap_dv([30, 10, 20]), vec![10, 20, 30])]
389 fn test_iterators(#[case] dv: DeletionVector, #[case] expected: Vec<u32>) {
390 let mut items: Vec<_> = dv.iter().collect();
392 items.sort();
393 assert_eq!(items, expected);
394
395 assert_eq!(dv.to_sorted_iter().collect::<Vec<_>>(), expected);
397
398 assert_eq!(dv.clone().into_sorted_iter().collect::<Vec<_>>(), expected);
400 assert_eq!(dv.into_iter().collect::<Vec<_>>(), expected);
401 }
402
403 #[test]
404 fn test_build_predicate() {
405 let addrs = [0u64, 1, 2, 3, 4];
406 assert!(DeletionVector::NoDeletions
407 .build_predicate(addrs.iter())
408 .is_none());
409
410 let pred = set_dv([1, 3]).build_predicate(addrs.iter()).unwrap();
411 assert_eq!(
412 pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
413 [true, false, true, false, true]
414 );
415
416 let pred = bitmap_dv([0, 2, 4]).build_predicate(addrs.iter()).unwrap();
417 assert_eq!(
418 pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
419 [false, true, false, true, false]
420 );
421 }
422
423 #[rstest]
424 #[case::no_deletions(DeletionVector::NoDeletions, 0)]
425 #[case::set(set_dv([1, 2, 3]), 3)]
426 #[case::bitmap(bitmap_dv([10, 20]), 2)]
427 fn test_to_roaring(#[case] dv: DeletionVector, #[case] len: u64) {
428 let bitmap: RoaringBitmap = (&dv).into();
429 assert_eq!(bitmap.len(), len);
430 }
431
432 #[test]
433 fn test_partial_eq() {
434 assert_eq!(DeletionVector::NoDeletions, DeletionVector::NoDeletions);
435 assert_eq!(set_dv([1, 2, 3]), set_dv([1, 2, 3]));
436 assert_eq!(bitmap_dv([1, 2, 3]), bitmap_dv([1, 2, 3]));
437 assert_eq!(set_dv([5, 6, 7]), bitmap_dv([5, 6, 7])); assert_eq!(bitmap_dv([5, 6, 7]), set_dv([5, 6, 7])); assert_ne!(DeletionVector::NoDeletions, set_dv([1]));
440 assert_ne!(DeletionVector::NoDeletions, bitmap_dv([1]));
441 }
442
443 #[test]
444 fn test_extend() {
445 let mut dv = DeletionVector::NoDeletions;
447 dv.extend(std::iter::empty::<u32>());
448 assert!(matches!(dv, DeletionVector::NoDeletions));
449
450 let mut dv = DeletionVector::NoDeletions;
452 dv.extend(std::iter::from_fn({
453 let mut i = 0u32;
454 move || {
455 i += 1;
456 (i <= 10).then_some(i - 1)
457 }
458 }));
459 assert!(matches!(dv, DeletionVector::Set(_)));
460
461 let mut dv = DeletionVector::NoDeletions;
463 dv.extend((0u32..10_000).filter(|_| true));
464 assert!(matches!(dv, DeletionVector::Bitmap(_)));
465
466 let mut dv = set_dv([1, 2, 3]);
468 dv.extend([4, 5, 6]);
469 assert!(matches!(dv, DeletionVector::Set(_)) && dv.len() == 6);
470
471 let mut dv = set_dv([1, 2, 3]);
473 dv.extend(100..(BITMAP_THRESDHOLD as u32 + 100));
474 assert!(matches!(dv, DeletionVector::Bitmap(_)));
475
476 let mut dv = bitmap_dv([1, 2, 3]);
478 dv.extend([4, 5, 6]);
479 assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 6);
480 }
481
482 #[test]
483 fn test_from_roaring() {
484 let dv: DeletionVector = RoaringBitmap::new().into();
485 assert!(matches!(dv, DeletionVector::NoDeletions));
486
487 let dv: DeletionVector = RoaringBitmap::from_iter([1, 2, 3]).into();
488 assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 3);
489 }
490
491 #[test]
492 fn test_map_offset_dense_then_sparse() {
493 let mut deleted = Vec::new();
496 for i in 0..500u32 {
498 if i % 5 != 0 {
499 deleted.push(i);
500 }
501 }
502 for i in 500..1000u32 {
504 if i % 5 == 0 {
505 deleted.push(i);
506 }
507 }
508 let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(deleted));
509 let mut mapper = OffsetMapper::new(Arc::new(dv));
510
511 assert_eq!(mapper.map_offset(0), 0);
513 assert_eq!(mapper.map_offset(1), 5);
514 assert_eq!(mapper.map_offset(99), 495);
515
516 assert_eq!(mapper.map_offset(100), 501);
520 }
521}