Skip to main content

lance_core/utils/
deletion.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashSet, ops::Range, sync::Arc};
5
6use arrow_array::BooleanArray;
7use deepsize::{Context, DeepSizeOf};
8use roaring::RoaringBitmap;
9
10/// Threshold for when a DeletionVector::Set should be promoted to a DeletionVector::Bitmap.
11const BITMAP_THRESDHOLD: usize = 5_000;
12// TODO: Benchmark to find a better value.
13
14/// Represents a set of deleted row offsets in a single fragment.
15#[derive(Debug, Clone, Default)]
16pub enum DeletionVector {
17    #[default]
18    NoDeletions,
19    Set(HashSet<u32>),
20    Bitmap(RoaringBitmap),
21}
22
23impl DeepSizeOf for DeletionVector {
24    fn deep_size_of_children(&self, context: &mut Context) -> usize {
25        match self {
26            Self::NoDeletions => 0,
27            Self::Set(set) => set.deep_size_of_children(context),
28            // Inexact but probably close enough
29            Self::Bitmap(bitmap) => bitmap.serialized_size(),
30        }
31    }
32}
33
34impl DeletionVector {
35    #[allow(dead_code)] // Used in tests
36    pub fn len(&self) -> usize {
37        match self {
38            Self::NoDeletions => 0,
39            Self::Set(set) => set.len(),
40            Self::Bitmap(bitmap) => bitmap.len() as usize,
41        }
42    }
43
44    pub fn is_empty(&self) -> bool {
45        self.len() == 0
46    }
47
48    pub fn contains(&self, i: u32) -> bool {
49        match self {
50            Self::NoDeletions => false,
51            Self::Set(set) => set.contains(&i),
52            Self::Bitmap(bitmap) => bitmap.contains(i),
53        }
54    }
55
56    pub fn contains_range(&self, mut range: Range<u32>) -> bool {
57        match self {
58            Self::NoDeletions => range.is_empty(),
59            Self::Set(set) => range.all(|i| set.contains(&i)),
60            Self::Bitmap(bitmap) => bitmap.contains_range(range),
61        }
62    }
63
64    fn range_cardinality(&self, range: Range<u32>) -> u64 {
65        match self {
66            Self::NoDeletions => 0,
67            Self::Set(set) => range.fold(0, |acc, i| acc + set.contains(&i) as u64),
68            Self::Bitmap(bitmap) => bitmap.range_cardinality(range),
69        }
70    }
71
72    pub fn iter(&self) -> Box<dyn Iterator<Item = u32> + Send + '_> {
73        match self {
74            Self::NoDeletions => Box::new(std::iter::empty()),
75            Self::Set(set) => Box::new(set.iter().copied()),
76            Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
77        }
78    }
79
80    pub fn into_sorted_iter(self) -> Box<dyn Iterator<Item = u32> + Send + 'static> {
81        match self {
82            Self::NoDeletions => Box::new(std::iter::empty()),
83            Self::Set(set) => {
84                // If we're using a set we shouldn't have too many values
85                // and so this conversion should be affordable.
86                let mut values = Vec::from_iter(set);
87                values.sort();
88                Box::new(values.into_iter())
89            }
90            // Bitmaps always iterate in sorted order
91            Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
92        }
93    }
94
95    /// Create an iterator that iterates over the values in the deletion vector in sorted order.
96    pub fn to_sorted_iter<'a>(&'a self) -> Box<dyn Iterator<Item = u32> + Send + 'a> {
97        match self {
98            Self::NoDeletions => Box::new(std::iter::empty()),
99            // We have to make a clone when we're using a set
100            // but sets should be relatively small.
101            Self::Set(_) => self.clone().into_sorted_iter(),
102            Self::Bitmap(bitmap) => Box::new(bitmap.iter()),
103        }
104    }
105
106    // Note: deletion vectors are based on 32-bit offsets.  However, this function works
107    // even when given 64-bit row addresses.  That is because `id as u32` returns the lower
108    // 32 bits (the row offset) and the upper 32 bits are ignored.
109    pub fn build_predicate(&self, row_addrs: std::slice::Iter<u64>) -> Option<BooleanArray> {
110        match self {
111            Self::Bitmap(bitmap) => Some(
112                row_addrs
113                    .map(|&id| !bitmap.contains(id as u32))
114                    .collect::<Vec<_>>(),
115            ),
116            Self::Set(set) => Some(
117                row_addrs
118                    .map(|&id| !set.contains(&(id as u32)))
119                    .collect::<Vec<_>>(),
120            ),
121            Self::NoDeletions => None,
122        }
123        .map(BooleanArray::from)
124    }
125}
126
127/// Maps a naive offset into a fragment to the local row offset that is
128/// not deleted.
129///
130/// For example, if the deletion vector is [0, 1, 2], then the mapping
131/// would be:
132///
133/// - 0 -> 3
134/// - 1 -> 4
135/// - 2 -> 5
136///
137/// and so on.
138///
139/// This expects a monotonically increasing sequence of input offsets. State
140/// is re-used between calls to `map_offset` to make the mapping more efficient.
141pub struct OffsetMapper {
142    dv: Arc<DeletionVector>,
143    left: u32,
144    last_diff: u32,
145}
146
147impl OffsetMapper {
148    pub fn new(dv: Arc<DeletionVector>) -> Self {
149        Self {
150            dv,
151            left: 0,
152            last_diff: 0,
153        }
154    }
155
156    pub fn map_offset(&mut self, offset: u32) -> u32 {
157        // The best initial guess is the offset + last diff. That's the right
158        // answer if there are no deletions in the range between the last
159        // offset and the current one.
160        let mut mid = offset + self.last_diff;
161        let mut right = offset + self.dv.len() as u32;
162        loop {
163            let deleted_in_range = self.dv.range_cardinality(0..(mid + 1)) as u32;
164            match mid.cmp(&(offset + deleted_in_range)) {
165                std::cmp::Ordering::Equal if !self.dv.contains(mid) => {
166                    self.last_diff = mid - offset;
167                    return mid;
168                }
169                std::cmp::Ordering::Less => {
170                    assert_ne!(self.left, mid + 1);
171                    self.left = mid + 1;
172                    mid = self.left + (right - self.left) / 2;
173                }
174                // Binary search left when the guess overshoots. This can happen when:
175                // - Greater: last_diff was calibrated for a denser deletion region
176                // - Equal with deleted mid: the guess lands exactly on a deleted row
177                std::cmp::Ordering::Greater | std::cmp::Ordering::Equal => {
178                    right = mid;
179                    mid = self.left + (right - self.left) / 2;
180                }
181            }
182        }
183    }
184}
185
186impl From<&DeletionVector> for RoaringBitmap {
187    fn from(value: &DeletionVector) -> Self {
188        match value {
189            DeletionVector::Bitmap(bitmap) => bitmap.clone(),
190            DeletionVector::Set(set) => Self::from_iter(set.iter()),
191            DeletionVector::NoDeletions => Self::new(),
192        }
193    }
194}
195
196impl PartialEq for DeletionVector {
197    fn eq(&self, other: &Self) -> bool {
198        match (self, other) {
199            (Self::NoDeletions, Self::NoDeletions) => true,
200            (Self::Set(set1), Self::Set(set2)) => set1 == set2,
201            (Self::Bitmap(bitmap1), Self::Bitmap(bitmap2)) => bitmap1 == bitmap2,
202            (Self::Set(set), Self::Bitmap(bitmap)) | (Self::Bitmap(bitmap), Self::Set(set)) => {
203                let set = set.iter().copied().collect::<RoaringBitmap>();
204                set == *bitmap
205            }
206            _ => false,
207        }
208    }
209}
210
211impl Extend<u32> for DeletionVector {
212    fn extend<T: IntoIterator<Item = u32>>(&mut self, iter: T) {
213        let iter = iter.into_iter();
214        // The mem::replace allows changing the variant of Self when we only
215        // have &mut Self.
216        *self = match (std::mem::take(self), iter.size_hint()) {
217            (Self::NoDeletions, (_, Some(0))) => Self::NoDeletions,
218            (Self::NoDeletions, (lower, _)) if lower >= BITMAP_THRESDHOLD => {
219                let bitmap = iter.collect::<RoaringBitmap>();
220                Self::Bitmap(bitmap)
221            }
222            (Self::NoDeletions, (_, Some(upper))) if upper < BITMAP_THRESDHOLD => {
223                let set = iter.collect::<HashSet<_>>();
224                Self::Set(set)
225            }
226            (Self::NoDeletions, _) => {
227                // We don't know the size, so just try as a set and move to bitmap
228                // if it ends up being big.
229                let set = iter.collect::<HashSet<_>>();
230                if set.len() > BITMAP_THRESDHOLD {
231                    let bitmap = set.into_iter().collect::<RoaringBitmap>();
232                    Self::Bitmap(bitmap)
233                } else {
234                    Self::Set(set)
235                }
236            }
237            (Self::Set(mut set), _) => {
238                set.extend(iter);
239                if set.len() > BITMAP_THRESDHOLD {
240                    let bitmap = set.drain().collect::<RoaringBitmap>();
241                    Self::Bitmap(bitmap)
242                } else {
243                    Self::Set(set)
244                }
245            }
246            (Self::Bitmap(mut bitmap), _) => {
247                bitmap.extend(iter);
248                Self::Bitmap(bitmap)
249            }
250        };
251    }
252}
253
254// TODO: impl methods for DeletionVector
255/// impl DeletionVector {
256///     pub fn get(i: u32) -> bool { ... }
257/// }
258/// impl BitAnd for DeletionVector { ... }
259impl IntoIterator for DeletionVector {
260    type IntoIter = Box<dyn Iterator<Item = Self::Item> + Send>;
261    type Item = u32;
262
263    fn into_iter(self) -> Self::IntoIter {
264        match self {
265            Self::NoDeletions => Box::new(std::iter::empty()),
266            Self::Set(set) => {
267                // In many cases, it's much better if this is sorted. It's
268                // guaranteed to be small, so the cost is low.
269                let mut sorted = set.into_iter().collect::<Vec<_>>();
270                sorted.sort();
271                Box::new(sorted.into_iter())
272            }
273            Self::Bitmap(bitmap) => Box::new(bitmap.into_iter()),
274        }
275    }
276}
277
278impl FromIterator<u32> for DeletionVector {
279    fn from_iter<T: IntoIterator<Item = u32>>(iter: T) -> Self {
280        let mut deletion_vector = Self::default();
281        deletion_vector.extend(iter);
282        deletion_vector
283    }
284}
285
286impl From<RoaringBitmap> for DeletionVector {
287    fn from(bitmap: RoaringBitmap) -> Self {
288        if bitmap.is_empty() {
289            Self::NoDeletions
290        } else {
291            Self::Bitmap(bitmap)
292        }
293    }
294}
295
296#[cfg(test)]
297#[cfg_attr(coverage, coverage(off))]
298mod test {
299    use super::*;
300    use deepsize::DeepSizeOf;
301    use rstest::rstest;
302
303    fn set_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
304        DeletionVector::Set(HashSet::from_iter(vals))
305    }
306    fn bitmap_dv(vals: impl IntoIterator<Item = u32>) -> DeletionVector {
307        DeletionVector::Bitmap(RoaringBitmap::from_iter(vals))
308    }
309
310    #[test]
311    fn test_set_bitmap_equality() {
312        assert_eq!(set_dv(0..100), bitmap_dv(0..100));
313    }
314
315    #[test]
316    fn test_threshold_promotes_to_bitmap() {
317        let dv = DeletionVector::from_iter(0..(BITMAP_THRESDHOLD as u32));
318        assert!(matches!(dv, DeletionVector::Bitmap(_)));
319    }
320
321    #[rstest]
322    #[case::middle_deletions(&[3, 5], &[0, 1, 2, 4, 6, 7, 8])]
323    #[case::start_deletions(&[0, 1, 2], &[3, 4, 5, 6, 7, 8, 9])]
324    fn test_map_offsets(#[case] deleted: &[u32], #[case] expected: &[u32]) {
325        let dv = DeletionVector::from_iter(deleted.iter().copied());
326        let mut mapper = OffsetMapper::new(Arc::new(dv));
327        let output: Vec<_> = (0..expected.len() as u32)
328            .map(|o| mapper.map_offset(o))
329            .collect();
330        assert_eq!(output, expected);
331    }
332
333    #[test]
334    fn test_deep_size_of() {
335        assert_eq!(
336            DeletionVector::NoDeletions.deep_size_of(),
337            std::mem::size_of::<DeletionVector>()
338        );
339        assert!(set_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
340        assert!(bitmap_dv([1, 2, 3]).deep_size_of() > std::mem::size_of::<DeletionVector>());
341    }
342
343    #[rstest]
344    #[case::no_deletions(DeletionVector::NoDeletions, 0, true)]
345    #[case::set(set_dv([1, 2, 3]), 3, false)]
346    #[case::bitmap(bitmap_dv([1, 2, 3, 4, 5]), 5, false)]
347    fn test_len_is_empty(#[case] dv: DeletionVector, #[case] len: usize, #[case] empty: bool) {
348        assert_eq!(dv.len(), len);
349        assert_eq!(dv.is_empty(), empty);
350    }
351
352    #[rstest]
353    #[case::no_deletions(DeletionVector::NoDeletions, 1, false)]
354    #[case::set_contains(set_dv([1, 2, 3]), 1, true)]
355    #[case::set_missing(set_dv([1, 2, 3]), 0, false)]
356    #[case::bitmap_contains(bitmap_dv([10, 20, 30]), 10, true)]
357    #[case::bitmap_missing(bitmap_dv([10, 20, 30]), 5, false)]
358    fn test_contains(#[case] dv: DeletionVector, #[case] val: u32, #[case] expected: bool) {
359        assert_eq!(dv.contains(val), expected);
360    }
361
362    #[rstest]
363    #[case::no_del_empty_range(DeletionVector::NoDeletions, 0..0, true)]
364    #[case::no_del_non_empty(DeletionVector::NoDeletions, 0..1, false)]
365    #[case::set_full_range(set_dv([1, 2, 3]), 1..4, true)]
366    #[case::set_partial(set_dv([1, 2, 3]), 0..2, false)]
367    #[case::bitmap_full(bitmap_dv([10, 11, 12]), 10..13, true)]
368    #[case::bitmap_partial(bitmap_dv([10, 11, 12]), 9..11, false)]
369    fn test_contains_range(
370        #[case] dv: DeletionVector,
371        #[case] range: std::ops::Range<u32>,
372        #[case] expected: bool,
373    ) {
374        assert_eq!(dv.contains_range(range), expected);
375    }
376
377    #[test]
378    fn test_range_cardinality() {
379        assert_eq!(DeletionVector::NoDeletions.range_cardinality(0..100), 0);
380        let bm = bitmap_dv([5, 10, 15]);
381        assert_eq!(bm.range_cardinality(0..20), 3);
382        assert_eq!(bm.range_cardinality(6..14), 1);
383    }
384
385    #[rstest]
386    #[case::no_deletions(DeletionVector::NoDeletions, vec![])]
387    #[case::set(set_dv([3, 1, 2]), vec![1, 2, 3])]
388    #[case::bitmap(bitmap_dv([30, 10, 20]), vec![10, 20, 30])]
389    fn test_iterators(#[case] dv: DeletionVector, #[case] expected: Vec<u32>) {
390        // Test iter()
391        let mut items: Vec<_> = dv.iter().collect();
392        items.sort();
393        assert_eq!(items, expected);
394
395        // Test to_sorted_iter()
396        assert_eq!(dv.to_sorted_iter().collect::<Vec<_>>(), expected);
397
398        // Test into_sorted_iter() and into_iter() (both consume, so clone first)
399        assert_eq!(dv.clone().into_sorted_iter().collect::<Vec<_>>(), expected);
400        assert_eq!(dv.into_iter().collect::<Vec<_>>(), expected);
401    }
402
403    #[test]
404    fn test_build_predicate() {
405        let addrs = [0u64, 1, 2, 3, 4];
406        assert!(DeletionVector::NoDeletions
407            .build_predicate(addrs.iter())
408            .is_none());
409
410        let pred = set_dv([1, 3]).build_predicate(addrs.iter()).unwrap();
411        assert_eq!(
412            pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
413            [true, false, true, false, true]
414        );
415
416        let pred = bitmap_dv([0, 2, 4]).build_predicate(addrs.iter()).unwrap();
417        assert_eq!(
418            pred.iter().map(|v| v.unwrap()).collect::<Vec<_>>(),
419            [false, true, false, true, false]
420        );
421    }
422
423    #[rstest]
424    #[case::no_deletions(DeletionVector::NoDeletions, 0)]
425    #[case::set(set_dv([1, 2, 3]), 3)]
426    #[case::bitmap(bitmap_dv([10, 20]), 2)]
427    fn test_to_roaring(#[case] dv: DeletionVector, #[case] len: u64) {
428        let bitmap: RoaringBitmap = (&dv).into();
429        assert_eq!(bitmap.len(), len);
430    }
431
432    #[test]
433    fn test_partial_eq() {
434        assert_eq!(DeletionVector::NoDeletions, DeletionVector::NoDeletions);
435        assert_eq!(set_dv([1, 2, 3]), set_dv([1, 2, 3]));
436        assert_eq!(bitmap_dv([1, 2, 3]), bitmap_dv([1, 2, 3]));
437        assert_eq!(set_dv([5, 6, 7]), bitmap_dv([5, 6, 7])); // cross-type
438        assert_eq!(bitmap_dv([5, 6, 7]), set_dv([5, 6, 7])); // reverse
439        assert_ne!(DeletionVector::NoDeletions, set_dv([1]));
440        assert_ne!(DeletionVector::NoDeletions, bitmap_dv([1]));
441    }
442
443    #[test]
444    fn test_extend() {
445        // Empty iter -> stays NoDeletions
446        let mut dv = DeletionVector::NoDeletions;
447        dv.extend(std::iter::empty::<u32>());
448        assert!(matches!(dv, DeletionVector::NoDeletions));
449
450        // Unknown size small -> Set
451        let mut dv = DeletionVector::NoDeletions;
452        dv.extend(std::iter::from_fn({
453            let mut i = 0u32;
454            move || {
455                i += 1;
456                (i <= 10).then_some(i - 1)
457            }
458        }));
459        assert!(matches!(dv, DeletionVector::Set(_)));
460
461        // Unknown size large -> Bitmap
462        let mut dv = DeletionVector::NoDeletions;
463        dv.extend((0u32..10_000).filter(|_| true));
464        assert!(matches!(dv, DeletionVector::Bitmap(_)));
465
466        // Set stays Set when small
467        let mut dv = set_dv([1, 2, 3]);
468        dv.extend([4, 5, 6]);
469        assert!(matches!(dv, DeletionVector::Set(_)) && dv.len() == 6);
470
471        // Set promotes to Bitmap when large
472        let mut dv = set_dv([1, 2, 3]);
473        dv.extend(100..(BITMAP_THRESDHOLD as u32 + 100));
474        assert!(matches!(dv, DeletionVector::Bitmap(_)));
475
476        // Bitmap stays Bitmap
477        let mut dv = bitmap_dv([1, 2, 3]);
478        dv.extend([4, 5, 6]);
479        assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 6);
480    }
481
482    #[test]
483    fn test_from_roaring() {
484        let dv: DeletionVector = RoaringBitmap::new().into();
485        assert!(matches!(dv, DeletionVector::NoDeletions));
486
487        let dv: DeletionVector = RoaringBitmap::from_iter([1, 2, 3]).into();
488        assert!(matches!(dv, DeletionVector::Bitmap(_)) && dv.len() == 3);
489    }
490
491    #[test]
492    fn test_map_offset_dense_then_sparse() {
493        // First half densely deleted (80% deleted), second half sparse (20% deleted)
494        // This creates varying deletion density that might trip up the algorithm
495        let mut deleted = Vec::new();
496        // Dense region: delete 4 out of every 5 rows (keep every 5th)
497        for i in 0..500u32 {
498            if i % 5 != 0 {
499                deleted.push(i);
500            }
501        }
502        // Sparse region: delete 1 out of every 5 rows
503        for i in 500..1000u32 {
504            if i % 5 == 0 {
505                deleted.push(i);
506            }
507        }
508        let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(deleted));
509        let mut mapper = OffsetMapper::new(Arc::new(dv));
510
511        // In dense region: offset 0 -> row 0 (kept), offset 1 -> row 5 (kept), etc.
512        assert_eq!(mapper.map_offset(0), 0);
513        assert_eq!(mapper.map_offset(1), 5);
514        assert_eq!(mapper.map_offset(99), 495);
515
516        // Transition to sparse region
517        // At row 500, we've had 400 deletions in dense region, plus row 500 is deleted
518        // offset 100 should get row 501
519        assert_eq!(mapper.map_offset(100), 501);
520    }
521}