polars_arrow/bitmap/utils/
slice_iterator.rs

1use crate::bitmap::Bitmap;
2
3/// Internal state of [`SlicesIterator`]
4#[derive(Debug, Clone, PartialEq)]
5enum State {
6    // normal iteration
7    Nominal,
8    // nothing more to iterate.
9    Finished,
10}
11
12/// Iterator over a bitmap that returns slices of set regions.
13///
14/// This is the most efficient method to extract slices of values from arrays
15/// with a validity bitmap.
16/// For example, the bitmap `00101111` returns `[(0,4), (6,1)]`
17#[derive(Debug, Clone)]
18pub struct SlicesIterator<'a> {
19    values: std::slice::Iter<'a, u8>,
20    count: usize,
21    mask: u8,
22    max_len: usize,
23    current_byte: &'a u8,
24    state: State,
25    len: usize,
26    start: usize,
27    on_region: bool,
28}
29
30impl<'a> SlicesIterator<'a> {
31    /// Creates a new [`SlicesIterator`]
32    pub fn new(values: &'a Bitmap) -> Self {
33        let (buffer, offset, _) = values.as_slice();
34        let mut iter = buffer.iter();
35
36        let (current_byte, state) = match iter.next() {
37            Some(b) => (b, State::Nominal),
38            None => (&0, State::Finished),
39        };
40
41        Self {
42            state,
43            count: values.len() - values.unset_bits(),
44            max_len: values.len(),
45            values: iter,
46            mask: 1u8.rotate_left(offset as u32),
47            current_byte,
48            len: 0,
49            start: 0,
50            on_region: false,
51        }
52    }
53
54    #[inline]
55    fn finish(&mut self) -> Option<(usize, usize)> {
56        self.state = State::Finished;
57        if self.on_region {
58            Some((self.start, self.len))
59        } else {
60            None
61        }
62    }
63
64    #[inline]
65    fn current_len(&self) -> usize {
66        self.start + self.len
67    }
68
69    /// Returns the total number of slots.
70    /// It corresponds to the sum of all lengths of all slices.
71    #[inline]
72    pub fn slots(&self) -> usize {
73        self.count
74    }
75}
76
77impl Iterator for SlicesIterator<'_> {
78    type Item = (usize, usize);
79
80    #[inline]
81    fn next(&mut self) -> Option<Self::Item> {
82        loop {
83            if self.state == State::Finished {
84                return None;
85            }
86            if self.current_len() == self.max_len {
87                return self.finish();
88            }
89
90            if self.mask == 1 {
91                // at the beginning of a byte => try to skip it all together
92                match (self.on_region, self.current_byte) {
93                    (true, &255u8) => {
94                        self.len = std::cmp::min(self.max_len - self.start, self.len + 8);
95                        if let Some(v) = self.values.next() {
96                            self.current_byte = v;
97                        };
98                        continue;
99                    },
100                    (false, &0) => {
101                        self.len = std::cmp::min(self.max_len - self.start, self.len + 8);
102                        if let Some(v) = self.values.next() {
103                            self.current_byte = v;
104                        };
105                        continue;
106                    },
107                    _ => (), // we need to run over all bits of this byte
108                }
109            };
110
111            let value = (self.current_byte & self.mask) != 0;
112            self.mask = self.mask.rotate_left(1);
113
114            match (self.on_region, value) {
115                (true, true) => self.len += 1,
116                (false, false) => self.len += 1,
117                (true, false) => {
118                    self.on_region = false;
119                    let result = (self.start, self.len);
120                    self.start += self.len;
121                    self.len = 1;
122                    if self.mask == 1 {
123                        // reached a new byte => try to fetch it from the iterator
124                        if let Some(v) = self.values.next() {
125                            self.current_byte = v;
126                        };
127                    }
128                    return Some(result);
129                },
130                (false, true) => {
131                    self.start += self.len;
132                    self.len = 1;
133                    self.on_region = true;
134                },
135            }
136
137            if self.mask == 1 {
138                // reached a new byte => try to fetch it from the iterator
139                match self.values.next() {
140                    Some(v) => self.current_byte = v,
141                    None => return self.finish(),
142                };
143            }
144        }
145    }
146}