faex/bit_vectors/rank_select/dense_sampling_rank/
mod.rs

1use super::{RankStructure, RankSupport, SelectSupport};
2use crate::Build;
3use crate::{
4    bit_vectors::BitVec, int_vectors::compact_int_vec::CompactIntVec, profiling::HeapSize,
5    util::BitsRequired,
6};
7use serde::{Deserialize, Serialize};
8use std::cmp::min;
9
10///
11#[derive(Debug, Serialize, Deserialize)]
12pub struct DenseSamplingRank {
13    superblocks: Vec<usize>,
14    superblock_size: usize,
15    blocks: CompactIntVec,
16    k: usize,
17    total_rank: usize,
18}
19
20impl DenseSamplingRank {
21    #[inline]
22    pub fn new(data: &BitVec, k: usize) -> Self {
23        assert!(k > 0, "k must be greater than 0");
24
25        let superblock_size = k * BitVec::CONTAINER_WIDTH;
26        let num_superblocks = data.len() / superblock_size;
27        let mut superblocks = Vec::with_capacity(num_superblocks + 1);
28
29        // This could be superblock_size - block_size, but in practice it is enough with -1
30        let max_rank_offset_value = (k - 1) * BitVec::CONTAINER_WIDTH;
31        let rank_offset_int_width = max_rank_offset_value.bits_required() as usize;
32
33        let num_blocks = data.len() / BitVec::CONTAINER_WIDTH;
34        let mut blocks = CompactIntVec::with_capacity(rank_offset_int_width, num_blocks + 1);
35
36        let mut rank = 0;
37        let mut rank_offset = 0;
38        // Push initial 0 values for convenience
39        superblocks.push(rank);
40        blocks.push(rank_offset);
41
42        let raw_data = data.raw_data();
43        for i in 0..num_superblocks {
44            for j in i * k..(i + 1) * k - 1 {
45                rank_offset += unsafe { raw_data.get_unchecked(j) }.count_ones() as usize;
46                blocks.push(rank_offset);
47            }
48            let last_block_rank =
49                unsafe { raw_data.get_unchecked((i + 1) * k - 1) }.count_ones() as usize;
50            rank_offset += last_block_rank;
51            rank += rank_offset;
52            rank_offset = 0;
53
54            blocks.push(rank_offset);
55            superblocks.push(rank);
56        }
57
58        let mut unsampled_superblock_rank = 0;
59        // There may be more blocks after the last superblock
60        for i in num_superblocks * k..num_blocks {
61            unsampled_superblock_rank += unsafe { raw_data.get_unchecked(i) }.count_ones() as usize;
62            blocks.push(unsampled_superblock_rank);
63        }
64
65        // If the last block is not full, we need to add the remaining 1s to the unsampled rank,
66        // but we do not push a value for that block, since it is not fully block-sampled and we
67        // cannot do the same trick for the binary search as we do for the superblocks.
68        if num_blocks != raw_data.len() {
69            unsampled_superblock_rank += raw_data.last().unwrap().count_ones() as usize;
70        }
71
72        // For technical reasons, we need to add the total rank as the last superblock.
73        // This allows us to access the total rank (altought we store it in the struct),
74        // but also it allows us to perform binary search correctly, because otherwise, we
75        // may perform a sequential search within two superblocks. In this way,
76        // the binary search right position will be always >= the rank we are looking for.
77        // but if we didn't do this, it could be that the last position could be < the rank we are looking for,
78        // and wont be selected as the greatest of the lessers.
79        // It is not necessary to add the total rank if all the blocks are fully sampled by a superblock, since the
80        // last superblock will be the total rank.
81        if unsampled_superblock_rank != 0 {
82            rank += unsampled_superblock_rank;
83            superblocks.push(rank);
84        }
85
86        Self {
87            superblocks,
88            superblock_size,
89            blocks,
90            k,
91            total_rank: rank,
92        }
93    }
94
95    #[inline]
96    pub fn superblocks(&self) -> &[usize] {
97        &self.superblocks
98    }
99
100    #[inline]
101    pub fn superblock_size(&self) -> usize {
102        self.superblock_size
103    }
104
105    #[inline]
106    pub fn blocks(&self) -> &CompactIntVec {
107        &self.blocks
108    }
109
110    #[inline]
111    pub fn k(&self) -> usize {
112        self.k
113    }
114}
115impl HeapSize for DenseSamplingRank {
116    #[inline]
117    fn heap_size_in_bits(&self) -> usize {
118        // Do not rely on underlying data structure's heap_size_in_bits,
119        // since we want to count the heap size of the rank structure without
120        // static overhead (i.e first block/superblock that is pushed as a 0 value for convenience)
121        self.superblocks.heap_size_in_bits() + self.blocks.heap_size_in_bits()
122    }
123}
124
125impl RankSupport<BitVec> for DenseSamplingRank {
126    #[inline]
127    unsafe fn rank(&self, data: &BitVec, index: usize) -> Option<usize> {
128        // By definition, rank(0) = 0
129        if index == 0 {
130            return Some(0);
131        }
132        if index > data.len() {
133            return None;
134        }
135
136        let is = index / self.superblock_size;
137        let iw = index / BitVec::CONTAINER_WIDTH;
138
139        let rank = self.superblocks.get_unchecked(is) + self.blocks.get_unchecked(iw);
140        let block_offset = index % BitVec::CONTAINER_WIDTH;
141        let last_block = data.raw_data().get(iw).copied().unwrap_or(0);
142
143        // getbits!(last_block, block_offset, 0) optimized, we know that block_offset is always <64 so this does not overflow
144        let last_block_target = last_block & ((1 << block_offset) - 1);
145        let last_block_rank = last_block_target.count_ones() as usize;
146
147        Some(rank + last_block_rank)
148    }
149}
150
151impl SelectSupport<BitVec> for DenseSamplingRank {
152    #[inline]
153    unsafe fn select(&self, data: &BitVec, rank: usize) -> Option<usize> {
154        // By definition, select(0) = 0
155        if rank == 0 {
156            return Some(0);
157        }
158
159        if rank > self.total_rank {
160            return None;
161        }
162
163        let mut left_superblock = 0;
164        let mut right_superblock = self.superblocks.len() - 1;
165
166        while right_superblock - left_superblock > 1 {
167            let mid_superblock = (left_superblock + right_superblock) / 2;
168            let mid_rank = *self.superblocks.get_unchecked(mid_superblock);
169            if mid_rank < rank {
170                left_superblock = mid_superblock;
171            } else {
172                right_superblock = mid_superblock;
173            }
174        }
175
176        // search for the block that contains the rank.
177        // We know for sure that in left position is the superblock value that is the greatest
178        // of the ranks <= rank, as the last position is always greater or equal than the rank,
179        // as the total rank is the last superblock value.
180        let superblock_rank = *self.superblocks.get_unchecked(left_superblock);
181        let remaining_rank = rank - superblock_rank;
182        let raw_data = data.raw_data();
183        let mut left_block_index = left_superblock * self.k;
184
185        let mut right_block_index = min(left_block_index + self.k - 1, raw_data.len() - 1);
186        while right_block_index - left_block_index > 1 {
187            let mid = (left_block_index + right_block_index) / 2;
188            let mid_rank = self.blocks.get_unchecked(mid);
189            if mid_rank < remaining_rank {
190                left_block_index = mid;
191            } else {
192                right_block_index = mid;
193            }
194        }
195
196        // We cannot do the same trick as in the previous binary search, so we have to check
197        // if the rank at the right block is less than the remaining rank, and if so, we select
198        // the right block, otherwise, we select the left block. This happens for example
199        // when searching by number 5 in the following blockvector: `1 2 3 4`, left would
200        // point to 3 and right to 4, but our target_block_index must be the one targeted by `4`
201        let right_rank = self.blocks.get_unchecked(right_block_index);
202        // check whether the right index contains the greatest of the lessers (we do not have the binary
203        // search bounds trick here)
204        let target_block_index = if right_rank < remaining_rank {
205            right_block_index
206        } else {
207            left_block_index
208        };
209
210        let mut local_rank = self.blocks.get_unchecked(target_block_index);
211
212        // at this point, we are exactly that the block that contains the rank is `target_block_index`
213        let mut block = *raw_data.get_unchecked(target_block_index);
214
215        // select the bit in the block
216        let mut bit_index = 0;
217        while local_rank < remaining_rank {
218            if block & 0b1 == 0b1 {
219                local_rank += 1;
220            }
221            block >>= 1;
222            bit_index += 1;
223        }
224
225        Some(target_block_index * BitVec::CONTAINER_WIDTH + bit_index)
226    }
227
228    #[inline]
229    unsafe fn select0(&self, data: &BitVec, rank0: usize) -> Option<usize> {
230        // // By definition, select0(0) = 0
231        if rank0 == 0 {
232            return Some(0);
233        }
234        let total_rank0 = data.len() - self.total_rank;
235        if rank0 > total_rank0 {
236            return None;
237        }
238
239        let mut left = 0;
240        let mut right = self.superblocks.len() - 1;
241
242        while right - left > 1 {
243            let mid = (left + right) / 2;
244            let bits_before_mid = mid * self.superblock_size;
245            let mid_rank0 = bits_before_mid - *self.superblocks.get_unchecked(mid);
246            if mid_rank0 < rank0 {
247                left = mid;
248            } else {
249                right = mid;
250            }
251        }
252
253        // search for the block that contains the rank.
254        // We know for sure that in left position is the superblock value that is the greatest
255        // of the ranks <= rank, as the last position is always greater or equal than the rank,
256        // as the total rank is the last superblock value.
257        // search for the block that contains the rank
258        let bits_before_left = left * self.superblock_size;
259
260        let superblock_rank0 = bits_before_left - *self.superblocks.get_unchecked(left);
261        let remaining_rank0 = rank0 - superblock_rank0;
262        let raw_data = data.raw_data();
263
264        let first_block_index = left * self.k;
265        let mut left_block_index = first_block_index;
266
267        let mut right_block_index = min(left_block_index + self.k - 1, raw_data.len() - 1);
268        while right_block_index - left_block_index > 1 {
269            let mid = (left_block_index + right_block_index) / 2;
270            let bits_before_mid = (mid - first_block_index) * BitVec::CONTAINER_WIDTH;
271            let mid_rank0 = bits_before_mid - self.blocks.get_unchecked(mid);
272            if mid_rank0 < remaining_rank0 {
273                left_block_index = mid;
274            } else {
275                right_block_index = mid;
276            }
277        }
278
279        // We cannot do the same trick as in the previous binary search, so we have to check
280        // if the rank at the right block is less than the remaining rank, and if so, we select
281        // the right block, otherwise, we select the left block. This happens for example
282        // when searching by number 5 in the following blockvector: `1 2 3 4`, left would
283        // point to 3 and right to 4, but our target_block_index must be the one targeted by `4`
284        let bits_before_right = (right_block_index - first_block_index) * BitVec::CONTAINER_WIDTH;
285        let right_rank0 = bits_before_right - self.blocks.get_unchecked(right_block_index);
286        // check whether the right index contains the greatest of the lessers (we do not have the binary
287        // search bounds trick here)
288        let target_block_index = if right_rank0 < remaining_rank0 {
289            right_block_index
290        } else {
291            left_block_index
292        };
293
294        let bits_before_target = (target_block_index - first_block_index) * BitVec::CONTAINER_WIDTH;
295        let mut local_rank0 = bits_before_target - self.blocks.get_unchecked(target_block_index);
296
297        // at this point, we are exactly that the block that contains the rank is `target_block_index`
298        let mut block = *raw_data.get_unchecked(target_block_index);
299
300        // select the bit in the block
301        let mut bit_index = 0;
302        while local_rank0 < remaining_rank0 {
303            if block & 0b1 == 0b0 {
304                local_rank0 += 1;
305            }
306            block >>= 1;
307            bit_index += 1;
308        }
309
310        Some(target_block_index * BitVec::CONTAINER_WIDTH + bit_index)
311    }
312}
313
314pub struct DenseSamplingRankSpec {
315    pub k: usize,
316}
317
318impl DenseSamplingRankSpec {
319    #[inline]
320    pub const fn new(k: usize) -> Self {
321        Self { k }
322    }
323}
324
325impl DenseSamplingRank {
326    #[inline]
327    pub const fn spec(k: usize) -> DenseSamplingRankSpec {
328        DenseSamplingRankSpec::new(k)
329    }
330}
331
332impl Build<BitVec, RankStructure<BitVec, DenseSamplingRank>> for DenseSamplingRankSpec {
333    #[inline]
334    fn build(&self, data: BitVec) -> RankStructure<BitVec, DenseSamplingRank> {
335        let sparse_sampling = DenseSamplingRank::new(&data, self.k);
336        unsafe { RankStructure::new(data, sparse_sampling) }
337    }
338}
339
340impl Build<&BitVec, DenseSamplingRank> for DenseSamplingRankSpec {
341    #[inline]
342    fn build(&self, data: &BitVec) -> DenseSamplingRank {
343        DenseSamplingRank::new(data, self.k)
344    }
345}
346
347#[cfg(test)]
348mod tests;