simple_sds_sbwt/bit_vector/
rank_support.rs

1//! Rank queries on plain bitvectors.
2//!
3//! The structure is the same as `rank_support_v` in [SDSL](https://github.com/simongog/sdsl-lite):
4//!
5//! > Gog, Petri: Optimized succinct data structures for massive data.  
6//! > Software: Practice and Experience, 2014.  
7//! > DOI: [10.1002/spe.2198](https://doi.org/10.1002/spe.2198)
8//!
9//! The original version is called rank9:
10//!
11//! > Vigna: Broadword Implementation of Rank/Select Queries.  
12//! > WEA 2008.  
13//! > DOI: [10.1007/978-3-540-68552-4_12](https://doi.org/10.1007/978-3-540-68552-4_12)
14//!
15//! We divide the bitvector into blocks of 2^9 = 512 bits.
16//! Each block is further divided into 8 words of 64 bits each.
17//! For each block, we store two 64-bit integers:
18//!
19//! * The first stores the number of ones in previous blocks.
20//! * The second stores, for each word except the first, the number of ones in previous words using 9 bits per word.
21//!
22//! The space overhead is 25%.
23
24use crate::bit_vector::BitVector;
25use crate::ops::BitVec;
26use crate::raw_vector::AccessRaw;
27use crate::serialize::Serialize;
28use crate::bits;
29
30use std::{cmp, io};
31
32//-----------------------------------------------------------------------------
33
34/// Rank support structure for plain bitvectors.
35///
36/// The structure depends on the parent bitvector and assumes that the parent remains unchanged.
37/// Using the [`BitVector`] interface is usually more convenient.
38#[derive(Clone, Debug, PartialEq, Eq)]
39pub struct RankSupport {
40    // No RawVector or bits::read_int because we want to avoid branching.
41    samples: Vec<(u64, u64)>,
42}
43
44impl RankSupport {
45    /// Number of bits per block (512).
46    pub const BLOCK_SIZE: usize = 512;
47
48    const RELATIVE_RANK_BITS: usize = 9;
49    const RELATIVE_RANK_MASK: usize = 0x1FF;
50    const WORDS_PER_BLOCK: usize = 8;
51    const WORD_MASK: usize = 0x7;
52
53    /// Returns the number of 512-bit blocks in the bitvector.
54    pub fn blocks(&self) -> usize {
55        self.samples.len()
56    }
57
58    /// Builds a rank support structure for the parent bitvector.
59    ///
60    /// # Examples
61    ///
62    /// ```
63    /// use simple_sds_sbwt::bit_vector::BitVector;
64    /// use simple_sds_sbwt::bit_vector::rank_support::RankSupport;
65    ///
66    /// let data = vec![false, true, true, false, true, false, true, true, false, false, false];
67    /// let bv: BitVector = data.into_iter().collect();
68    /// let rs = RankSupport::new(&bv);
69    /// assert_eq!(rs.blocks(), 1);
70    /// ```
71    pub fn new(parent: &BitVector) -> RankSupport {
72        let words = bits::bits_to_words(parent.len());
73        let blocks = (parent.len() + Self::BLOCK_SIZE - 1) / Self::BLOCK_SIZE;
74        let mut samples: Vec<(u64, u64)> = Vec::with_capacity(blocks);
75
76        let mut ones: usize = 0;
77        for block in 0..blocks {
78            let mut block_ones: usize = 0;
79            let mut relative_ranks: u64 = 0;
80            let block_words = cmp::min(Self::WORDS_PER_BLOCK, words - block * Self::WORDS_PER_BLOCK);
81            for word in 0..block_words {
82                block_ones += parent.data.word(block * Self::WORDS_PER_BLOCK + word).count_ones() as usize;
83                relative_ranks |= ((block_ones as u64) << (word as u64 * Self::RELATIVE_RANK_BITS as u64)) as u64;
84            }
85            // Clear the high bit. We don't store the relative rank after all 8 words.
86            relative_ranks &= bits::low_set((Self::WORDS_PER_BLOCK - 1) * Self::RELATIVE_RANK_BITS);
87            samples.push((ones as u64, relative_ranks));
88            ones += block_ones;
89        }
90
91        RankSupport {
92            samples,
93        }
94    }
95
96    /// Returns the rank at the specified index of the bitvector.
97    ///
98    /// # Arguments
99    ///
100    /// * `parent`: The parent bitvector.
101    /// * `index`: Index in the bit array or value in the integer array.
102    ///
103    /// # Examples
104    ///
105    /// ```
106    /// use simple_sds_sbwt::bit_vector::BitVector;
107    /// use simple_sds_sbwt::bit_vector::rank_support::RankSupport;
108    ///
109    /// let data = vec![false, true, true, false, true, false, true, true, false, false, false];
110    /// let bv: BitVector = data.into_iter().collect();
111    /// let rs = RankSupport::new(&bv);
112    /// assert_eq!(rs.rank(&bv, 0), 0);
113    /// assert_eq!(rs.rank(&bv, 1), 0);
114    /// assert_eq!(rs.rank(&bv, 2), 1);
115    /// assert_eq!(rs.rank(&bv, 7), 4);
116    /// ```
117    ///
118    /// # Panics
119    ///
120    /// May panic if `index >= parent.len()`.
121    pub fn rank(&self, parent: &BitVector, index: usize) -> usize {
122        let block = index / Self::BLOCK_SIZE;
123        let (word, offset) = bits::split_offset(index);
124
125        // Rank at the start of the block and relative ranks at the start of the words.
126        let (block_start, relative_ranks) = self.samples[block];
127
128        // Transform the absolute word index into a relative word index within the block.
129        // Then reorder the words 0..8 to 1..8, 0, because the second sample stores relative
130        // ranks for words 1..8.
131        let relative = (word + Self::WORDS_PER_BLOCK - 1) & Self::WORD_MASK;
132
133        // Relative rank at the start of the word.
134        let word_start = (relative_ranks >> (relative * Self::RELATIVE_RANK_BITS)) as usize & Self::RELATIVE_RANK_MASK;
135
136        // Relative rank within the word.
137        let within_word = (parent.data.word(word) & unsafe { bits::low_set_unchecked(offset) }).count_ones() as usize;
138
139        block_start as usize + word_start + within_word
140    }
141
142    /// Unsafe version of [`RankSupport::rank`] without bounds checks.
143    ///
144    /// # Safety
145    ///
146    /// Behavior is undefined if `index >= parent.len()`.
147    pub unsafe fn rank_unchecked(&self, parent: &BitVector, index: usize) -> usize {
148        let block = index / Self::BLOCK_SIZE;
149        let (word, offset) = bits::split_offset(index);
150
151        // Rank at the start of the block and relative ranks at the start of the words.
152        let (block_start, relative_ranks) = *self.samples.get_unchecked(block);
153
154        // Transform the absolute word index into a relative word index within the block.
155        // Then reorder the words 0..8 to 1..8, 0, because the second sample stores relative
156        // ranks for words 1..8.
157        let relative = ((word & Self::WORD_MASK) + Self::WORDS_PER_BLOCK - 1) & Self::WORD_MASK;
158
159        // Relative rank at the start of the word.
160        let word_start = (relative_ranks >> (relative * Self::RELATIVE_RANK_BITS)) as usize & Self::RELATIVE_RANK_MASK;
161
162        // Relative rank within the word.
163        let within_word = (parent.data.word_unchecked(word) & bits::low_set_unchecked(offset)).count_ones() as usize;
164
165        block_start as usize + word_start + within_word
166    }
167}
168
169//-----------------------------------------------------------------------------
170
171impl Serialize for RankSupport {
172    fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
173        self.samples.serialize_header(writer)?;
174        Ok(())
175    }
176
177    fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
178        self.samples.serialize_body(writer)?;
179        Ok(())
180    }
181
182    fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
183        let samples = Vec::<(u64, u64)>::load(reader)?;
184        Ok(RankSupport {
185            samples,
186        })
187    }
188
189    fn size_in_elements(&self) -> usize {
190        self.samples.size_in_elements()
191    }
192}
193
194//-----------------------------------------------------------------------------
195
196#[cfg(test)]
197mod tests {
198    use super::*;
199    use crate::bit_vector::BitVector;
200    use crate::ops::BitVec;
201    use crate::raw_vector::{RawVector, PushRaw};
202    use crate::serialize;
203    use rand::Rng;
204
205    #[test]
206    fn empty_vector() {
207        let bv = BitVector::from(RawVector::new());
208        let rs = RankSupport::new(&bv);
209        assert_eq!(rs.blocks(), 0, "Non-zero rank blocks for empty vector");
210    }
211
212    fn raw_vector(len: usize) -> RawVector {
213        let mut data = RawVector::with_capacity(len);
214        let mut rng = rand::thread_rng();
215        while data.len() < len {
216            let value: u64 =  rng.gen();
217            let bits = cmp::min(bits::WORD_BITS, len - data.len());
218            unsafe { data.push_int(value, bits); }
219        }
220        assert_eq!(data.len(), len, "Invalid length for random RawVector");
221        data
222    }
223
224    fn test_vector(len: usize, blocks: usize) {
225        let data = raw_vector(len);
226        let bv = BitVector::from(data.clone());
227        let rs = RankSupport::new(&bv);
228        assert_eq!(bv.len(), len, "Invalid bitvector length at {}", len);
229        assert_eq!(rs.blocks(), blocks, "Invalid number of rank blocks at {}", len);
230
231        let mut count: usize = 0;
232        for i in 0..bv.len() {
233            assert_eq!(rs.rank(&bv, i), count, "Invalid rank({}) at {}", i, len);
234            count += bv.get(i) as usize;
235        }
236
237        unsafe {
238            let mut count: usize = 0;
239            for i in 0..bv.len() {
240                assert_eq!(rs.rank_unchecked(&bv, i), count, "Invalid rank_unchecked({}) at {}", i, len);
241                count += bv.get(i) as usize;
242            }
243        }
244    }
245
246    #[test]
247    fn non_empty_vector() {
248        test_vector(4095, 8);
249        test_vector(4096, 8);
250        test_vector(4097, 9);
251    }
252
253    #[test]
254    fn serialize() {
255        let data = raw_vector(5187);
256        let bv = BitVector::from(data);
257        let original = RankSupport::new(&bv);
258        let _ = serialize::test(&original, "rank-support", None, true);
259    }
260}
261
262//-----------------------------------------------------------------------------
263