simple_sds_sbwt/bit_vector/
select_support.rs

1//! Select queries on plain bitvectors.
2//!
3//! The structure is the same as `select_support_mcl` in [SDSL](https://github.com/simongog/sdsl-lite):
4//!
5//! > Gog, Petri: Optimized succinct data structures for massive data.  
6//! > Software: Practice and Experience, 2014.  
7//! > DOI: [10.1002/spe.2198](https://doi.org/10.1002/spe.2198)
8//!
9//! This is a simplified version of the original three-level structure:
10//!
11//! > Clark: Compact Pat Trees.  
12//! > Ph.D. Thesis (Section 2.2.2), University of Waterloo, 1996.  
13//! > [http://www.nlc-bnc.ca/obj/s4/f2/dsk3/ftp04/nq21335.pdf](http://www.nlc-bnc.ca/obj/s4/f2/dsk3/ftp04/nq21335.pdf)
14//!
15//! We divide the integer array into superblocks of 2^12 = 4096 values.
16//! For each superblock, we sample the first value.
17//! Let `x` and `y` be two consecutive superblock samples and let `u` be the universe size of the integer array.
18//! There are now two cases:
19//!
20//! 1. The superblock is long: `y - x >= log^4 u`.
21//!    We can store all values in the superblock explicitly.
22//! 2. The superblock is short.
23//!    We divide the superblock into blocks of 2^6 = 64 values.
24//!    For each block, we sample the first value relative to the superblock sample.
25//!    We then search for the value in the bit array one 64-bit word at a time.
26//!
27//! The space overhead is 18.75% in the worst case.
28
29use crate::bit_vector::{BitVector, Transformation};
30use crate::int_vector::IntVector;
31use crate::ops::{Vector, Resize, Pack, Access, Push, BitVec};
32use crate::serialize::Serialize;
33use crate::bits;
34
35use std::io::{Error, ErrorKind};
36use std::{io, marker};
37
38//-----------------------------------------------------------------------------
39
40/// Select support structure for plain bitvectors.
41///
42/// The structure depends on the parent bitvector and assumes that the parent remains unchanged.
43/// Using the [`BitVector`] interface is usually more convenient.
44///
45/// This type must be parametrized with a [`Transformation`].
46#[derive(Clone, Debug, PartialEq, Eq)]
47pub struct SelectSupport<T: Transformation> {
48    // (superblock sample, 2 * index + is_short) for each superblock.
49    samples: IntVector,
50
51    // If the superblock is long, the index from the superblock array points to the
52    // first value in this array.
53    long: IntVector,
54
55    // If the superblock is short, the index from the superblock array points to the
56    // first block sample in this array.
57    short: IntVector,
58
59    // We use `T` only for accessing static methods.
60    _marker: marker::PhantomData<T>,
61}
62
63impl<T: Transformation> SelectSupport<T> {
64    /// Number of ones per superblock (4096).
65    pub const SUPERBLOCK_SIZE: usize = 4096;
66
67    const SUPERBLOCK_MASK: usize = 0xFFF;
68    const BLOCKS_IN_SUPERBLOCK: usize = 64;
69    const BLOCK_SIZE: usize = 64;
70    const BLOCK_MASK: usize = 0x3F;
71
72    /// Returns the number superblocks in the bitvector.
73    pub fn superblocks(&self) -> usize {
74        self.samples.len() / 2
75    }
76
77    /// Returns the number of long superblocks in the bitvector.
78    pub fn long_superblocks(&self) -> usize {
79        (self.long.len() + Self::SUPERBLOCK_SIZE - 1) / Self::SUPERBLOCK_SIZE
80    }
81
82    /// Returns the number of short superblocks in the bitvector.
83    pub fn short_superblocks(&self) -> usize {
84        (self.short.len() + Self::BLOCKS_IN_SUPERBLOCK - 1) / Self::BLOCKS_IN_SUPERBLOCK
85    }
86
87    /// Builds a select support structure for the parent bitvector.
88    ///
89    /// # Examples
90    ///
91    /// ```
92    /// use simple_sds_sbwt::bit_vector::{BitVector, Identity};
93    /// use simple_sds_sbwt::bit_vector::select_support::SelectSupport;
94    ///
95    /// let mut data = vec![false, true, true, false, true, false, true, true, false, false, false];
96    /// let bv: BitVector = data.into_iter().collect();
97    /// let ss = SelectSupport::<Identity>::new(&bv);
98    /// assert_eq!(ss.superblocks(), 1);
99    /// assert_eq!(ss.long_superblocks(), 0);
100    /// assert_eq!(ss.short_superblocks(), 1);
101    /// ```
102    pub fn new(parent: &BitVector) -> SelectSupport<T> {
103        let superblocks = (T::count_ones(parent) + Self::SUPERBLOCK_SIZE - 1) / Self::SUPERBLOCK_SIZE;
104        let log4 = bits::bit_len(parent.len() as u64);
105        let log4 = log4 * log4;
106        let log4 = log4 * log4;
107
108        let mut result = SelectSupport {
109            samples: IntVector::default(),
110            long: IntVector::default(),
111            short: IntVector::default(),
112            _marker: marker::PhantomData,
113        };
114        result.samples.reserve(superblocks * 2);
115
116        // `sample_iter` will iterate over superblock samples.
117        let mut sample_iter = T::one_iter(parent);
118        let mut sample = sample_iter.next();
119
120        // `iter` will iterate over the values we store in `self.long` and `self.short`.
121        let mut iter = T::one_iter(parent);
122        let mut value = iter.next();
123
124        while sample != None {
125            let start = sample.unwrap();
126            let next_sample = sample_iter.nth(Self::SUPERBLOCK_SIZE - 1);
127            let limit = match next_sample {
128                Some(v) => v,
129                None => (T::count_ones(parent), parent.len()),
130            };
131            result.samples.push(start.1 as u64);
132            // Long superblock.
133            if limit.1 - start.1 >= log4 {
134                result.samples.push((2 * result.long.len()) as u64);
135                let values = limit.0 - start.0;
136                for _ in 0..values {
137                    result.long.push((value.unwrap().1 - start.1) as u64);
138                    value = iter.next();
139                }
140            }
141            // Short superblock.
142            else {
143                result.samples.push((2 * result.short.len() + 1) as u64);
144                let blocks = ((limit.0 - start.0) + Self::BLOCK_SIZE - 1) / Self::BLOCK_SIZE;
145                for _ in 0..blocks {
146                    result.short.push((value.unwrap().1 - start.1) as u64);
147                    value = iter.nth(Self::BLOCK_SIZE - 1);
148                }
149            }
150            sample = next_sample;
151        }
152
153        result.samples.pack();
154        result.long.pack();
155        result.short.pack();
156        result
157    }
158
159    /// Returns the value of the specified rank in the parent bitvector.
160    ///
161    /// # Arguments
162    ///
163    /// * `parent`: The parent bitvector.
164    /// * `rank`: Index in the integer array or rank of a set bit in the bit array.
165    ///
166    /// # Examples
167    ///
168    /// ```
169    /// use simple_sds_sbwt::bit_vector::{BitVector, Identity};
170    /// use simple_sds_sbwt::bit_vector::select_support::SelectSupport;
171    ///
172    /// let mut data = vec![false, true, true, false, true, false, true, true, false, false, false];
173    /// let bv: BitVector = data.into_iter().collect();
174    /// let ss = SelectSupport::<Identity>::new(&bv);
175    /// assert_eq!(ss.select(&bv, 0), 1);
176    /// assert_eq!(ss.select(&bv, 1), 2);
177    /// assert_eq!(ss.select(&bv, 4), 7);
178    /// ```
179    ///
180    /// # Panics
181    ///
182    /// May panic if `rank >= T::count_ones(parent)`.
183    pub fn select(&self, parent: &BitVector, rank: usize) -> usize {
184        let (superblock, offset) = (rank / Self::SUPERBLOCK_SIZE, rank & Self::SUPERBLOCK_MASK);
185        let mut result: usize = self.samples.get(2 * superblock) as usize;
186        if offset == 0 {
187            return result;
188        }
189
190        let ptr = self.samples.get(2 * superblock + 1) as usize;
191        let (ptr, is_short) = (ptr / 2, ptr & 1);
192        if is_short == 0 {
193            result += self.long.get(ptr + offset) as usize;
194        } else {
195            let (block, mut relative_rank) = (offset / Self::BLOCK_SIZE, offset & Self::BLOCK_MASK);
196            result += self.short.get(ptr + block) as usize;
197            // Search within the block until we find the set bit of relative rank `relative_rank`
198            // from the start of the current word.
199            if relative_rank > 0 {
200                let (mut word, word_offset) = bits::split_offset(result);
201                let mut value: u64 = T::word(parent, word) & unsafe { !bits::low_set_unchecked(word_offset) };
202                loop {
203                    let ones = value.count_ones() as usize;
204                    if ones > relative_rank {
205                        result = bits::bit_offset(word, unsafe { bits::select(value, relative_rank) });
206                        break;
207                    }
208                    relative_rank -= ones;
209                    word += 1;
210                    value = T::word(parent, word);
211                }
212            }
213        }
214
215        result
216    }
217
218    /// Unsafe version of [`SelectSupport::select`] without some bounds checks.
219    ///
220    /// # Safety
221    ///
222    /// Behavior is undefined if `rank >= T::count_ones(parent)`.
223    pub unsafe fn select_unchecked(&self, parent: &BitVector, rank: usize) -> usize {
224        let (superblock, offset) = (rank / Self::SUPERBLOCK_SIZE, rank & Self::SUPERBLOCK_MASK);
225        let mut result: usize = self.samples.get(2 * superblock) as usize;
226        if offset == 0 {
227            return result;
228        }
229
230        let ptr = self.samples.get(2 * superblock + 1) as usize;
231        let (ptr, is_short) = (ptr / 2, ptr & 1);
232        if is_short == 0 {
233            result += self.long.get(ptr + offset) as usize;
234        } else {
235            let (block, mut relative_rank) = (offset / Self::BLOCK_SIZE, offset & Self::BLOCK_MASK);
236            result += self.short.get(ptr + block) as usize;
237            // Search within the block until we find the set bit of relative rank `relative_rank`
238            // from the start of the current word.
239            if relative_rank > 0 {
240                let (mut word, word_offset) = bits::split_offset(result);
241                let mut value: u64 = T::word_unchecked(parent, word) & !bits::low_set_unchecked(word_offset);
242                loop {
243                    let ones = value.count_ones() as usize;
244                    if ones > relative_rank {
245                        result = bits::bit_offset(word, bits::select(value, relative_rank));
246                        break;
247                    }
248                    relative_rank -= ones;
249                    word += 1;
250                    value = T::word_unchecked(parent, word);
251                }
252            }
253        }
254
255        result
256    }
257}
258
259//-----------------------------------------------------------------------------
260
261impl<T: Transformation> Serialize for SelectSupport<T> {
262    fn serialize_header<W: io::Write>(&self, _: &mut W) -> io::Result<()> {
263        Ok(())
264    }
265
266    fn serialize_body<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
267        self.samples.serialize(writer)?;
268        self.long.serialize(writer)?;
269        self.short.serialize(writer)?;
270        Ok(())
271    }
272
273    fn load<W: io::Read>(reader: &mut W) -> io::Result<Self> {
274        let samples = IntVector::load(reader)?;
275        let long = IntVector::load(reader)?;
276        let short = IntVector::load(reader)?;
277        let result = SelectSupport {
278            samples,
279            long,
280            short,
281            _marker: marker::PhantomData,
282        };
283        if result.superblocks() != result.long_superblocks() + result.short_superblocks() {
284            Err(Error::new(ErrorKind::InvalidData, "Invalid long/short superblock counts"))
285        }
286        else {
287            Ok(result)
288        }
289    }
290
291    fn size_in_elements(&self) -> usize {
292        self.samples.size_in_elements() + self.long.size_in_elements() + self.short.size_in_elements()
293    }
294}
295
296//-----------------------------------------------------------------------------
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301    use crate::bit_vector::{BitVector, Identity};
302    use crate::ops::BitVec;
303    use crate::raw_vector::{RawVector, PushRaw};
304    use crate::serialize;
305    use rand::distributions::{Bernoulli, Distribution};
306
307    #[test]
308    fn empty_vector() {
309        let bv = BitVector::from(RawVector::new());
310        let ss = SelectSupport::<Identity>::new(&bv);
311        assert_eq!(ss.superblocks(), 0, "Non-zero select superblocks for empty vector");
312        assert_eq!(ss.long_superblocks(), 0, "Non-zero long superblocks for empty vector");
313        assert_eq!(ss.short_superblocks(), 0, "Non-zero short superblocks for empty vector");
314    }
315
316    fn with_density(len: usize, density: f64) -> RawVector {
317        let mut data = RawVector::with_capacity(len);
318        let mut rng = rand::thread_rng();
319        let dist = Bernoulli::new(density).unwrap();
320        let mut iter = dist.sample_iter(&mut rng);
321        while data.len() < len {
322            data.push_bit(iter.next().unwrap());
323        }
324        assert_eq!(data.len(), len, "Invalid length for random RawVector");
325        data
326    }
327
328    fn test_vector(len: usize, density: f64) {
329        let data = with_density(len, density);
330        let bv = BitVector::from(data.clone());
331        let ss = SelectSupport::<Identity>::new(&bv);
332        assert_eq!(bv.len(), len, "test_vector({}, {}): invalid bitvector length", len, density);
333
334        let superblocks = ss.superblocks();
335        let long = ss.long_superblocks();
336        let short = ss.short_superblocks();
337        assert_eq!(superblocks, long + short, "test_vector({}, {}): block counts do not match", len, density);
338
339        // This test assumes that the number of ones is within 6 stdevs of the expected.
340        let ones: f64 = bv.count_ones() as f64;
341        let expected: f64 = len as f64 * density;
342        let stdev: f64 = (len as f64 * density * (1.0 - density)).sqrt();
343        assert!(ones >= expected - 6.0 * stdev && ones <= expected + 6.0 * stdev,
344            "test_vector({}, {}): unexpected number of ones: {}", len, density, ones);
345
346        let mut next: usize = 0;
347        for i in 0..bv.count_ones() {
348            let value = ss.select(&bv, i);
349            assert!(value >= next, "test_vector({}, {}): select({}) == {}, expected at least {}", len, density, i, value, next);
350            assert!(bv.get(value), "test_vector({}, {}): select({}) == {} is not set", len, density, i, value);
351            next = value + 1;
352        }
353
354        unsafe {
355            let mut next: usize = 0;
356            for i in 0..bv.count_ones() {
357                let value = ss.select_unchecked(&bv, i);
358                assert!(value >= next, "test_vector({}, {}): select_unchecked({}) == {}, expected at least {}", len, density, i, value, next);
359                assert!(bv.get(value), "test_vector({}, {}): select_unchecked({}) == {} is not set", len, density, i, value);
360                next = value + 1;
361            }
362        }
363    }
364
365    #[test]
366    fn non_empty_vector() {
367        test_vector(8131, 0.1);
368        test_vector(8192, 0.5);
369        test_vector(8266, 0.9);
370    }
371
372    #[test]
373    fn serialize() {
374        let data = with_density(5187, 0.5);
375        let bv = BitVector::from(data);
376        let original = SelectSupport::<Identity>::new(&bv);
377        let _ = serialize::test(&original, "select-support", None, true);
378    }
379}
380
381//-----------------------------------------------------------------------------