simple_sds_sbwt/
rl_vector.rs

1//! A run-length encoded bitvector supporting rank, select, and related queries.
2//!
3//! This is a run-length encoded bitvector similar to the one used in RLCSA:
4//!
5//! > Mäkinen, Navarro, Sirén, Välimäki: Storage and Retrieval of Highly Repetitive Sequence Collections.  
6//! > Journal of Computational Biology, 2010.  
7//! > DOI: [10.1089/cmb.2009.0169](https://doi.org/10.1089/cmb.2009.0169)
8//!
9//! The vector is encoded by storing the maximal runs of unset and set bits.
10//! If there is a run of `n0` unset bits followed by a run of `n1` set bits, we encode it as integers `n0` and `n1 - 1`.
11//! Each integer is encoded in little-endian order using 4-bit code units.
12//! The lowest 3 bits of each code unit contain data.
13//! If the high bit is set, the encoding continues in the next unit.
14//! We partition the encoding into 64-unit (32-byte) blocks that consist of entire runs.
15//! If there is not enough space left for encoding the next `(n0, n1)`, we pad the block with empty code units and start a new block.
16//!
17//! For each block, we store a sample `(rank(i, 1), i)`, where `i` is the number of bits encoded before that block.
18//! Queries use binary search on the samples to find the right block and then decompress the block sequentially.
19//! A [`SampleIndex`] is used for narrowing down the range of the binary search.
20
21use crate::int_vector::IntVector;
22use crate::ops::{Vector, Access, Push, Resize, BitVec, Rank, Select, SelectZero, PredSucc};
23use crate::rl_vector::index::SampleIndex;
24use crate::serialize::Serialize;
25use crate::bits;
26
27use std::io::{Error, ErrorKind};
28use std::iter::FusedIterator;
29use std::{cmp, io};
30
31pub mod index;
32
33#[cfg(test)]
34mod tests;
35
36//-----------------------------------------------------------------------------
37
38/// An immutable run-length encoded bitvector supporting rank, select, and related queries.
39///
40/// This type should be used when the bitvector contains long runs of both set and unset bits.
41/// Other bitvector types are more appropriate for dense (no long runs) and sparse (long runs of unset bits) bitvectors.
42/// The bitvector is immutable, though it would be easy to implement a mutable version by storing the blocks in a B+-tree rather than a vector.
43/// The maximum length of the vector is approximately [`usize::MAX`] bits.
44///
45/// Conversions between various [`BitVec`] types are possible using the [`From`] trait.
46///
47/// `RLVector` implements the following `simple_sds` traits:
48/// * Basic functionality: [`BitVec`]
49/// * Queries and operations: [`Rank`], [`Select`], [`SelectZero`], [`PredSucc`]
50/// * Serialization: [`Serialize`]
51///
52/// # Examples
53///
54/// ```
55/// use simple_sds_sbwt::ops::{BitVec, Rank, Select, SelectZero, PredSucc};
56/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
57///
58/// let mut builder = RLBuilder::new();
59/// builder.try_set(18, 22);
60/// builder.try_set(95, 15);
61/// builder.try_set(110, 10);
62/// builder.try_set(140, 12);
63/// builder.set_len(200);
64/// let rv = RLVector::from(builder);
65///
66/// // BitVec
67/// assert_eq!(rv.len(), 200);
68/// assert!(!rv.is_empty());
69/// assert_eq!(rv.count_ones(), 59);
70/// assert_eq!(rv.count_zeros(), 141);
71/// assert!(rv.get(119));
72/// assert!(!rv.get(120));
73/// for (index, value) in rv.iter().enumerate() {
74///     assert_eq!(value, rv.get(index));
75/// }
76///
77/// // Rank
78/// assert!(rv.supports_rank());
79/// assert_eq!(rv.rank(100), 27);
80/// assert_eq!(rv.rank(130), 47);
81/// assert_eq!(rv.rank_zero(60), 38);
82///
83/// // Select
84/// assert!(rv.supports_select());
85/// assert_eq!(rv.select(24), Some(97));
86/// let mut iter = rv.select_iter(46);
87/// assert_eq!(iter.next(), Some((46, 119)));
88/// assert_eq!(iter.next(), Some((47, 140)));
89/// let v: Vec<(usize, usize)> = rv.one_iter().take(4).collect();
90/// assert_eq!(v, vec![(0, 18), (1, 19), (2, 20), (3, 21)]);
91///
92/// // SelectZero
93/// assert!(rv.supports_select_zero());
94/// assert_eq!(rv.select_zero(130), Some(189));
95/// let mut iter = rv.select_zero_iter(72);
96/// assert_eq!(iter.next(), Some((72, 94)));
97/// assert_eq!(iter.next(), Some((73, 120)));
98/// let v: Vec<(usize, usize)> = rv.zero_iter().take(4).collect();
99/// assert_eq!(v, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
100///
101/// // PredSucc
102/// assert!(rv.supports_pred_succ());
103/// assert!(rv.predecessor(17).next().is_none());
104/// assert_eq!(rv.predecessor(18).next(), Some((0, 18)));
105/// assert_eq!(rv.predecessor(40).next(), Some((21, 39)));
106/// assert_eq!(rv.successor(139).next(), Some((47, 140)));
107/// assert_eq!(rv.successor(140).next(), Some((47, 140)));
108/// assert!(rv.successor(152).next().is_none());
109/// ```
110///
111/// # Notes
112///
113/// * `RLVector` never panics from I/O errors.
114/// * All `RLVector` queries are always enabled without additional support structures.
115#[derive(Clone, Debug, PartialEq, Eq)]
116pub struct RLVector {
117    len: usize,
118    ones: usize,
119    rank_index: SampleIndex,
120    select_index: SampleIndex,
121    select_zero_index: SampleIndex,
122    // (ones, bits) up to the start of each block.
123    samples: IntVector,
124    // Concatenated blocks.
125    data: IntVector,
126}
127
128impl RLVector {
129    /// Number of bits in a code unit.
130    pub const CODE_SIZE: usize = 4;
131
132    // Number of data bits in a code unit.
133    const CODE_SHIFT: usize = Self::CODE_SIZE - 1;
134
135    // If this bit is set in a code unit, the encoding continues in the next unit.
136    const CODE_FLAG: u64 = 1 << Self::CODE_SHIFT;
137
138    // Largest value that can be stored in a single code unit.
139    const CODE_MASK: u64 = (1 << Self::CODE_SHIFT) - 1;
140
141    /// Number of code units in a block.
142    pub const BLOCK_SIZE: usize = 64;
143
144    /// Returns a copy of the source bitvector as `RLVector`.
145    ///
146    /// The copy is created by iterating over the set bits using [`Select::one_iter`].
147    /// [`From`] implementations from other bitvector types should generally use this function.
148    ///
149    /// # Examples
150    ///
151    /// ```
152    /// use simple_sds_sbwt::bit_vector::BitVector;
153    /// use simple_sds_sbwt::ops::BitVec;
154    /// use simple_sds_sbwt::rl_vector::RLVector;
155    /// use std::iter::FromIterator;
156    ///
157    /// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
158    /// let bv = BitVector::from_iter(source);
159    /// let rv = RLVector::copy_bit_vec(&bv);
160    /// assert_eq!(rv.len(), bv.len());
161    /// assert_eq!(rv.count_ones(), bv.count_ones());
162    /// ```
163    pub fn copy_bit_vec<'a, T: BitVec<'a> + Select<'a>>(source: &'a T) -> Self {
164        let mut builder = RLBuilder::new();
165        for (_, index) in source.one_iter() {
166            unsafe { builder.set_bit_unchecked(index); }
167        }
168        builder.set_len(source.len());
169        RLVector::from(builder)
170    }
171
172    /// Returns an iterator over the runs of set bits in the bitvector.
173    ///
174    /// See [`RunIter`] for an example.
175    pub fn run_iter(&self) -> RunIter<'_> {
176        RunIter {
177            parent: self,
178            offset: 0,
179            pos: (0, 0),
180            limit: self.ones_after(0),
181        }
182    }
183
184    // Returns the number of blocks in the encoding.
185    fn blocks(&self) -> usize {
186        self.samples.len() / 2
187    }
188
189    // Returns the number of set bits after the given block.
190    fn ones_after(&self, block: usize) -> usize {
191        if block + 1 < self.blocks() {
192            self.samples.get(2 * (block + 1)) as usize
193        } else {
194            self.count_ones()
195        }
196    }
197
198    // Decodes a value starting from `offset` and returns (value, new offset).
199    fn decode(&self, offset: usize) -> (usize, usize) {
200        let mut value = 0;
201        let mut offset = offset;
202        let mut shift = 0;
203        loop {
204            let code = self.data.get(offset);
205            offset += 1;
206            value += ((code & Self::CODE_MASK) << shift) as usize;
207            shift += Self::CODE_SHIFT;
208            if code & Self::CODE_FLAG == 0 {
209                return (value, offset);
210            }
211        }
212    }
213
214    // Returns the identifier of the last block `i` in the range where `f(i) <= value`.
215    fn block_for<F: Fn(usize) -> usize>(low: usize, high: usize, value: usize, f: F) -> usize {
216        let mut low = low;
217        let mut high = high;
218        while high - low > 1 {
219            let mid = low + (high - low) / 2;
220            let candidate = f(mid);
221            if candidate <= value {
222                low = mid;
223            } else {
224                high = mid;
225            }
226        }
227        low
228    }
229
230    // Returns an iterator covering the given block.
231    fn iter_for_block(&self, block: usize) -> RunIter<'_> {
232        let pos = if self.samples.is_empty() { (0, 0) } else { (self.samples.get(2 * block) as usize, self.samples.get(2 * block + 1) as usize) };
233        RunIter {
234            parent: self,
235            offset: block * Self::BLOCK_SIZE,
236            pos,
237            limit: self.ones_after(block),
238        }
239    }
240
241    // Returns an iterator covering the block that contains the given bit, or an empty iterator if there is no such bit.
242    fn iter_for_bit(&self, index: usize) -> RunIter<'_> {
243        if index >= self.len() {
244            return RunIter::empty_iter(self);
245        }
246
247        let range = self.rank_index.range(index);
248        let block = Self::block_for(range.start, range.end, index, |i| self.samples.get(2 * i + 1) as usize);
249        self.iter_for_block(block)
250    }
251
252    // Returns an iterator covering the block that contains the given set bit, or an empty iterator if there is no such bit.
253    fn iter_for_one(&self, rank: usize) -> RunIter<'_> {
254        if rank >= self.count_ones() {
255            return RunIter::empty_iter(self);
256        }
257
258        let range = self.select_index.range(rank);
259        let block = Self::block_for(range.start, range.end, rank, |i| self.samples.get(2 * i) as usize);
260        self.iter_for_block(block)
261    }
262
263    // Returns an iterator covering the block that contains the given unset bit, or an empty iterator if there is no such bit.
264    fn iter_for_zero(&self, rank: usize) -> RunIter<'_> {
265        if rank >= self.count_zeros() {
266            return RunIter::empty_iter(self);
267        }
268
269        let range = self.select_zero_index.range(rank);
270        let block = Self::block_for(range.start, range.end, rank, |i| (self.samples.get(2 * i + 1) - self.samples.get(2 * i)) as usize);
271        self.iter_for_block(block)
272    }
273}
274
275//-----------------------------------------------------------------------------
276
277/// Space-efficient [`RLVector`] construction.
278///
279/// `RLBuilder` builds an [`RLVector`] incrementally.
280/// Bits must be set in order, and they cannot be unset later.
281/// Set bits are combined into maximal runs.
282/// Once the construction is finished, the builder can be converted into an [`RLVector`] using the [`From`] trait.
283///
284/// # Examples
285///
286/// ```
287/// use simple_sds_sbwt::ops::BitVec;
288/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
289///
290/// let mut builder = RLBuilder::new();
291///
292/// builder.try_set(18, 22);
293/// assert_eq!(builder.len(), 40);
294///
295/// // Combine two runs into one.
296/// builder.try_set(95, 15);
297/// builder.try_set(110, 10);
298///
299/// // Trying to set earlier bits will fail.
300/// assert!(builder.try_set(100, 1).is_err());
301///
302/// builder.try_set(140, 12);
303///
304/// // Append a final run of unset bits.
305/// builder.set_len(200);
306///
307/// let rv = RLVector::from(builder);
308/// assert_eq!(rv.len(), 200);
309/// ```
310#[derive(Clone, Debug)]
311pub struct RLBuilder {
312    len: usize,
313    ones: usize,
314    // Position after the last encoded run.
315    tail: usize,
316    run: (usize, usize),
317    samples: Vec<(usize, usize)>,
318    data: IntVector,
319}
320
321impl RLBuilder {
322    /// Returns an empty `RLBuilder`.
323    pub fn new() -> Self {
324        RLBuilder::default()
325    }
326
327    /// Returns the length of the bitvector.
328    ///
329    /// This is the first position that can be set.
330    pub fn len(&self) -> usize {
331        self.len
332    }
333
334    /// Returns `true` if the bitvector is empty.
335    ///
336    /// Makes Clippy happy.
337    pub fn is_empty(&self) -> bool {
338        self.len() == 0
339    }
340
341    /// Returns the number of set bits in the bitvector.
342    pub fn count_ones(&self) -> usize {
343        self.ones
344    }
345
346    /// Returns the number of unset bits in the bitvector.
347    pub fn count_zeros(&self) -> usize {
348        self.len() - self.count_ones()
349    }
350
351    // Returns the position after the last encoded run.
352    fn tail(&self) -> usize {
353        self.tail
354    }
355
356    // Returns the number of blocks in the encoding.
357    fn blocks(&self) -> usize {
358        self.samples.len()
359    }
360
361    /// Encodes a run of set bits of length `len` starting at position `start`.
362    ///
363    /// Does nothing if `len == 0`.
364    /// Returns [`Err`] if `start < self.len()` of `start + len > usize::MAX`.
365    ///
366    /// # Arguments
367    ///
368    /// * `start`: Starting position of the run.
369    /// * `len`: Length of the run.
370    pub fn try_set(&mut self, start: usize, len: usize) -> Result<(), String> {
371        if start < self.len() {
372            return Err(format!("RLBuilder: Cannot set bit {} when length is {}", start, self.len()));
373        }
374        if usize::MAX - len < start {
375            return Err(format!("RLBuilder: A run of length {} starting at {} exceeds the maximum length of a bitvector", start, len));
376        }
377        unsafe { self.set_run_unchecked(start, len); }
378        Ok(())
379    }
380
381    /// Sets the specified bit in the bitvector.
382    ///
383    /// # Safety
384    ///
385    /// Behavior is undefined if `index < self.len()`.
386    pub unsafe fn set_bit_unchecked(&mut self, index: usize) {
387        self.set_run_unchecked(index, 1);
388    }
389
390    /// Encodes a run of set bits of length `len` starting at position `start`.
391    ///
392    /// Does nothing if `len == 0`.
393    ///
394    /// # Arguments
395    ///
396    /// * `start`: Starting position of the run.
397    /// * `len`: Length of the run.
398    ///
399    /// # Safety
400    ///
401    /// Behavior is undefined if `start < self.len()` of `start + len > usize::MAX`.
402    pub unsafe fn set_run_unchecked(&mut self, start: usize, len: usize) {
403        if len == 0 {
404            return;
405        }
406        if start == self.len() {
407            self.len += len;
408            self.ones += len;
409            self.run.1 += len;
410        } else {
411            self.flush();
412            self.len = start + len;
413            self.ones += len;
414            self.run = (start, len);
415        }
416    }
417
418    /// Sets the length of the bitvector to `len` bits.
419    ///
420    /// No effect if `self.len() >= len`.
421    /// This is intended for appending a final run of unset bits.
422    pub fn set_len(&mut self, len: usize) {
423        if len > self.len() {
424            self.flush();
425            self.len = len;
426        }
427    }
428
429    // Encodes the current run if necessary and sets the active run to `(self.len(), 0)`.
430    fn flush(&mut self) {
431        if self.run.1 == 0 {
432            return;
433        }
434
435        // Add a new block if there is not enough space for the run in the current block.
436        let units_needed = Self::code_len(self.run.0 - self.tail()) + Self::code_len(self.run.1 - 1);
437        if self.data.len() + units_needed > self.blocks() * RLVector::BLOCK_SIZE {
438            self.data.resize(self.blocks() * RLVector::BLOCK_SIZE, 0);
439            let sample = (self.ones - self.run.1, self.tail);
440            self.samples.push(sample);
441        }
442
443        // Encode the run and update the statistics.
444        self.encode(self.run.0 - self.tail());
445        self.encode(self.run.1 - 1);
446        self.tail = self.run.0 + self.run.1;
447        self.run = (self.len(), 0);
448    }
449
450    // Encodes the given value.
451    fn encode(&mut self, value: usize) {
452        let mut value = value as u64;
453        while value > RLVector::CODE_MASK {
454            self.data.push((value & RLVector::CODE_MASK) | RLVector::CODE_FLAG);
455            value = value >> RLVector::CODE_SHIFT;
456        }
457        self.data.push(value);
458    }
459
460    // Number of code units required for encoding the value.
461    fn code_len(value: usize) -> usize {
462        bits::div_round_up(bits::bit_len(value as u64), RLVector::CODE_SHIFT)
463    }
464}
465
466impl Default for RLBuilder {
467    fn default() -> Self {
468        RLBuilder {
469            len: 0,
470            ones: 0,
471            tail: 0,
472            run: (0, 0),
473            samples: Vec::new(),
474            data: IntVector::new(RLVector::CODE_SIZE).unwrap(),
475        }
476    }
477}
478
479impl From<RLBuilder> for RLVector {
480    fn from(builder: RLBuilder) -> Self {
481        let mut builder = builder;
482        builder.flush();
483
484        // Build indexes for narrowing down binary search ranges.
485        let rank_index = SampleIndex::new(builder.samples.iter().map(|(_, bits)| *bits), builder.len());
486        let select_index = SampleIndex::new(builder.samples.iter().map(|(ones, _)| *ones), builder.count_ones());
487        let select_zero_index = SampleIndex::new(builder.samples.iter().map(|(ones, bits)| bits - ones), builder.count_zeros());
488
489        // Compress the samples.
490        let max_value = builder.samples.last().unwrap_or(&(0, 0)).1;
491        let mut samples = IntVector::with_capacity(2 * builder.blocks(), bits::bit_len(max_value as u64)).unwrap();
492        for (ones, bits) in builder.samples.iter() {
493            samples.push(*ones as u64);
494            samples.push(*bits as u64);
495        }
496
497        RLVector {
498            len: builder.len(),
499            ones: builder.count_ones(),
500            rank_index,
501            select_index,
502            select_zero_index,
503            samples: samples,
504            data: builder.data,
505        }
506    }
507}
508
509//-----------------------------------------------------------------------------
510
511/// A read-only iterator over the runs in [`RLVector`].
512///
513/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
514/// The first value is the starting position of a maximal run of set bits, and the second value is its length.
515///
516/// Most [`RLVector`] queries use this iterator for iterating over the runs in a block.
517///
518/// # Examples
519///
520/// ```
521/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
522///
523/// let mut builder = RLBuilder::new();
524/// builder.try_set(18, 22);
525/// builder.try_set(95, 15);
526/// builder.try_set(110, 10); // Merge with the previous run.
527/// builder.try_set(140, 12);
528/// builder.set_len(200);
529/// let rv = RLVector::from(builder);
530///
531/// let runs: Vec<(usize, usize)> = rv.run_iter().collect();
532/// assert_eq!(runs, vec![(18, 22), (95, 25), (140, 12)]);
533///
534/// let mut iter = rv.run_iter();
535/// assert_eq!(iter.offset(), 0);
536/// assert_eq!(iter.rank(), 0);
537/// assert_eq!(iter.rank_zero(), 0);
538/// while let Some(_) = iter.next() {}
539/// assert_eq!(iter.offset(), 140 + 12);
540/// assert_eq!(iter.rank(), 22 + 25 + 12);
541/// assert_eq!(iter.rank_zero(), 140 - 22 - 25);
542/// ```
543#[derive(Clone, Debug)]
544pub struct RunIter<'a> {
545    parent: &'a RLVector,
546    // Offset in the encoding.
547    offset: usize,
548    // (rank, index) reached so far.
549    pos: (usize, usize),
550    // Number of ones after the current block.
551    limit: usize,
552}
553
554impl<'a> RunIter<'a> {
555    /// Returns the position in the bitvector after the latest run.
556    pub fn offset(&self) -> usize {
557        self.pos.1
558    }
559
560    /// Returns the number of set bits up to the end of the latest run.
561    pub fn rank(&self) -> usize {
562        self.pos.0
563    }
564
565    /// Returns the number of unset bits up to the end of the latest run.
566    pub fn rank_zero(&self) -> usize {
567        self.offset() - self.rank()
568    }
569
570    // Returns an empty iterator for the parent bitvector.
571    fn empty_iter(parent: &'a RLVector) -> Self {
572        RunIter {
573            parent,
574            offset: parent.data.len(),
575            pos: (parent.count_ones(), parent.len()),
576            limit: parent.count_ones(),
577        }
578    }
579
580    // Returns the position in the bitvector for the set bit of given rank, assuming that it is covered by the current run.
581    fn offset_for(&self, rank: usize) -> usize {
582        self.offset() - (self.rank() - rank)
583    }
584
585    // Returns the rank at a given position in the bitvector, assuming that it is covered by the current run.
586    fn rank_at(&self, index: usize) -> usize {
587        self.rank() - (self.offset() - index)
588    }
589
590    // Like `next()`, but only advances if the `advance()` returns `true` for the next run.
591    fn advance_if<F: FnMut(Option<<Self as Iterator>::Item>) -> bool>(&mut self, mut advance: F) -> Option<<Self as Iterator>::Item> {
592        if self.offset >= self.parent.data.len() {
593            // We are at the end anyway, but we will call `advance()` to let the user know
594            // what the next run would be.
595            let _ = advance(None);
596            return None;
597        }
598
599        // Move to the next block if we have run out of ones.
600        let mut offset = self.offset;
601        let mut limit = self.limit;
602        if self.rank() >= self.limit {
603            let block = bits::div_round_up(offset, RLVector::BLOCK_SIZE);
604            offset = block * RLVector::BLOCK_SIZE;
605            if block >= self.parent.blocks() {
606                if advance(None) {
607                    self.offset = offset;
608                }
609                return None;
610            }
611            limit = self.parent.ones_after(block);
612        }
613
614        // Decode the next run.
615        let (gap, offset) = self.parent.decode(offset);
616        let start = self.offset() + gap;
617        let (len, offset) = self.parent.decode(offset);
618        let result = Some((start, len + 1));
619
620        if advance(result) {
621            self.offset = offset;
622            self.limit = limit;
623            self.pos.0 += len + 1;
624            self.pos.1 = start + len + 1;
625        }
626        result
627    }
628}
629
630impl<'a> Iterator for RunIter<'a> {
631    // (start, length)
632    type Item = (usize, usize);
633
634    fn next(&mut self) -> Option<Self::Item> {
635        self.advance_if(|_| true)
636    }
637}
638
639impl<'a> FusedIterator for RunIter<'a> {}
640
641//-----------------------------------------------------------------------------
642
643/// A read-only iterator over [`RLVector`].
644///
645/// The type of `Item` is [`bool`].
646///
647/// # Examples
648///
649/// ```
650/// use simple_sds_sbwt::ops::{BitVec};
651/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
652///
653/// let mut builder = RLBuilder::new();
654/// builder.try_set(18, 22);
655/// builder.try_set(95, 15);
656/// builder.try_set(110, 10);
657/// builder.try_set(140, 12);
658/// builder.set_len(200);
659/// let rv = RLVector::from(builder);
660///
661/// assert_eq!(rv.iter().len(), rv.len());
662/// for (index, value) in rv.iter().enumerate() {
663///     assert_eq!(value, rv.get(index));
664/// }
665/// ```
666#[derive(Clone, Debug)]
667pub struct Iter<'a> {
668    iter: RunIter<'a>,
669    // Run from the iterator.
670    run: Option<(usize, usize)>,
671    // Next bitvector offset.
672    pos: usize,
673}
674
675impl<'a> Iterator for Iter<'a> {
676    type Item = bool;
677
678    fn next(&mut self) -> Option<Self::Item> {
679        // Read the next run if we have processed the current one.
680        if let Some((start, len)) = self.run {
681            if self.pos >= start + len {
682                self.run = self.iter.next();
683            }
684        }
685
686        // Determine the next bit and advance.
687        match self.run {
688            Some((start, _)) => {
689                self.pos += 1;
690                Some(self.pos > start)
691            },
692            None => {
693                if self.pos >= self.iter.parent.len() {
694                    None
695                } else {
696                    self.pos += 1;
697                    Some(false)
698                }
699            },
700        }
701    }
702
703    #[inline]
704    fn size_hint(&self) -> (usize, Option<usize>) {
705        let remaining = self.iter.parent.len() - self.pos;
706        (remaining, Some(remaining))
707    }
708}
709
710impl<'a> ExactSizeIterator for Iter<'a> {}
711
712impl<'a> FusedIterator for Iter<'a> {}
713
714//-----------------------------------------------------------------------------
715
716impl<'a> BitVec<'a> for RLVector {
717    type Iter = Iter<'a>;
718
719    #[inline]
720    fn len(&self) -> usize {
721        self.len
722    }
723
724    #[inline]
725    fn count_ones(&self) -> usize {
726        self.ones
727    }
728
729    fn get(&self, index: usize) -> bool {
730        let mut iter = self.iter_for_bit(index);
731        while let Some((start, _)) = iter.next() {
732            if start > index {
733                return false;
734            }
735            if index < iter.offset() {
736                return true;
737            }
738        }
739
740        // Final run of unset bits.
741        false
742    }
743
744    fn iter(&'a self) -> Self::Iter {
745        Self::Iter {
746            iter: self.run_iter(),
747            run: Some((0, 0)),
748            pos: 0,
749        }
750    }
751}
752
753//-----------------------------------------------------------------------------
754
755impl<'a> Rank<'a> for RLVector {
756    fn supports_rank(&self) -> bool {
757        true
758    }
759
760    fn enable_rank(&mut self) {}
761
762    fn rank(&self, index: usize) -> usize {
763        let mut iter = self.iter_for_bit(index);
764        while let Some((start, len)) = iter.next() {
765            // We reached `index` but this run is too late to affect the rank.
766            if start >= index {
767                return iter.rank() - len;
768            }
769            // We reached `index` and a part of this run affects the rank.
770            if iter.offset() >= index {
771                return iter.rank_at(index);
772            }
773        }
774
775        iter.rank()
776    }
777}
778
779//-----------------------------------------------------------------------------
780
781/// An iterator over the set bits in [`RLVector`].
782///
783/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
784/// This can be interpreted as:
785///
786/// * `(index, value)` or `(i, select(i))` in the integer array; or
787/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == true`.
788///
789/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
790/// Queries may create iterators in the middle of the bitvector.
791///
792/// # Examples
793///
794/// ```
795/// use simple_sds_sbwt::ops::{BitVec, Select};
796/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
797///
798/// let mut builder = RLBuilder::new();
799/// builder.try_set(18, 22);
800/// builder.try_set(95, 15);
801/// builder.try_set(110, 10);
802/// builder.try_set(140, 12);
803/// builder.set_len(200);
804/// let rv = RLVector::from(builder);
805///
806/// let mut iter = rv.one_iter();
807/// assert_eq!(iter.len(), rv.count_ones());
808/// assert_eq!(iter.next(), Some((0, 18)));
809/// assert_eq!(iter.next(), Some((1, 19)));
810/// assert_eq!(iter.next(), Some((2, 20)));
811/// ```
812#[derive(Clone, Debug)]
813pub struct OneIter<'a> {
814    iter: RunIter<'a>,
815    // Did we get a `None` from the iterator?
816    got_none: bool,
817    // Rank of the next set bit.
818    rank: usize,
819}
820
821impl<'a> OneIter<'a> {
822    // Returns an empty iterator for the parent bitvector.
823    fn empty_iter(parent: &'a RLVector) -> Self {
824        OneIter {
825            iter: RunIter::empty_iter(parent),
826            got_none: true,
827            rank: parent.count_ones(),
828        }
829    }
830}
831
832impl<'a> Iterator for OneIter<'a> {
833    type Item = (usize, usize);
834
835    fn next(&mut self) -> Option<Self::Item> {
836        // Read the next run if we have processed the current one.
837        if !self.got_none && self.rank >= self.iter.rank() {
838            self.got_none = self.iter.next().is_none();
839        }
840
841        // Determine the next set bit and advance.
842        if self.got_none {
843            None
844        } else {
845            let result = (self.rank, self.iter.offset_for(self.rank));
846            self.rank += 1;
847            Some(result)
848        }
849    }
850
851    #[inline]
852    fn size_hint(&self) -> (usize, Option<usize>) {
853        let remaining = self.iter.parent.count_ones() - self.rank;
854        (remaining, Some(remaining))
855    }
856}
857
858impl<'a> ExactSizeIterator for OneIter<'a> {}
859
860impl<'a> FusedIterator for OneIter<'a> {}
861
862//-----------------------------------------------------------------------------
863
864/// An iterator over the unset bits in [`RLVector`].
865///
866/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
867/// This can be interpreted as:
868///
869/// * `(index, value)` or `(i, select(i))` in the integer array of the complement; or
870/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == false`.
871///
872/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
873/// Queries may create iterators in the middle of the bitvector.
874///
875/// # Examples
876///
877/// ```
878/// use simple_sds_sbwt::ops::{BitVec, SelectZero};
879/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
880///
881/// let mut builder = RLBuilder::new();
882/// builder.try_set(18, 22);
883/// builder.try_set(95, 15);
884/// builder.try_set(110, 10);
885/// builder.try_set(140, 12);
886/// builder.set_len(200);
887/// let rv = RLVector::from(builder);
888///
889/// let mut iter = rv.zero_iter();
890/// assert_eq!(iter.len(), rv.count_zeros());
891/// assert_eq!(iter.next(), Some((0, 0)));
892/// assert_eq!(iter.next(), Some((1, 1)));
893/// assert_eq!(iter.next(), Some((2, 2)));
894/// ```
895#[derive(Clone, Debug)]
896pub struct ZeroIter<'a> {
897    iter: RunIter<'a>,
898    // Did we get a `None` from the iterator?
899    got_none: bool,
900    // (rank, index) for the next unset bit.
901    pos: (usize, usize),
902}
903
904impl<'a> Iterator for ZeroIter<'a> {
905    type Item = (usize, usize);
906
907    fn next(&mut self) -> Option<Self::Item> {
908        // Read the next run of set bits if we have reached the current one.
909        if !self.got_none && self.pos.0 >= self.iter.rank_zero() {
910            self.pos.1 = self.iter.offset();
911            self.got_none = self.iter.next().is_none();
912        }
913
914        // Determine the next bit and advance.
915        if self.pos.0 >= self.iter.parent.count_zeros() {
916            None
917        } else {
918            let result = self.pos;
919            self.pos.0 += 1; self.pos.1 += 1;
920            Some(result)
921        }
922    }
923
924    #[inline]
925    fn size_hint(&self) -> (usize, Option<usize>) {
926        let remaining = self.iter.parent.count_zeros() - self.pos.0;
927        (remaining, Some(remaining))
928    }
929}
930
931impl<'a> ExactSizeIterator for ZeroIter<'a> {}
932
933impl<'a> FusedIterator for ZeroIter<'a> {}
934
935//-----------------------------------------------------------------------------
936
937impl<'a> Select<'a> for RLVector {
938    type OneIter = OneIter<'a>;
939
940    fn supports_select(&self) -> bool {
941        true
942    }
943
944    fn enable_select(&mut self) {}
945
946    fn one_iter(&'a self) -> Self::OneIter {
947        Self::OneIter {
948            iter: self.run_iter(),
949            got_none: false,
950            rank: 0,
951        }
952    }
953
954    fn select(&'a self, rank: usize) -> Option<usize> {
955        if rank >= self.count_ones() {
956            return None;
957        }
958
959        let mut iter = self.iter_for_one(rank);
960        while iter.rank() <= rank {
961            let _ = iter.next();
962        }
963        Some(iter.offset_for(rank))
964    }
965
966    fn select_iter(&'a self, rank: usize) -> Self::OneIter {
967        if rank >= self.count_ones() {
968            return Self::OneIter::empty_iter(self);
969        }
970
971        let mut iter = self.iter_for_one(rank);
972        while iter.rank() < rank {
973            let _ = iter.next();
974        }
975        Self::OneIter {
976            iter,
977            got_none: false,
978            rank,
979        }
980    }
981}
982
983//-----------------------------------------------------------------------------
984
985impl<'a> SelectZero<'a> for RLVector {
986    type ZeroIter = ZeroIter<'a>;
987
988    fn supports_select_zero(&self) -> bool {
989        true
990    }
991
992    fn enable_select_zero(&mut self) {}
993
994    fn zero_iter(&'a self) -> Self::ZeroIter {
995        let mut iter = self.run_iter();
996        // We must take the first run instead of using (0, 0).
997        // Otherwise `ZeroIter` would not work if the first run starts at 0.
998        let got_none = iter.next().is_none();
999        Self::ZeroIter {
1000            iter,
1001            got_none,
1002            pos: (0, 0),
1003        }
1004    }
1005
1006    fn select_zero(&'a self, rank: usize) -> Option<usize> {
1007        if rank >= self.count_zeros() {
1008            return None;
1009        }
1010
1011        // Determine the number of set bits before the relevant run of unset bits.
1012        let mut iter = self.iter_for_zero(rank);
1013        let mut ones = iter.rank();
1014        loop {
1015            match iter.next() {
1016                Some(_) => {
1017                    if iter.rank_zero() > rank {
1018                        return Some(rank + ones);
1019                    }
1020                    ones = iter.rank();
1021                },
1022                None => {
1023                    return Some(rank + ones);
1024                },
1025            }
1026        }
1027    }
1028
1029    fn select_zero_iter(&'a self, rank: usize) -> Self::ZeroIter {
1030        if rank >= self.count_zeros() {
1031            return Self::ZeroIter {
1032                iter: RunIter::empty_iter(self),
1033                got_none: true,
1034                pos: (self.count_zeros(), self.len()),
1035            }
1036        }
1037
1038        // Determine the number of set bits before the relevant run of unset bits.
1039        let mut iter = self.iter_for_zero(rank);
1040        let mut ones = iter.rank();
1041        loop {
1042            match iter.next() {
1043                Some(_) => {
1044                    if iter.rank_zero() > rank {
1045                        return Self::ZeroIter {
1046                            iter,
1047                            got_none: false,
1048                            pos: (rank, rank + ones),
1049                        };
1050                    }
1051                    ones = iter.rank();
1052                },
1053                None => {
1054                    return Self::ZeroIter {
1055                        iter,
1056                        got_none: true,
1057                        pos: (rank, rank + ones),
1058                    }
1059                },
1060            }
1061        }
1062    }
1063}
1064
1065//-----------------------------------------------------------------------------
1066
1067impl<'a> PredSucc<'a> for RLVector {
1068    type OneIter = OneIter<'a>;
1069
1070    fn supports_pred_succ(&self) -> bool {
1071        true
1072    }
1073
1074    fn enable_pred_succ(&mut self) {}
1075
1076    fn predecessor(&'a self, value: usize) -> Self::OneIter {
1077        if self.is_empty() {
1078            return Self::OneIter::empty_iter(self);
1079        }
1080
1081        // A predecessor past the end is the same as predecessor at the end.
1082        let value = cmp::min(value, self.len() - 1);
1083
1084        // Find the block that would contain the value. Then advance the iterator
1085        // until the next run starts after the value we are interested in.
1086        let mut iter = self.iter_for_bit(value);
1087        let mut iterate = true;
1088        while iterate {
1089            let _ = iter.advance_if(|next| {
1090                if next.is_none() {
1091                    iterate = false;
1092                    return false;
1093                }
1094                let (start, _) = next.unwrap();
1095
1096                iterate = start <= value;
1097                return iterate;
1098            });
1099        }
1100
1101        // If we are before the first run, there is no predecessor.
1102        if iter.rank() == 0 {
1103            return Self::OneIter::empty_iter(self);
1104        }
1105
1106        let rank = if iter.offset() > value { iter.rank_at(value) } else { iter.rank() - 1 };
1107        Self::OneIter {
1108            iter,
1109            got_none: false,
1110            rank,
1111        }
1112    }
1113
1114    fn successor(&'a self, value: usize) -> Self::OneIter {
1115        if value >= self.len() {
1116            return Self::OneIter::empty_iter(self);
1117        }
1118
1119        // Find the block that would contain the value. Then advance the iterator
1120        // until the last run covers a position at or after the value we are interested in.
1121        let mut iter = self.iter_for_bit(value);
1122        let rank;
1123        loop {
1124            let result = iter.next();
1125            if result.is_none() {
1126                return Self::OneIter::empty_iter(self);
1127            }
1128            let (start, len) = result.unwrap();
1129            if start > value {
1130                rank = iter.rank() - len;
1131                break;
1132            }
1133            if iter.offset() > value {
1134                rank = iter.rank_at(value);
1135                break;
1136            }
1137        }
1138
1139        Self::OneIter {
1140            iter,
1141            got_none: false,
1142            rank,
1143        }
1144    }
1145}
1146
1147//-----------------------------------------------------------------------------
1148
1149impl Serialize for RLVector {
1150    fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1151        self.len.serialize(writer)?;
1152        self.ones.serialize(writer)?;
1153        Ok(())
1154    }
1155
1156    fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1157        self.samples.serialize(writer)?;
1158        self.data.serialize(writer)?;
1159        Ok(())
1160    }
1161
1162    fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
1163        let len = usize::load(reader)?;
1164        let ones = usize::load(reader)?;
1165        let samples = IntVector::load(reader)?;
1166        let data = IntVector::load(reader)?;
1167
1168        // Sanity checks.
1169        let sample_blocks = samples.len() / 2;
1170        let data_blocks = bits::div_round_up(data.len(), Self::BLOCK_SIZE);
1171        if sample_blocks != data_blocks {
1172            return Err(Error::new(ErrorKind::InvalidData, "Mismatch between number of blocks and samples"));
1173        }
1174
1175        // Rebuild indexes for narrowing down binary search ranges.
1176        let rank_index = SampleIndex::new((0..sample_blocks).map(|block| samples.get(2 * block + 1) as usize), len);
1177        let select_index = SampleIndex::new((0..sample_blocks).map(|block| samples.get(2 * block) as usize), ones);
1178        let select_zero_index = SampleIndex::new((0..sample_blocks).map(|block| (samples.get(2 * block + 1) - samples.get(2 * block)) as usize), len - ones);
1179
1180        let result = RLVector {
1181            len, ones, rank_index, select_index, select_zero_index, samples, data,
1182        };
1183        Ok(result)
1184    }
1185
1186    fn size_in_elements(&self) -> usize {
1187        self.len.size_in_elements() +
1188        self.ones.size_in_elements() +
1189        self.samples.size_in_elements() +
1190        self.data.size_in_elements()
1191    }
1192}
1193
1194//-----------------------------------------------------------------------------