simple_sds_sbwt/rl_vector.rs
1//! A run-length encoded bitvector supporting rank, select, and related queries.
2//!
3//! This is a run-length encoded bitvector similar to the one used in RLCSA:
4//!
5//! > Mäkinen, Navarro, Sirén, Välimäki: Storage and Retrieval of Highly Repetitive Sequence Collections.
6//! > Journal of Computational Biology, 2010.
7//! > DOI: [10.1089/cmb.2009.0169](https://doi.org/10.1089/cmb.2009.0169)
8//!
9//! The vector is encoded by storing the maximal runs of unset and set bits.
10//! If there is a run of `n0` unset bits followed by a run of `n1` set bits, we encode it as integers `n0` and `n1 - 1`.
11//! Each integer is encoded in little-endian order using 4-bit code units.
12//! The lowest 3 bits of each code unit contain data.
13//! If the high bit is set, the encoding continues in the next unit.
14//! We partition the encoding into 64-unit (32-byte) blocks that consist of entire runs.
15//! If there is not enough space left for encoding the next `(n0, n1)`, we pad the block with empty code units and start a new block.
16//!
17//! For each block, we store a sample `(rank(i, 1), i)`, where `i` is the number of bits encoded before that block.
18//! Queries use binary search on the samples to find the right block and then decompress the block sequentially.
19//! A [`SampleIndex`] is used for narrowing down the range of the binary search.
20
21use crate::int_vector::IntVector;
22use crate::ops::{Vector, Access, Push, Resize, BitVec, Rank, Select, SelectZero, PredSucc};
23use crate::rl_vector::index::SampleIndex;
24use crate::serialize::Serialize;
25use crate::bits;
26
27use std::io::{Error, ErrorKind};
28use std::iter::FusedIterator;
29use std::{cmp, io};
30
31pub mod index;
32
33#[cfg(test)]
34mod tests;
35
36//-----------------------------------------------------------------------------
37
38/// An immutable run-length encoded bitvector supporting rank, select, and related queries.
39///
40/// This type should be used when the bitvector contains long runs of both set and unset bits.
41/// Other bitvector types are more appropriate for dense (no long runs) and sparse (long runs of unset bits) bitvectors.
42/// The bitvector is immutable, though it would be easy to implement a mutable version by storing the blocks in a B+-tree rather than a vector.
43/// The maximum length of the vector is approximately [`usize::MAX`] bits.
44///
45/// Conversions between various [`BitVec`] types are possible using the [`From`] trait.
46///
47/// `RLVector` implements the following `simple_sds` traits:
48/// * Basic functionality: [`BitVec`]
49/// * Queries and operations: [`Rank`], [`Select`], [`SelectZero`], [`PredSucc`]
50/// * Serialization: [`Serialize`]
51///
52/// # Examples
53///
54/// ```
55/// use simple_sds_sbwt::ops::{BitVec, Rank, Select, SelectZero, PredSucc};
56/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
57///
58/// let mut builder = RLBuilder::new();
59/// builder.try_set(18, 22);
60/// builder.try_set(95, 15);
61/// builder.try_set(110, 10);
62/// builder.try_set(140, 12);
63/// builder.set_len(200);
64/// let rv = RLVector::from(builder);
65///
66/// // BitVec
67/// assert_eq!(rv.len(), 200);
68/// assert!(!rv.is_empty());
69/// assert_eq!(rv.count_ones(), 59);
70/// assert_eq!(rv.count_zeros(), 141);
71/// assert!(rv.get(119));
72/// assert!(!rv.get(120));
73/// for (index, value) in rv.iter().enumerate() {
74/// assert_eq!(value, rv.get(index));
75/// }
76///
77/// // Rank
78/// assert!(rv.supports_rank());
79/// assert_eq!(rv.rank(100), 27);
80/// assert_eq!(rv.rank(130), 47);
81/// assert_eq!(rv.rank_zero(60), 38);
82///
83/// // Select
84/// assert!(rv.supports_select());
85/// assert_eq!(rv.select(24), Some(97));
86/// let mut iter = rv.select_iter(46);
87/// assert_eq!(iter.next(), Some((46, 119)));
88/// assert_eq!(iter.next(), Some((47, 140)));
89/// let v: Vec<(usize, usize)> = rv.one_iter().take(4).collect();
90/// assert_eq!(v, vec![(0, 18), (1, 19), (2, 20), (3, 21)]);
91///
92/// // SelectZero
93/// assert!(rv.supports_select_zero());
94/// assert_eq!(rv.select_zero(130), Some(189));
95/// let mut iter = rv.select_zero_iter(72);
96/// assert_eq!(iter.next(), Some((72, 94)));
97/// assert_eq!(iter.next(), Some((73, 120)));
98/// let v: Vec<(usize, usize)> = rv.zero_iter().take(4).collect();
99/// assert_eq!(v, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
100///
101/// // PredSucc
102/// assert!(rv.supports_pred_succ());
103/// assert!(rv.predecessor(17).next().is_none());
104/// assert_eq!(rv.predecessor(18).next(), Some((0, 18)));
105/// assert_eq!(rv.predecessor(40).next(), Some((21, 39)));
106/// assert_eq!(rv.successor(139).next(), Some((47, 140)));
107/// assert_eq!(rv.successor(140).next(), Some((47, 140)));
108/// assert!(rv.successor(152).next().is_none());
109/// ```
110///
111/// # Notes
112///
113/// * `RLVector` never panics from I/O errors.
114/// * All `RLVector` queries are always enabled without additional support structures.
115#[derive(Clone, Debug, PartialEq, Eq)]
116pub struct RLVector {
117 len: usize,
118 ones: usize,
119 rank_index: SampleIndex,
120 select_index: SampleIndex,
121 select_zero_index: SampleIndex,
122 // (ones, bits) up to the start of each block.
123 samples: IntVector,
124 // Concatenated blocks.
125 data: IntVector,
126}
127
128impl RLVector {
129 /// Number of bits in a code unit.
130 pub const CODE_SIZE: usize = 4;
131
132 // Number of data bits in a code unit.
133 const CODE_SHIFT: usize = Self::CODE_SIZE - 1;
134
135 // If this bit is set in a code unit, the encoding continues in the next unit.
136 const CODE_FLAG: u64 = 1 << Self::CODE_SHIFT;
137
138 // Largest value that can be stored in a single code unit.
139 const CODE_MASK: u64 = (1 << Self::CODE_SHIFT) - 1;
140
141 /// Number of code units in a block.
142 pub const BLOCK_SIZE: usize = 64;
143
144 /// Returns a copy of the source bitvector as `RLVector`.
145 ///
146 /// The copy is created by iterating over the set bits using [`Select::one_iter`].
147 /// [`From`] implementations from other bitvector types should generally use this function.
148 ///
149 /// # Examples
150 ///
151 /// ```
152 /// use simple_sds_sbwt::bit_vector::BitVector;
153 /// use simple_sds_sbwt::ops::BitVec;
154 /// use simple_sds_sbwt::rl_vector::RLVector;
155 /// use std::iter::FromIterator;
156 ///
157 /// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
158 /// let bv = BitVector::from_iter(source);
159 /// let rv = RLVector::copy_bit_vec(&bv);
160 /// assert_eq!(rv.len(), bv.len());
161 /// assert_eq!(rv.count_ones(), bv.count_ones());
162 /// ```
163 pub fn copy_bit_vec<'a, T: BitVec<'a> + Select<'a>>(source: &'a T) -> Self {
164 let mut builder = RLBuilder::new();
165 for (_, index) in source.one_iter() {
166 unsafe { builder.set_bit_unchecked(index); }
167 }
168 builder.set_len(source.len());
169 RLVector::from(builder)
170 }
171
172 /// Returns an iterator over the runs of set bits in the bitvector.
173 ///
174 /// See [`RunIter`] for an example.
175 pub fn run_iter(&self) -> RunIter<'_> {
176 RunIter {
177 parent: self,
178 offset: 0,
179 pos: (0, 0),
180 limit: self.ones_after(0),
181 }
182 }
183
184 // Returns the number of blocks in the encoding.
185 fn blocks(&self) -> usize {
186 self.samples.len() / 2
187 }
188
189 // Returns the number of set bits after the given block.
190 fn ones_after(&self, block: usize) -> usize {
191 if block + 1 < self.blocks() {
192 self.samples.get(2 * (block + 1)) as usize
193 } else {
194 self.count_ones()
195 }
196 }
197
198 // Decodes a value starting from `offset` and returns (value, new offset).
199 fn decode(&self, offset: usize) -> (usize, usize) {
200 let mut value = 0;
201 let mut offset = offset;
202 let mut shift = 0;
203 loop {
204 let code = self.data.get(offset);
205 offset += 1;
206 value += ((code & Self::CODE_MASK) << shift) as usize;
207 shift += Self::CODE_SHIFT;
208 if code & Self::CODE_FLAG == 0 {
209 return (value, offset);
210 }
211 }
212 }
213
214 // Returns the identifier of the last block `i` in the range where `f(i) <= value`.
215 fn block_for<F: Fn(usize) -> usize>(low: usize, high: usize, value: usize, f: F) -> usize {
216 let mut low = low;
217 let mut high = high;
218 while high - low > 1 {
219 let mid = low + (high - low) / 2;
220 let candidate = f(mid);
221 if candidate <= value {
222 low = mid;
223 } else {
224 high = mid;
225 }
226 }
227 low
228 }
229
230 // Returns an iterator covering the given block.
231 fn iter_for_block(&self, block: usize) -> RunIter<'_> {
232 let pos = if self.samples.is_empty() { (0, 0) } else { (self.samples.get(2 * block) as usize, self.samples.get(2 * block + 1) as usize) };
233 RunIter {
234 parent: self,
235 offset: block * Self::BLOCK_SIZE,
236 pos,
237 limit: self.ones_after(block),
238 }
239 }
240
241 // Returns an iterator covering the block that contains the given bit, or an empty iterator if there is no such bit.
242 fn iter_for_bit(&self, index: usize) -> RunIter<'_> {
243 if index >= self.len() {
244 return RunIter::empty_iter(self);
245 }
246
247 let range = self.rank_index.range(index);
248 let block = Self::block_for(range.start, range.end, index, |i| self.samples.get(2 * i + 1) as usize);
249 self.iter_for_block(block)
250 }
251
252 // Returns an iterator covering the block that contains the given set bit, or an empty iterator if there is no such bit.
253 fn iter_for_one(&self, rank: usize) -> RunIter<'_> {
254 if rank >= self.count_ones() {
255 return RunIter::empty_iter(self);
256 }
257
258 let range = self.select_index.range(rank);
259 let block = Self::block_for(range.start, range.end, rank, |i| self.samples.get(2 * i) as usize);
260 self.iter_for_block(block)
261 }
262
263 // Returns an iterator covering the block that contains the given unset bit, or an empty iterator if there is no such bit.
264 fn iter_for_zero(&self, rank: usize) -> RunIter<'_> {
265 if rank >= self.count_zeros() {
266 return RunIter::empty_iter(self);
267 }
268
269 let range = self.select_zero_index.range(rank);
270 let block = Self::block_for(range.start, range.end, rank, |i| (self.samples.get(2 * i + 1) - self.samples.get(2 * i)) as usize);
271 self.iter_for_block(block)
272 }
273}
274
275//-----------------------------------------------------------------------------
276
277/// Space-efficient [`RLVector`] construction.
278///
279/// `RLBuilder` builds an [`RLVector`] incrementally.
280/// Bits must be set in order, and they cannot be unset later.
281/// Set bits are combined into maximal runs.
282/// Once the construction is finished, the builder can be converted into an [`RLVector`] using the [`From`] trait.
283///
284/// # Examples
285///
286/// ```
287/// use simple_sds_sbwt::ops::BitVec;
288/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
289///
290/// let mut builder = RLBuilder::new();
291///
292/// builder.try_set(18, 22);
293/// assert_eq!(builder.len(), 40);
294///
295/// // Combine two runs into one.
296/// builder.try_set(95, 15);
297/// builder.try_set(110, 10);
298///
299/// // Trying to set earlier bits will fail.
300/// assert!(builder.try_set(100, 1).is_err());
301///
302/// builder.try_set(140, 12);
303///
304/// // Append a final run of unset bits.
305/// builder.set_len(200);
306///
307/// let rv = RLVector::from(builder);
308/// assert_eq!(rv.len(), 200);
309/// ```
310#[derive(Clone, Debug)]
311pub struct RLBuilder {
312 len: usize,
313 ones: usize,
314 // Position after the last encoded run.
315 tail: usize,
316 run: (usize, usize),
317 samples: Vec<(usize, usize)>,
318 data: IntVector,
319}
320
321impl RLBuilder {
322 /// Returns an empty `RLBuilder`.
323 pub fn new() -> Self {
324 RLBuilder::default()
325 }
326
327 /// Returns the length of the bitvector.
328 ///
329 /// This is the first position that can be set.
330 pub fn len(&self) -> usize {
331 self.len
332 }
333
334 /// Returns `true` if the bitvector is empty.
335 ///
336 /// Makes Clippy happy.
337 pub fn is_empty(&self) -> bool {
338 self.len() == 0
339 }
340
341 /// Returns the number of set bits in the bitvector.
342 pub fn count_ones(&self) -> usize {
343 self.ones
344 }
345
346 /// Returns the number of unset bits in the bitvector.
347 pub fn count_zeros(&self) -> usize {
348 self.len() - self.count_ones()
349 }
350
351 // Returns the position after the last encoded run.
352 fn tail(&self) -> usize {
353 self.tail
354 }
355
356 // Returns the number of blocks in the encoding.
357 fn blocks(&self) -> usize {
358 self.samples.len()
359 }
360
361 /// Encodes a run of set bits of length `len` starting at position `start`.
362 ///
363 /// Does nothing if `len == 0`.
364 /// Returns [`Err`] if `start < self.len()` of `start + len > usize::MAX`.
365 ///
366 /// # Arguments
367 ///
368 /// * `start`: Starting position of the run.
369 /// * `len`: Length of the run.
370 pub fn try_set(&mut self, start: usize, len: usize) -> Result<(), String> {
371 if start < self.len() {
372 return Err(format!("RLBuilder: Cannot set bit {} when length is {}", start, self.len()));
373 }
374 if usize::MAX - len < start {
375 return Err(format!("RLBuilder: A run of length {} starting at {} exceeds the maximum length of a bitvector", start, len));
376 }
377 unsafe { self.set_run_unchecked(start, len); }
378 Ok(())
379 }
380
381 /// Sets the specified bit in the bitvector.
382 ///
383 /// # Safety
384 ///
385 /// Behavior is undefined if `index < self.len()`.
386 pub unsafe fn set_bit_unchecked(&mut self, index: usize) {
387 self.set_run_unchecked(index, 1);
388 }
389
390 /// Encodes a run of set bits of length `len` starting at position `start`.
391 ///
392 /// Does nothing if `len == 0`.
393 ///
394 /// # Arguments
395 ///
396 /// * `start`: Starting position of the run.
397 /// * `len`: Length of the run.
398 ///
399 /// # Safety
400 ///
401 /// Behavior is undefined if `start < self.len()` of `start + len > usize::MAX`.
402 pub unsafe fn set_run_unchecked(&mut self, start: usize, len: usize) {
403 if len == 0 {
404 return;
405 }
406 if start == self.len() {
407 self.len += len;
408 self.ones += len;
409 self.run.1 += len;
410 } else {
411 self.flush();
412 self.len = start + len;
413 self.ones += len;
414 self.run = (start, len);
415 }
416 }
417
418 /// Sets the length of the bitvector to `len` bits.
419 ///
420 /// No effect if `self.len() >= len`.
421 /// This is intended for appending a final run of unset bits.
422 pub fn set_len(&mut self, len: usize) {
423 if len > self.len() {
424 self.flush();
425 self.len = len;
426 }
427 }
428
429 // Encodes the current run if necessary and sets the active run to `(self.len(), 0)`.
430 fn flush(&mut self) {
431 if self.run.1 == 0 {
432 return;
433 }
434
435 // Add a new block if there is not enough space for the run in the current block.
436 let units_needed = Self::code_len(self.run.0 - self.tail()) + Self::code_len(self.run.1 - 1);
437 if self.data.len() + units_needed > self.blocks() * RLVector::BLOCK_SIZE {
438 self.data.resize(self.blocks() * RLVector::BLOCK_SIZE, 0);
439 let sample = (self.ones - self.run.1, self.tail);
440 self.samples.push(sample);
441 }
442
443 // Encode the run and update the statistics.
444 self.encode(self.run.0 - self.tail());
445 self.encode(self.run.1 - 1);
446 self.tail = self.run.0 + self.run.1;
447 self.run = (self.len(), 0);
448 }
449
450 // Encodes the given value.
451 fn encode(&mut self, value: usize) {
452 let mut value = value as u64;
453 while value > RLVector::CODE_MASK {
454 self.data.push((value & RLVector::CODE_MASK) | RLVector::CODE_FLAG);
455 value = value >> RLVector::CODE_SHIFT;
456 }
457 self.data.push(value);
458 }
459
460 // Number of code units required for encoding the value.
461 fn code_len(value: usize) -> usize {
462 bits::div_round_up(bits::bit_len(value as u64), RLVector::CODE_SHIFT)
463 }
464}
465
466impl Default for RLBuilder {
467 fn default() -> Self {
468 RLBuilder {
469 len: 0,
470 ones: 0,
471 tail: 0,
472 run: (0, 0),
473 samples: Vec::new(),
474 data: IntVector::new(RLVector::CODE_SIZE).unwrap(),
475 }
476 }
477}
478
479impl From<RLBuilder> for RLVector {
480 fn from(builder: RLBuilder) -> Self {
481 let mut builder = builder;
482 builder.flush();
483
484 // Build indexes for narrowing down binary search ranges.
485 let rank_index = SampleIndex::new(builder.samples.iter().map(|(_, bits)| *bits), builder.len());
486 let select_index = SampleIndex::new(builder.samples.iter().map(|(ones, _)| *ones), builder.count_ones());
487 let select_zero_index = SampleIndex::new(builder.samples.iter().map(|(ones, bits)| bits - ones), builder.count_zeros());
488
489 // Compress the samples.
490 let max_value = builder.samples.last().unwrap_or(&(0, 0)).1;
491 let mut samples = IntVector::with_capacity(2 * builder.blocks(), bits::bit_len(max_value as u64)).unwrap();
492 for (ones, bits) in builder.samples.iter() {
493 samples.push(*ones as u64);
494 samples.push(*bits as u64);
495 }
496
497 RLVector {
498 len: builder.len(),
499 ones: builder.count_ones(),
500 rank_index,
501 select_index,
502 select_zero_index,
503 samples: samples,
504 data: builder.data,
505 }
506 }
507}
508
509//-----------------------------------------------------------------------------
510
511/// A read-only iterator over the runs in [`RLVector`].
512///
513/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
514/// The first value is the starting position of a maximal run of set bits, and the second value is its length.
515///
516/// Most [`RLVector`] queries use this iterator for iterating over the runs in a block.
517///
518/// # Examples
519///
520/// ```
521/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
522///
523/// let mut builder = RLBuilder::new();
524/// builder.try_set(18, 22);
525/// builder.try_set(95, 15);
526/// builder.try_set(110, 10); // Merge with the previous run.
527/// builder.try_set(140, 12);
528/// builder.set_len(200);
529/// let rv = RLVector::from(builder);
530///
531/// let runs: Vec<(usize, usize)> = rv.run_iter().collect();
532/// assert_eq!(runs, vec![(18, 22), (95, 25), (140, 12)]);
533///
534/// let mut iter = rv.run_iter();
535/// assert_eq!(iter.offset(), 0);
536/// assert_eq!(iter.rank(), 0);
537/// assert_eq!(iter.rank_zero(), 0);
538/// while let Some(_) = iter.next() {}
539/// assert_eq!(iter.offset(), 140 + 12);
540/// assert_eq!(iter.rank(), 22 + 25 + 12);
541/// assert_eq!(iter.rank_zero(), 140 - 22 - 25);
542/// ```
543#[derive(Clone, Debug)]
544pub struct RunIter<'a> {
545 parent: &'a RLVector,
546 // Offset in the encoding.
547 offset: usize,
548 // (rank, index) reached so far.
549 pos: (usize, usize),
550 // Number of ones after the current block.
551 limit: usize,
552}
553
554impl<'a> RunIter<'a> {
555 /// Returns the position in the bitvector after the latest run.
556 pub fn offset(&self) -> usize {
557 self.pos.1
558 }
559
560 /// Returns the number of set bits up to the end of the latest run.
561 pub fn rank(&self) -> usize {
562 self.pos.0
563 }
564
565 /// Returns the number of unset bits up to the end of the latest run.
566 pub fn rank_zero(&self) -> usize {
567 self.offset() - self.rank()
568 }
569
570 // Returns an empty iterator for the parent bitvector.
571 fn empty_iter(parent: &'a RLVector) -> Self {
572 RunIter {
573 parent,
574 offset: parent.data.len(),
575 pos: (parent.count_ones(), parent.len()),
576 limit: parent.count_ones(),
577 }
578 }
579
580 // Returns the position in the bitvector for the set bit of given rank, assuming that it is covered by the current run.
581 fn offset_for(&self, rank: usize) -> usize {
582 self.offset() - (self.rank() - rank)
583 }
584
585 // Returns the rank at a given position in the bitvector, assuming that it is covered by the current run.
586 fn rank_at(&self, index: usize) -> usize {
587 self.rank() - (self.offset() - index)
588 }
589
590 // Like `next()`, but only advances if the `advance()` returns `true` for the next run.
591 fn advance_if<F: FnMut(Option<<Self as Iterator>::Item>) -> bool>(&mut self, mut advance: F) -> Option<<Self as Iterator>::Item> {
592 if self.offset >= self.parent.data.len() {
593 // We are at the end anyway, but we will call `advance()` to let the user know
594 // what the next run would be.
595 let _ = advance(None);
596 return None;
597 }
598
599 // Move to the next block if we have run out of ones.
600 let mut offset = self.offset;
601 let mut limit = self.limit;
602 if self.rank() >= self.limit {
603 let block = bits::div_round_up(offset, RLVector::BLOCK_SIZE);
604 offset = block * RLVector::BLOCK_SIZE;
605 if block >= self.parent.blocks() {
606 if advance(None) {
607 self.offset = offset;
608 }
609 return None;
610 }
611 limit = self.parent.ones_after(block);
612 }
613
614 // Decode the next run.
615 let (gap, offset) = self.parent.decode(offset);
616 let start = self.offset() + gap;
617 let (len, offset) = self.parent.decode(offset);
618 let result = Some((start, len + 1));
619
620 if advance(result) {
621 self.offset = offset;
622 self.limit = limit;
623 self.pos.0 += len + 1;
624 self.pos.1 = start + len + 1;
625 }
626 result
627 }
628}
629
630impl<'a> Iterator for RunIter<'a> {
631 // (start, length)
632 type Item = (usize, usize);
633
634 fn next(&mut self) -> Option<Self::Item> {
635 self.advance_if(|_| true)
636 }
637}
638
639impl<'a> FusedIterator for RunIter<'a> {}
640
641//-----------------------------------------------------------------------------
642
643/// A read-only iterator over [`RLVector`].
644///
645/// The type of `Item` is [`bool`].
646///
647/// # Examples
648///
649/// ```
650/// use simple_sds_sbwt::ops::{BitVec};
651/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
652///
653/// let mut builder = RLBuilder::new();
654/// builder.try_set(18, 22);
655/// builder.try_set(95, 15);
656/// builder.try_set(110, 10);
657/// builder.try_set(140, 12);
658/// builder.set_len(200);
659/// let rv = RLVector::from(builder);
660///
661/// assert_eq!(rv.iter().len(), rv.len());
662/// for (index, value) in rv.iter().enumerate() {
663/// assert_eq!(value, rv.get(index));
664/// }
665/// ```
666#[derive(Clone, Debug)]
667pub struct Iter<'a> {
668 iter: RunIter<'a>,
669 // Run from the iterator.
670 run: Option<(usize, usize)>,
671 // Next bitvector offset.
672 pos: usize,
673}
674
675impl<'a> Iterator for Iter<'a> {
676 type Item = bool;
677
678 fn next(&mut self) -> Option<Self::Item> {
679 // Read the next run if we have processed the current one.
680 if let Some((start, len)) = self.run {
681 if self.pos >= start + len {
682 self.run = self.iter.next();
683 }
684 }
685
686 // Determine the next bit and advance.
687 match self.run {
688 Some((start, _)) => {
689 self.pos += 1;
690 Some(self.pos > start)
691 },
692 None => {
693 if self.pos >= self.iter.parent.len() {
694 None
695 } else {
696 self.pos += 1;
697 Some(false)
698 }
699 },
700 }
701 }
702
703 #[inline]
704 fn size_hint(&self) -> (usize, Option<usize>) {
705 let remaining = self.iter.parent.len() - self.pos;
706 (remaining, Some(remaining))
707 }
708}
709
710impl<'a> ExactSizeIterator for Iter<'a> {}
711
712impl<'a> FusedIterator for Iter<'a> {}
713
714//-----------------------------------------------------------------------------
715
716impl<'a> BitVec<'a> for RLVector {
717 type Iter = Iter<'a>;
718
719 #[inline]
720 fn len(&self) -> usize {
721 self.len
722 }
723
724 #[inline]
725 fn count_ones(&self) -> usize {
726 self.ones
727 }
728
729 fn get(&self, index: usize) -> bool {
730 let mut iter = self.iter_for_bit(index);
731 while let Some((start, _)) = iter.next() {
732 if start > index {
733 return false;
734 }
735 if index < iter.offset() {
736 return true;
737 }
738 }
739
740 // Final run of unset bits.
741 false
742 }
743
744 fn iter(&'a self) -> Self::Iter {
745 Self::Iter {
746 iter: self.run_iter(),
747 run: Some((0, 0)),
748 pos: 0,
749 }
750 }
751}
752
753//-----------------------------------------------------------------------------
754
755impl<'a> Rank<'a> for RLVector {
756 fn supports_rank(&self) -> bool {
757 true
758 }
759
760 fn enable_rank(&mut self) {}
761
762 fn rank(&self, index: usize) -> usize {
763 let mut iter = self.iter_for_bit(index);
764 while let Some((start, len)) = iter.next() {
765 // We reached `index` but this run is too late to affect the rank.
766 if start >= index {
767 return iter.rank() - len;
768 }
769 // We reached `index` and a part of this run affects the rank.
770 if iter.offset() >= index {
771 return iter.rank_at(index);
772 }
773 }
774
775 iter.rank()
776 }
777}
778
779//-----------------------------------------------------------------------------
780
781/// An iterator over the set bits in [`RLVector`].
782///
783/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
784/// This can be interpreted as:
785///
786/// * `(index, value)` or `(i, select(i))` in the integer array; or
787/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == true`.
788///
789/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
790/// Queries may create iterators in the middle of the bitvector.
791///
792/// # Examples
793///
794/// ```
795/// use simple_sds_sbwt::ops::{BitVec, Select};
796/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
797///
798/// let mut builder = RLBuilder::new();
799/// builder.try_set(18, 22);
800/// builder.try_set(95, 15);
801/// builder.try_set(110, 10);
802/// builder.try_set(140, 12);
803/// builder.set_len(200);
804/// let rv = RLVector::from(builder);
805///
806/// let mut iter = rv.one_iter();
807/// assert_eq!(iter.len(), rv.count_ones());
808/// assert_eq!(iter.next(), Some((0, 18)));
809/// assert_eq!(iter.next(), Some((1, 19)));
810/// assert_eq!(iter.next(), Some((2, 20)));
811/// ```
812#[derive(Clone, Debug)]
813pub struct OneIter<'a> {
814 iter: RunIter<'a>,
815 // Did we get a `None` from the iterator?
816 got_none: bool,
817 // Rank of the next set bit.
818 rank: usize,
819}
820
821impl<'a> OneIter<'a> {
822 // Returns an empty iterator for the parent bitvector.
823 fn empty_iter(parent: &'a RLVector) -> Self {
824 OneIter {
825 iter: RunIter::empty_iter(parent),
826 got_none: true,
827 rank: parent.count_ones(),
828 }
829 }
830}
831
832impl<'a> Iterator for OneIter<'a> {
833 type Item = (usize, usize);
834
835 fn next(&mut self) -> Option<Self::Item> {
836 // Read the next run if we have processed the current one.
837 if !self.got_none && self.rank >= self.iter.rank() {
838 self.got_none = self.iter.next().is_none();
839 }
840
841 // Determine the next set bit and advance.
842 if self.got_none {
843 None
844 } else {
845 let result = (self.rank, self.iter.offset_for(self.rank));
846 self.rank += 1;
847 Some(result)
848 }
849 }
850
851 #[inline]
852 fn size_hint(&self) -> (usize, Option<usize>) {
853 let remaining = self.iter.parent.count_ones() - self.rank;
854 (remaining, Some(remaining))
855 }
856}
857
858impl<'a> ExactSizeIterator for OneIter<'a> {}
859
860impl<'a> FusedIterator for OneIter<'a> {}
861
862//-----------------------------------------------------------------------------
863
864/// An iterator over the unset bits in [`RLVector`].
865///
866/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
867/// This can be interpreted as:
868///
869/// * `(index, value)` or `(i, select(i))` in the integer array of the complement; or
870/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == false`.
871///
872/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
873/// Queries may create iterators in the middle of the bitvector.
874///
875/// # Examples
876///
877/// ```
878/// use simple_sds_sbwt::ops::{BitVec, SelectZero};
879/// use simple_sds_sbwt::rl_vector::{RLVector, RLBuilder};
880///
881/// let mut builder = RLBuilder::new();
882/// builder.try_set(18, 22);
883/// builder.try_set(95, 15);
884/// builder.try_set(110, 10);
885/// builder.try_set(140, 12);
886/// builder.set_len(200);
887/// let rv = RLVector::from(builder);
888///
889/// let mut iter = rv.zero_iter();
890/// assert_eq!(iter.len(), rv.count_zeros());
891/// assert_eq!(iter.next(), Some((0, 0)));
892/// assert_eq!(iter.next(), Some((1, 1)));
893/// assert_eq!(iter.next(), Some((2, 2)));
894/// ```
895#[derive(Clone, Debug)]
896pub struct ZeroIter<'a> {
897 iter: RunIter<'a>,
898 // Did we get a `None` from the iterator?
899 got_none: bool,
900 // (rank, index) for the next unset bit.
901 pos: (usize, usize),
902}
903
904impl<'a> Iterator for ZeroIter<'a> {
905 type Item = (usize, usize);
906
907 fn next(&mut self) -> Option<Self::Item> {
908 // Read the next run of set bits if we have reached the current one.
909 if !self.got_none && self.pos.0 >= self.iter.rank_zero() {
910 self.pos.1 = self.iter.offset();
911 self.got_none = self.iter.next().is_none();
912 }
913
914 // Determine the next bit and advance.
915 if self.pos.0 >= self.iter.parent.count_zeros() {
916 None
917 } else {
918 let result = self.pos;
919 self.pos.0 += 1; self.pos.1 += 1;
920 Some(result)
921 }
922 }
923
924 #[inline]
925 fn size_hint(&self) -> (usize, Option<usize>) {
926 let remaining = self.iter.parent.count_zeros() - self.pos.0;
927 (remaining, Some(remaining))
928 }
929}
930
931impl<'a> ExactSizeIterator for ZeroIter<'a> {}
932
933impl<'a> FusedIterator for ZeroIter<'a> {}
934
935//-----------------------------------------------------------------------------
936
937impl<'a> Select<'a> for RLVector {
938 type OneIter = OneIter<'a>;
939
940 fn supports_select(&self) -> bool {
941 true
942 }
943
944 fn enable_select(&mut self) {}
945
946 fn one_iter(&'a self) -> Self::OneIter {
947 Self::OneIter {
948 iter: self.run_iter(),
949 got_none: false,
950 rank: 0,
951 }
952 }
953
954 fn select(&'a self, rank: usize) -> Option<usize> {
955 if rank >= self.count_ones() {
956 return None;
957 }
958
959 let mut iter = self.iter_for_one(rank);
960 while iter.rank() <= rank {
961 let _ = iter.next();
962 }
963 Some(iter.offset_for(rank))
964 }
965
966 fn select_iter(&'a self, rank: usize) -> Self::OneIter {
967 if rank >= self.count_ones() {
968 return Self::OneIter::empty_iter(self);
969 }
970
971 let mut iter = self.iter_for_one(rank);
972 while iter.rank() < rank {
973 let _ = iter.next();
974 }
975 Self::OneIter {
976 iter,
977 got_none: false,
978 rank,
979 }
980 }
981}
982
983//-----------------------------------------------------------------------------
984
985impl<'a> SelectZero<'a> for RLVector {
986 type ZeroIter = ZeroIter<'a>;
987
988 fn supports_select_zero(&self) -> bool {
989 true
990 }
991
992 fn enable_select_zero(&mut self) {}
993
994 fn zero_iter(&'a self) -> Self::ZeroIter {
995 let mut iter = self.run_iter();
996 // We must take the first run instead of using (0, 0).
997 // Otherwise `ZeroIter` would not work if the first run starts at 0.
998 let got_none = iter.next().is_none();
999 Self::ZeroIter {
1000 iter,
1001 got_none,
1002 pos: (0, 0),
1003 }
1004 }
1005
1006 fn select_zero(&'a self, rank: usize) -> Option<usize> {
1007 if rank >= self.count_zeros() {
1008 return None;
1009 }
1010
1011 // Determine the number of set bits before the relevant run of unset bits.
1012 let mut iter = self.iter_for_zero(rank);
1013 let mut ones = iter.rank();
1014 loop {
1015 match iter.next() {
1016 Some(_) => {
1017 if iter.rank_zero() > rank {
1018 return Some(rank + ones);
1019 }
1020 ones = iter.rank();
1021 },
1022 None => {
1023 return Some(rank + ones);
1024 },
1025 }
1026 }
1027 }
1028
1029 fn select_zero_iter(&'a self, rank: usize) -> Self::ZeroIter {
1030 if rank >= self.count_zeros() {
1031 return Self::ZeroIter {
1032 iter: RunIter::empty_iter(self),
1033 got_none: true,
1034 pos: (self.count_zeros(), self.len()),
1035 }
1036 }
1037
1038 // Determine the number of set bits before the relevant run of unset bits.
1039 let mut iter = self.iter_for_zero(rank);
1040 let mut ones = iter.rank();
1041 loop {
1042 match iter.next() {
1043 Some(_) => {
1044 if iter.rank_zero() > rank {
1045 return Self::ZeroIter {
1046 iter,
1047 got_none: false,
1048 pos: (rank, rank + ones),
1049 };
1050 }
1051 ones = iter.rank();
1052 },
1053 None => {
1054 return Self::ZeroIter {
1055 iter,
1056 got_none: true,
1057 pos: (rank, rank + ones),
1058 }
1059 },
1060 }
1061 }
1062 }
1063}
1064
1065//-----------------------------------------------------------------------------
1066
1067impl<'a> PredSucc<'a> for RLVector {
1068 type OneIter = OneIter<'a>;
1069
1070 fn supports_pred_succ(&self) -> bool {
1071 true
1072 }
1073
1074 fn enable_pred_succ(&mut self) {}
1075
1076 fn predecessor(&'a self, value: usize) -> Self::OneIter {
1077 if self.is_empty() {
1078 return Self::OneIter::empty_iter(self);
1079 }
1080
1081 // A predecessor past the end is the same as predecessor at the end.
1082 let value = cmp::min(value, self.len() - 1);
1083
1084 // Find the block that would contain the value. Then advance the iterator
1085 // until the next run starts after the value we are interested in.
1086 let mut iter = self.iter_for_bit(value);
1087 let mut iterate = true;
1088 while iterate {
1089 let _ = iter.advance_if(|next| {
1090 if next.is_none() {
1091 iterate = false;
1092 return false;
1093 }
1094 let (start, _) = next.unwrap();
1095
1096 iterate = start <= value;
1097 return iterate;
1098 });
1099 }
1100
1101 // If we are before the first run, there is no predecessor.
1102 if iter.rank() == 0 {
1103 return Self::OneIter::empty_iter(self);
1104 }
1105
1106 let rank = if iter.offset() > value { iter.rank_at(value) } else { iter.rank() - 1 };
1107 Self::OneIter {
1108 iter,
1109 got_none: false,
1110 rank,
1111 }
1112 }
1113
1114 fn successor(&'a self, value: usize) -> Self::OneIter {
1115 if value >= self.len() {
1116 return Self::OneIter::empty_iter(self);
1117 }
1118
1119 // Find the block that would contain the value. Then advance the iterator
1120 // until the last run covers a position at or after the value we are interested in.
1121 let mut iter = self.iter_for_bit(value);
1122 let rank;
1123 loop {
1124 let result = iter.next();
1125 if result.is_none() {
1126 return Self::OneIter::empty_iter(self);
1127 }
1128 let (start, len) = result.unwrap();
1129 if start > value {
1130 rank = iter.rank() - len;
1131 break;
1132 }
1133 if iter.offset() > value {
1134 rank = iter.rank_at(value);
1135 break;
1136 }
1137 }
1138
1139 Self::OneIter {
1140 iter,
1141 got_none: false,
1142 rank,
1143 }
1144 }
1145}
1146
1147//-----------------------------------------------------------------------------
1148
1149impl Serialize for RLVector {
1150 fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1151 self.len.serialize(writer)?;
1152 self.ones.serialize(writer)?;
1153 Ok(())
1154 }
1155
1156 fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1157 self.samples.serialize(writer)?;
1158 self.data.serialize(writer)?;
1159 Ok(())
1160 }
1161
1162 fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
1163 let len = usize::load(reader)?;
1164 let ones = usize::load(reader)?;
1165 let samples = IntVector::load(reader)?;
1166 let data = IntVector::load(reader)?;
1167
1168 // Sanity checks.
1169 let sample_blocks = samples.len() / 2;
1170 let data_blocks = bits::div_round_up(data.len(), Self::BLOCK_SIZE);
1171 if sample_blocks != data_blocks {
1172 return Err(Error::new(ErrorKind::InvalidData, "Mismatch between number of blocks and samples"));
1173 }
1174
1175 // Rebuild indexes for narrowing down binary search ranges.
1176 let rank_index = SampleIndex::new((0..sample_blocks).map(|block| samples.get(2 * block + 1) as usize), len);
1177 let select_index = SampleIndex::new((0..sample_blocks).map(|block| samples.get(2 * block) as usize), ones);
1178 let select_zero_index = SampleIndex::new((0..sample_blocks).map(|block| (samples.get(2 * block + 1) - samples.get(2 * block)) as usize), len - ones);
1179
1180 let result = RLVector {
1181 len, ones, rank_index, select_index, select_zero_index, samples, data,
1182 };
1183 Ok(result)
1184 }
1185
1186 fn size_in_elements(&self) -> usize {
1187 self.len.size_in_elements() +
1188 self.ones.size_in_elements() +
1189 self.samples.size_in_elements() +
1190 self.data.size_in_elements()
1191 }
1192}
1193
1194//-----------------------------------------------------------------------------