simple_sds_sbwt/sparse_vector.rs
1//! An Elias-Fano encoded array supporting rank, select, and related queries.
2//!
3//! This structure is equivalent to `sd_vector` in [SDSL](https://github.com/simongog/sdsl-lite).
4//! It is also known in the literature as sdarray:
5//!
6//! > Okanohara, Sadakane: Practical Entropy-Compressed Rank/Select Dictionary.
7//! > Proc. ALENEX 2007.
8//! > DOI: [10.1137/1.9781611972870.6](https://doi.org/10.1137/1.9781611972870.6)
9//!
10//! The rule for selecting the number of low bits and buckets is from:
11//!
12//! > Ma, Puglisi, Raman, Zhukova: On Elias-Fano for Rank Queries in FM-Indexes.
13//! > Proc. DCC 2021.
14//! > DOI: [10.1109/DCC50243.2021.00030](https://doi.org/10.1109/DCC50243.2021.00030)
15//!
16//! Assume that we have a bitvector of length `n` with `m` set bits, with `m` much smaller than `n`.
17//! Let `w ≈ log(n) - log(m)`.
18//! In the integer array interpretation (see [`BitVec`]), we take the low `w` bits from each value and store them in an [`IntVector`].
19//! We place the values in buckets by the remaining high bits.
20//! A [`BitVector`] encodes the number of values in each bucket in unary.
21//! If there are `k >= 0` values in a bucket, the bitvector will contain `k` set bits followed by an unset bit.
22//! Then
23//!
24//! > `select(i) = low[i] + ((high.select(i) - i) << w)`.
25//!
26//! Rank, predecessor, and successor queries use `select_zero` on `high` followed by a linear scan.
27//!
28//! The `select_zero` implementation is based on finding the right run of unset bits using binary search with `select`.
29//! It is not particularly efficient.
30//!
31//! We can also support multisets that contain duplicate values (in the integer array interpretation).
32//! Rank/select queries for unset bits do not work correctly with multisets.
33
34use crate::bit_vector::BitVector;
35use crate::int_vector::IntVector;
36use crate::ops::{Vector, Access, BitVec, Rank, Select, PredSucc, SelectZero};
37use crate::raw_vector::{RawVector, AccessRaw};
38use crate::serialize::Serialize;
39use crate::bits;
40
41use std::convert::TryFrom;
42use std::io::{Error, ErrorKind};
43use std::iter::FusedIterator;
44use std::{cmp, io};
45
46#[cfg(test)]
47mod tests;
48
49//-----------------------------------------------------------------------------
50
51/// An immutable Elias-Fano encoded bitvector supporting, rank, select, and related queries.
52///
53/// This structure should be used for sparse bitvectors, where frequency of set bits is low.
54/// For dense bitvectors or when [`SelectZero`] is needed, [`BitVector`] is usually a better choice.
55/// Because most queries require support structures for one of the components, the bitvector itself is immutable.
56/// The maximum length of the vector is approximately [`usize::MAX`] bits.
57///
58/// Conversions between various [`BitVec`] types are possible using the [`From`] trait.
59///
60/// `SparseVector` supports partial multiset semantics.
61/// A multiset bitvector is one that contains duplicate values in the integer array interpretation.
62/// Queries that operate on present values work correctly with a multiset, while [`Rank::rank_zero`] and [`SelectZero`] do not.
63/// Multiset vectors can be built with [`SparseBuilder::multiset`] and [`SparseVector::try_from_iter`].
64///
65/// `SparseVector` implements the following `simple_sds` traits:
66/// * Basic functionality: [`BitVec`]
67/// * Queries and operations: [`Rank`], [`Select`], [`SelectZero`], [`PredSucc`]
68/// * Serialization: [`Serialize`]
69///
70/// # Examples
71///
72/// ```
73/// use simple_sds_sbwt::ops::{BitVec, Rank, Select, SelectZero, PredSucc};
74/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
75/// use std::convert::TryFrom;
76///
77/// let mut builder = SparseBuilder::new(137, 4).unwrap();
78/// builder.set(1); builder.set(33); builder.set(95); builder.set(123);
79/// let sv = SparseVector::try_from(builder).unwrap();
80///
81/// // BitVec
82/// assert_eq!(sv.len(), 137);
83/// assert!(!sv.is_empty());
84/// assert_eq!(sv.count_ones(), 4);
85/// assert_eq!(sv.count_zeros(), 133);
86/// assert!(sv.get(33));
87/// assert!(!sv.get(34));
88/// for (index, value) in sv.iter().enumerate() {
89/// assert_eq!(value, sv.get(index));
90/// }
91///
92/// // Rank
93/// assert!(sv.supports_rank());
94/// assert_eq!(sv.rank(33), 1);
95/// assert_eq!(sv.rank(34), 2);
96/// assert_eq!(sv.rank_zero(65), 63);
97///
98/// // Select
99/// assert!(sv.supports_select());
100/// assert_eq!(sv.select(1), Some(33));
101/// let mut iter = sv.select_iter(2);
102/// assert_eq!(iter.next(), Some((2, 95)));
103/// assert_eq!(iter.next(), Some((3, 123)));
104/// assert!(iter.next().is_none());
105/// let v: Vec<(usize, usize)> = sv.one_iter().collect();
106/// assert_eq!(v, vec![(0, 1), (1, 33), (2, 95), (3, 123)]);
107///
108/// // SelectZero
109/// assert!(sv.supports_select_zero());
110/// assert_eq!(sv.select_zero(35), Some(37));
111/// let mut iter = sv.select_zero_iter(92);
112/// assert_eq!(iter.next(), Some((92, 94)));
113/// assert_eq!(iter.next(), Some((93, 96)));
114///
115/// // PredSucc
116/// assert!(sv.supports_pred_succ());
117/// assert!(sv.predecessor(0).next().is_none());
118/// assert_eq!(sv.predecessor(1).next(), Some((0, 1)));
119/// assert_eq!(sv.predecessor(2).next(), Some((0, 1)));
120/// assert_eq!(sv.successor(122).next(), Some((3, 123)));
121/// assert_eq!(sv.successor(123).next(), Some((3, 123)));
122/// assert!(sv.successor(124).next().is_none());
123/// ```
124///
125/// # Notes
126///
127/// * `SparseVector` never panics from I/O errors.
128/// * All `SparseVector` queries are always enabled without additional support structures.
129#[derive(Clone, Debug, PartialEq, Eq)]
130pub struct SparseVector {
131 len: usize,
132 high: BitVector,
133 low: IntVector,
134}
135
136// Bitvector index encoded as offsets in `high` and `low`.
137#[derive(Clone, Copy, Debug, PartialEq, Eq)]
138struct Pos {
139 high: usize,
140 low: usize,
141}
142
143// Bitvector index encoded as high and low parts.
144#[derive(Clone, Copy, Debug, PartialEq, Eq)]
145struct Parts {
146 high: usize,
147 low: usize,
148}
149
150impl SparseVector {
151 // Stop binary search in `select_zero` when there are at most this many runs left.
152 const BINARY_SEARCH_THRESHOLD: usize = 16;
153
154 /// Returns a copy of the source bitvector as `SparseVector`.
155 ///
156 /// The copy is created by iterating over the set bits using [`Select::one_iter`].
157 /// [`From`] implementations from other bitvector types should generally use this function.
158 ///
159 /// # Examples
160 ///
161 /// ```
162 /// use simple_sds_sbwt::bit_vector::BitVector;
163 /// use simple_sds_sbwt::ops::BitVec;
164 /// use simple_sds_sbwt::sparse_vector::SparseVector;
165 /// use std::iter::FromIterator;
166 ///
167 /// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
168 /// let bv = BitVector::from_iter(source);
169 /// let sv = SparseVector::copy_bit_vec(&bv);
170 /// assert_eq!(sv.len(), bv.len());
171 /// assert_eq!(sv.count_ones(), bv.count_ones());
172 /// assert!(!sv.is_multiset());
173 /// ```
174 pub fn copy_bit_vec<'a, T: BitVec<'a> + Select<'a>>(source: &'a T) -> Self {
175 let mut builder = SparseBuilder::new(source.len(), source.count_ones()).unwrap();
176 for (_, index) in source.one_iter() {
177 unsafe { builder.set_unchecked(index); }
178 }
179 SparseVector::try_from(builder).unwrap()
180 }
181
182 /// Builds a vector from the values in the iterator using multiset semantics.
183 ///
184 /// Returns an error message if the values are not sorted.
185 /// Universe size is set to be barely large enough for the values.
186 ///
187 /// # Examples
188 ///
189 /// ```
190 /// use simple_sds_sbwt::sparse_vector::SparseVector;
191 /// use simple_sds_sbwt::ops::{BitVec, Select};
192 ///
193 /// let source: Vec<usize> = vec![3, 4, 4, 7, 11, 19];
194 /// let sv = SparseVector::try_from_iter(source.iter().cloned()).unwrap();
195 /// assert_eq!(sv.len(), 20);
196 /// assert_eq!(sv.count_ones(), source.len());
197 /// assert!(sv.is_multiset());
198 ///
199 /// for (index, value) in sv.one_iter() {
200 /// assert_eq!(value, source[index]);
201 /// }
202 /// ```
203 pub fn try_from_iter<T: Iterator<Item = usize> + DoubleEndedIterator + ExactSizeIterator>(iter: T) -> Result<SparseVector, &'static str> {
204 let mut iter = iter;
205 let (ones, _) = iter.size_hint();
206 let universe = if let Some(pos) = iter.next_back() { pos + 1 } else { 0 };
207 let mut builder = SparseBuilder::multiset(universe, ones);
208 for pos in iter {
209 builder.try_set(pos)?;
210 }
211 if universe > 0 {
212 builder.try_set(universe - 1)?;
213 }
214 SparseVector::try_from(builder)
215 }
216
217 /// Returns `true` if the vector is a multiset (contains duplicate values).
218 ///
219 /// This method is somewhat expensive, as it iterates over the vector.
220 pub fn is_multiset(&self) -> bool {
221 let mut prev = self.len();
222 for (_, value) in self.one_iter() {
223 if value == prev {
224 return true;
225 }
226 prev = value;
227 }
228 false
229 }
230
231 // Split a bitvector index into high and low parts.
232 fn split(&self, index: usize) -> Parts {
233 Parts {
234 high: index >> self.low.width(),
235 low: index & unsafe { bits::low_set_unchecked(self.low.width()) as usize },
236 }
237 }
238
239 // Get (rank, bitvector index) from the offsets in `high` and `low`.
240 fn combine(&self, pos: Pos) -> (usize, usize) {
241 (pos.low, ((pos.high - pos.low) << self.low.width()) + (self.low.get(pos.low) as usize))
242 }
243
244 // Get the offsets in `high` and `low` for the set bit of the given rank.
245 fn pos(&self, rank: usize) -> Pos {
246 Pos {
247 high: self.high.select(rank).unwrap(),
248 low: rank,
249 }
250 }
251
252 // Get a `Pos` that points to the first value with this high part or to the following
253 // unset bit if no such values exist.
254 fn lower_bound(&self, high_part: usize) -> Pos {
255 if high_part == 0 {
256 Pos { high: 0, low: 0, }
257 } else {
258 let high_offset = self.high.select_zero(high_part - 1).unwrap() + 1;
259 Pos { high: high_offset, low: high_offset - high_part, }
260 }
261 }
262
263 // Get a `Pos` that points to the unset bit after the values with the this high part.
264 fn upper_bound(&self, high_part: usize) -> Pos {
265 let high_offset = self.high.select_zero(high_part).unwrap();
266 Pos { high: high_offset, low: high_offset - high_part, }
267 }
268
269 // Returns (run rank, one_iter past the run) for the run of 0s that contains
270 // unset bit of the given rank.
271 fn find_zero_run(&self, rank: usize) -> (usize, OneIter) {
272 let mut low = 0;
273 let mut high = self.count_ones();
274 let mut result = (0, self.one_iter());
275
276 // Invariant: `self.rank_zero(high) > rank`.
277 while high - low > Self::BINARY_SEARCH_THRESHOLD {
278 let mid = low + (high - low) / 2;
279 let mut iter = self.select_iter(mid);
280 let (_, mid_pos) = iter.next().unwrap();
281 if mid_pos - mid <= rank {
282 result = (mid + 1, iter);
283 low = mid + 1;
284 } else {
285 high = mid;
286 }
287 }
288
289 // Once we have only a few runs left, a linear scan is faster than
290 // `high.select()`.
291 let mut iter = result.1.clone();
292 while let Some((mid, mid_pos)) = iter.next() {
293 if mid_pos - mid <= rank {
294 result = (mid + 1, iter.clone());
295 } else {
296 break;
297 }
298 }
299
300 result
301 }
302}
303
304//-----------------------------------------------------------------------------
305
306/// Space-efficient [`SparseVector`] construction.
307///
308/// A `SparseBuilder` allocates the data structures based on universe size (bitvector length) and the number of set bits.
309/// The set bits must then be indicated in order using [`SparseBuilder::set`] or the [`Extend`] trait.
310/// Once the builder is full, it can be converted into a [`SparseVector`] using the [`TryFrom`] trait.
311/// The conversion will not fail if the builder is full.
312///
313/// Setting a bit `i` fails if the builder is full or the index is too small (`i < self.next_index()`) or too large (`i > self.universe()`).
314/// [`Extend::extend`] will panic in such situations.
315///
316/// # Examples
317///
318/// ```
319/// use simple_sds_sbwt::ops::BitVec;
320/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
321/// use std::convert::TryFrom;
322///
323/// let mut builder = SparseBuilder::new(300, 5).unwrap();
324/// assert_eq!(builder.len(), 0);
325/// assert_eq!(builder.capacity(), 5);
326/// assert_eq!(builder.universe(), 300);
327/// assert_eq!(builder.next_index(), 0);
328/// assert!(!builder.is_full());
329/// assert!(!builder.is_multiset());
330///
331/// builder.set(12);
332/// assert_eq!(builder.len(), 1);
333/// assert_eq!(builder.next_index(), 13);
334///
335/// // This will return an error because the index is too small.
336/// let _ = builder.try_set(10);
337/// assert_eq!(builder.len(), 1);
338/// assert_eq!(builder.next_index(), 13);
339///
340/// let v: Vec<usize> = vec![24, 48, 96, 192];
341/// builder.extend(v);
342/// assert_eq!(builder.len(), 5);
343/// assert!(builder.is_full());
344///
345/// let sv = SparseVector::try_from(builder).unwrap();
346/// assert_eq!(sv.len(), 300);
347/// assert_eq!(sv.count_ones(), 5);
348/// ```
349#[derive(Clone, Debug)]
350pub struct SparseBuilder {
351 data: SparseVector,
352 // We need a mutable bitvector during construction.
353 high: RawVector,
354 // Number of bits already set.
355 len: usize,
356 // The first index that can be set.
357 next: usize,
358 // `0` if we are building a multiset, `1` if not.
359 increment: usize,
360}
361
362impl SparseBuilder {
363 /// Returns an empty `SparseBuilder` without multiset semantics.
364 ///
365 /// Returns [`Err`] if `ones > universe`.
366 ///
367 /// # Arguments
368 ///
369 /// * `universe`: Universe size or length of the bitvector.
370 /// * `ones`: Number of bits that will be set in the bitvector.
371 pub fn new(universe: usize, ones: usize) -> Result<SparseBuilder, &'static str> {
372 if ones > universe {
373 return Err("Number of set bits is greater than universe size");
374 }
375
376 let (width, high_len) = Self::get_params(universe, ones);
377 let low = IntVector::with_len(ones, width, 0).unwrap();
378 let data = SparseVector {
379 len: universe,
380 high: BitVector::from(RawVector::new()),
381 low,
382 };
383
384 let high = RawVector::with_len(high_len, false);
385 Ok(SparseBuilder {
386 data,
387 high,
388 len: 0,
389 next: 0,
390 increment: 1,
391 })
392 }
393
394 /// Returns an empty `SparseBuilder` with multiset semantics.
395 ///
396 /// # Arguments
397 ///
398 /// * `universe`: Universe size or length of the bitvector.
399 /// * `ones`: Number of bits that will be set in the bitvector.
400 ///
401 /// # Examples
402 ///
403 /// ```
404 /// use simple_sds_sbwt::ops::BitVec;
405 /// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
406 /// use std::convert::TryFrom;
407 ///
408 /// let mut builder = SparseBuilder::multiset(120, 3);
409 /// assert_eq!(builder.capacity(), 3);
410 /// assert_eq!(builder.universe(), 120);
411 /// assert!(builder.is_multiset());
412 ///
413 /// builder.set(12);
414 /// builder.set(24);
415 /// builder.set(24);
416 /// assert!(builder.is_full());
417 ///
418 /// let sv = SparseVector::try_from(builder).unwrap();
419 /// assert_eq!(sv.len(), 120);
420 /// assert_eq!(sv.count_ones(), 3);
421 /// assert!(sv.is_multiset());
422 /// ```
423 pub fn multiset(universe: usize, ones: usize) -> SparseBuilder {
424 let (width, high_len) = Self::get_params(universe, ones);
425 let low = IntVector::with_len(ones, width, 0).unwrap();
426 let data = SparseVector {
427 len: universe,
428 high: BitVector::from(RawVector::new()),
429 low,
430 };
431
432 let high = RawVector::with_len(high_len, false);
433 SparseBuilder {
434 data,
435 high,
436 len: 0,
437 next: 0,
438 increment: 0,
439 }
440 }
441
442 /// Returns `true` if the builder is using multiset semantics.
443 pub fn is_multiset(&self) -> bool {
444 self.increment == 0
445 }
446
447 // Returns `(low.width(), high.len())`. Now works with overfull multisets as well.
448 fn get_params(universe: usize, ones: usize) -> (usize, usize) {
449 let mut low_width: usize = 1;
450 if ones > 0 && ones <= universe {
451 let ideal_width = ((universe as f64 * 2.0_f64.ln()) / (ones as f64)).log2();
452 low_width = ideal_width.max(1.0).round() as usize;
453 }
454 let buckets = Self::get_buckets(universe, low_width);
455 (low_width, ones + buckets)
456 }
457
458 // Returns `high.len()` for the given `universe` and `low.width()`.
459 fn get_buckets(universe: usize, low_width: usize) -> usize {
460 let mut buckets = if low_width < bits::WORD_BITS { universe >> low_width } else { 0 };
461 if universe & (bits::low_set(low_width) as usize) != 0 {
462 buckets += 1;
463 }
464 buckets
465 }
466
467 /// Returns the number of bits that have already been set.
468 pub fn len(&self) -> usize {
469 self.len
470 }
471
472 /// Returns the number of bits that can be set.
473 pub fn capacity(&self) -> usize {
474 self.data.count_ones()
475 }
476
477 /// Returns the universe size or the length of the bitvector.
478 pub fn universe(&self) -> usize {
479 self.data.len()
480 }
481
482 /// Returns the smallest index in the bitvector that can still be set.
483 pub fn next_index(&self) -> usize {
484 self.next
485 }
486
487 /// Returns `true` if all bits that can be set have been set.
488 pub fn is_full(&self) -> bool {
489 self.len() == self.capacity()
490 }
491
492 /// Returns `true` if no bits have been set.
493 ///
494 /// Keeps Clippy happy.
495 pub fn is_empty(&self) -> bool {
496 self.len() == 0
497 }
498
499 /// Sets the specified bit in the bitvector.
500 ///
501 /// # Panics
502 ///
503 /// Panics if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
504 pub fn set(&mut self, index: usize) {
505 self.try_set(index).unwrap();
506 }
507
508 /// Unsafe version of [`SparseBuilder::set`] without sanity checks.
509 ///
510 /// # Safety
511 ///
512 /// Behavior is undefined if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
513 pub unsafe fn set_unchecked(&mut self, index: usize) {
514 let parts = self.data.split(index);
515 self.high.set_bit(parts.high + self.len, true);
516 self.data.low.set(self.len, parts.low as u64);
517 self.len += 1; self.next = index + self.increment;
518 }
519
520 /// Tries to set the specified bit in the bitvector.
521 ///
522 /// Returns [`Err`] if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
523 pub fn try_set(&mut self, index: usize) -> Result<(), &'static str> {
524 if self.is_full() {
525 return Err("The builder is full");
526 }
527 if index < self.next_index() {
528 if self.increment == 0 {
529 return Err("Index must be >= previous set position");
530 } else {
531 return Err("Index must be > previous set position");
532 }
533 }
534 if index >= self.universe() {
535 return Err("Index is larger than universe size");
536 }
537 unsafe { self.set_unchecked(index); }
538 Ok(())
539 }
540}
541
542impl Extend<usize> for SparseBuilder {
543 fn extend<T: IntoIterator<Item = usize>>(&mut self, iter: T) {
544 for index in iter {
545 self.set(index);
546 }
547 }
548}
549
550impl TryFrom<SparseBuilder> for SparseVector {
551 type Error = &'static str;
552
553 fn try_from(builder: SparseBuilder) -> Result<Self, Self::Error> {
554 let mut builder = builder;
555 if !builder.is_full() {
556 return Err("The builder is not full");
557 }
558 builder.data.high = BitVector::from(builder.high);
559 builder.data.high.enable_select();
560 builder.data.high.enable_select_zero();
561 Ok(builder.data)
562 }
563}
564
565//-----------------------------------------------------------------------------
566
567/// A read-only iterator over [`SparseVector`].
568///
569/// The type of `Item` is [`bool`].
570///
571/// # Examples
572///
573/// ```
574/// use simple_sds_sbwt::ops::BitVec;
575/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
576/// use std::convert::TryFrom;
577///
578/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
579/// let ones = source.iter().filter(|&b| *b).count();
580/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
581/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
582/// builder.set(index);
583/// }
584/// let sv = SparseVector::try_from(builder).unwrap();
585///
586/// assert_eq!(sv.iter().len(), source.len());
587/// for (index, value) in sv.iter().enumerate() {
588/// assert_eq!(source[index], value);
589/// }
590/// ```
591#[derive(Clone, Debug)]
592pub struct Iter<'a> {
593 parent: OneIter<'a>,
594 // The first index we have not visited.
595 next: usize,
596 // The first set bit we have not visited.
597 next_set: Option<usize>,
598 // The first index we should not visit.
599 limit: usize,
600 // The last set bit we have not visited.
601 last_set: Option<usize>,
602}
603
604impl<'a> Iterator for Iter<'a> {
605 type Item = bool;
606
607 fn next(&mut self) -> Option<Self::Item> {
608 if self.next >= self.limit {
609 return None;
610 }
611 match self.next_set {
612 Some(value) => {
613 if value == self.next {
614 // We have to find the next unvisited (unique) value, and `last_set` is the initial candidate.
615 self.next_set = self.last_set;
616 // Skip duplicates until we find a new value or run out of values.
617 for (_, index) in self.parent.by_ref() {
618 if index > self.next {
619 self.next_set = Some(index);
620 break;
621 }
622 }
623 self.next += 1;
624 Some(true)
625 } else {
626 self.next += 1;
627 Some(false)
628 }
629 },
630 None => {
631 self.next += 1;
632 Some(false)
633 },
634 }
635 }
636
637 #[inline]
638 fn size_hint(&self) -> (usize, Option<usize>) {
639 let remaining = self.limit - self.next;
640 (remaining, Some(remaining))
641 }
642}
643
644impl<'a> DoubleEndedIterator for Iter<'a> {
645 fn next_back(&mut self) -> Option<Self::Item> {
646 if self.next >= self.limit {
647 return None;
648 }
649 self.limit -= 1;
650 match self.last_set {
651 Some(value) => {
652 if value == self.limit {
653 // We have to find the previous unvisited (unique) value, and `next_set` is the initial candidate.
654 self.last_set = self.next_set;
655 // Skip duplicates until we find a new value or run out of values.
656 while let Some((_, index)) = self.parent.next_back() {
657 if index < self.limit {
658 self.last_set = Some(index);
659 break;
660 }
661 }
662 Some(true)
663 } else {
664 Some(false)
665 }
666 },
667 None => Some(false),
668 }
669 }
670}
671
672impl<'a> ExactSizeIterator for Iter<'a> {}
673
674impl<'a> FusedIterator for Iter<'a> {}
675
676//-----------------------------------------------------------------------------
677
678impl<'a> BitVec<'a> for SparseVector {
679 type Iter = Iter<'a>;
680
681 #[inline]
682 fn len(&self) -> usize {
683 self.len
684 }
685
686 #[inline]
687 fn count_ones(&self) -> usize {
688 self.low.len()
689 }
690
691 // Override the default implementation, because it may underflow with multisets.
692 #[inline]
693 fn count_zeros(&self) -> usize {
694 if self.count_ones() >= self.len() {
695 0
696 } else {
697 self.len() - self.count_ones()
698 }
699 }
700
701 fn get(&self, index: usize) -> bool {
702 // Find the first value with the same high part, if it exists.
703 let parts = self.split(index);
704 let mut pos = self.lower_bound(parts.high);
705
706 // Iterate forward over the values with the same high part until we find
707 // a value no less than `value` or we run out of such values.
708 while pos.high < self.high.len() && self.high.get(pos.high) {
709 let low = self.low.get(pos.low) as usize;
710 if low >= parts.low {
711 return low == parts.low;
712 }
713 pos.high += 1; pos.low += 1;
714 }
715
716 false
717 }
718
719 fn iter(&'a self) -> Self::Iter {
720 let mut one_iter = self.one_iter();
721 let next_set = if let Some((_, index)) = one_iter.next() {
722 Some(index)
723 } else {
724 None
725 };
726 let last_set = if let Some((_, index)) = one_iter.next_back() {
727 Some(index)
728 } else {
729 next_set
730 };
731 Self::Iter {
732 parent: one_iter,
733 next: 0,
734 next_set,
735 limit: self.len(),
736 last_set,
737 }
738 }
739}
740
741//-----------------------------------------------------------------------------
742
743impl<'a> Rank<'a> for SparseVector {
744 fn supports_rank(&self) -> bool {
745 true
746 }
747
748 fn enable_rank(&mut self) {}
749
750 fn rank(&self, index: usize) -> usize {
751 if index >= self.len() {
752 return self.count_ones();
753 }
754
755 // Find the last value with the same high part, if it exists.
756 let parts = self.split(index);
757 let mut pos = self.upper_bound(parts.high);
758 if pos.low == 0 {
759 return 0;
760 }
761 pos.high -= 1; pos.low -= 1;
762
763 // Iterate backward over the values with the same high part until we find
764 // as value lower than `index` or we run out of such values.
765 while self.high.get(pos.high) && (self.low.get(pos.low) as usize) >= parts.low {
766 if pos.low == 0 {
767 return 0;
768 }
769 pos.high -= 1; pos.low -= 1;
770 }
771
772 pos.low + 1
773 }
774}
775
776//-----------------------------------------------------------------------------
777
778/// An iterator over the set bits in [`SparseVector`].
779///
780/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
781/// This can be interpreted as:
782///
783/// * `(index, value)` or `(i, select(i))` in the integer array; or
784/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == true`.
785///
786/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
787/// Queries may create iterators in the middle of the bitvector.
788///
789/// # Examples
790///
791/// ```
792/// use simple_sds_sbwt::ops::{BitVec, Select};
793/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
794/// use std::convert::TryFrom;
795///
796/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
797/// let ones = source.iter().filter(|&b| *b).count();
798/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
799/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
800/// builder.set(index);
801/// }
802/// let sv = SparseVector::try_from(builder).unwrap();
803///
804/// let mut iter = sv.one_iter();
805/// assert_eq!(iter.len(), ones);
806/// assert_eq!(iter.next(), Some((0, 0)));
807/// assert_eq!(iter.next(), Some((1, 2)));
808/// assert_eq!(iter.next(), Some((2, 3)));
809/// assert_eq!(iter.next(), Some((3, 5)));
810/// assert_eq!(iter.next(), Some((4, 6)));
811/// assert!(iter.next().is_none());
812/// ```
813#[derive(Clone, Debug)]
814pub struct OneIter<'a> {
815 parent: &'a SparseVector,
816 // The first position we have not visited.
817 next: Pos,
818 // The first position we should not visit.
819 limit: Pos,
820}
821
822impl<'a> OneIter<'a> {
823 // Build an empty iterator for the parent bitvector.
824 fn empty_iter(parent: &'a SparseVector) -> OneIter<'a> {
825 OneIter {
826 parent,
827 next: Pos { high: parent.high.len(), low: parent.low.len(), },
828 limit: Pos { high: parent.high.len(), low: parent.low.len(), },
829 }
830 }
831}
832
833impl<'a> Iterator for OneIter<'a> {
834 type Item = (usize, usize);
835
836 fn next(&mut self) -> Option<Self::Item> {
837 if self.next.low >= self.limit.low {
838 None
839 } else {
840 while !self.parent.high.get(self.next.high) {
841 self.next.high += 1;
842 }
843 let result = self.parent.combine(self.next);
844 self.next.high += 1; self.next.low += 1;
845 Some(result)
846 }
847 }
848
849 #[inline]
850 fn size_hint(&self) -> (usize, Option<usize>) {
851 let remaining = self.limit.low - self.next.low;
852 (remaining, Some(remaining))
853 }
854}
855
856impl<'a> DoubleEndedIterator for OneIter<'a> {
857 fn next_back(&mut self) -> Option<Self::Item> {
858 if self.next.low >= self.limit.low {
859 None
860 } else {
861 self.limit.high -= 1; self.limit.low -= 1;
862 while !self.parent.high.get(self.limit.high) {
863 self.limit.high -= 1;
864 }
865 Some(self.parent.combine(self.limit))
866 }
867 }
868}
869
870impl<'a> ExactSizeIterator for OneIter<'a> {}
871
872impl<'a> FusedIterator for OneIter<'a> {}
873
874//-----------------------------------------------------------------------------
875
876/// An iterator over the unset bits in [`SparseVector`].
877///
878/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
879/// This can be interpreted as:
880///
881/// * `(index, value)` or `(i, select(i))` in the integer array of the complement; or
882/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == false`.
883///
884/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
885/// Queries may create iterators in the middle of the bitvector.
886///
887/// This iterator does not work correctly with multisets.
888///
889/// # Examples
890///
891/// ```
892/// use simple_sds_sbwt::ops::{BitVec, SelectZero};
893/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
894/// use std::convert::TryFrom;
895///
896/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
897/// let ones = source.iter().filter(|&b| *b).count();
898/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
899/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
900/// builder.set(index);
901/// }
902/// let sv = SparseVector::try_from(builder).unwrap();
903///
904/// let mut iter = sv.zero_iter();
905/// assert_eq!(iter.len(), source.len() - ones);
906/// assert_eq!(iter.next(), Some((0, 1)));
907/// assert_eq!(iter.next(), Some((1, 4)));
908/// assert_eq!(iter.next(), Some((2, 7)));
909/// assert!(iter.next().is_none());
910/// ```
911#[derive(Clone, Debug)]
912pub struct ZeroIter<'a> {
913 iter: OneIter<'a>,
914 // The position of the next one, or the length of the bitvector.
915 one_pos: usize,
916 // The first position we have not visited.
917 next: (usize, usize),
918 // The first position we should not visit.
919 limit: (usize, usize),
920}
921
922impl<'a> ZeroIter<'a> {
923 // Build an empty iterator for the parent bitvector.
924 fn empty_iter(parent: &'a SparseVector) -> ZeroIter<'a> {
925 ZeroIter {
926 iter: OneIter::empty_iter(parent),
927 one_pos: 0,
928 next: (0, 0),
929 limit: (0, 0),
930 }
931 }
932
933 // Go to the next run of zeros if necessary, assuming that we are not at the end.
934 fn next_run(&mut self) {
935 while self.next.1 >= self.one_pos {
936 self.next.1 = self.one_pos + 1;
937 self.one_pos = if let Some((_, pos)) = self.iter.next() { pos } else { self.limit.1 };
938 }
939 }
940}
941
942impl<'a> Iterator for ZeroIter<'a> {
943 type Item = (usize, usize);
944
945 fn next(&mut self) -> Option<Self::Item> {
946 if self.next.0 >= self.limit.0 {
947 None
948 } else {
949 self.next_run();
950 let result = self.next;
951 self.next.0 += 1;
952 self.next.1 += 1;
953 Some(result)
954 }
955 }
956
957 #[inline]
958 fn size_hint(&self) -> (usize, Option<usize>) {
959 let remaining = self.limit.0 - self.next.0;
960 (remaining, Some(remaining))
961 }
962}
963
964// TODO: DoubleEndedIterator?
965
966impl<'a> ExactSizeIterator for ZeroIter<'a> {}
967
968impl<'a> FusedIterator for ZeroIter<'a> {}
969
970//-----------------------------------------------------------------------------
971
972impl<'a> Select<'a> for SparseVector {
973 type OneIter = OneIter<'a>;
974
975 fn supports_select(&self) -> bool {
976 true
977 }
978
979 fn enable_select(&mut self) {}
980
981 fn one_iter(&'a self) -> Self::OneIter {
982 Self::OneIter {
983 parent: self,
984 next: Pos { high: 0, low: 0, },
985 limit: Pos { high: self.high.len(), low: self.low.len(), },
986 }
987 }
988
989 fn select(&'a self, rank: usize) -> Option<usize> {
990 if rank >= self.count_ones() {
991 None
992 } else {
993 Some(self.combine(self.pos(rank)).1)
994 }
995 }
996
997 fn select_iter(&'a self, rank: usize) -> Self::OneIter {
998 if rank >= self.count_ones() {
999 Self::OneIter::empty_iter(self)
1000 } else {
1001 Self::OneIter {
1002 parent: self,
1003 next: self.pos(rank),
1004 limit: Pos { high: self.high.len(), low: self.low.len(), },
1005 }
1006 }
1007 }
1008}
1009
1010//-----------------------------------------------------------------------------
1011
1012impl<'a> SelectZero<'a> for SparseVector {
1013 type ZeroIter = ZeroIter<'a>;
1014
1015 fn supports_select_zero(&self) -> bool {
1016 true
1017 }
1018
1019 fn enable_select_zero(&mut self) {}
1020
1021 fn zero_iter(&'a self) -> Self::ZeroIter {
1022 let mut iter = self.one_iter();
1023 let one_pos = if let Some((_, pos)) = iter.next() { pos } else { self.len() };
1024 ZeroIter {
1025 iter,
1026 one_pos,
1027 next: (0, 0),
1028 limit: (self.count_zeros(), self.len()),
1029 }
1030 }
1031
1032 fn select_zero(&'a self, rank: usize) -> Option<usize> {
1033 if rank >= self.count_zeros() {
1034 return None;
1035 }
1036 let (run_rank, _) = self.find_zero_run(rank);
1037 Some(run_rank + rank)
1038 }
1039
1040 fn select_zero_iter(&'a self, rank: usize) -> Self::ZeroIter {
1041 if rank >= self.count_zeros() {
1042 return Self::ZeroIter::empty_iter(self);
1043 }
1044 let (run_rank, mut iter) = self.find_zero_run(rank);
1045 let one_pos = if let Some((_, pos)) = iter.next() { pos } else { self.len() };
1046 ZeroIter {
1047 iter,
1048 one_pos,
1049 next: (rank, run_rank + rank),
1050 limit: (self.count_zeros(), self.len()),
1051 }
1052 }
1053}
1054
1055//-----------------------------------------------------------------------------
1056
1057impl<'a> PredSucc<'a> for SparseVector {
1058 type OneIter = OneIter<'a>;
1059
1060 fn supports_pred_succ(&self) -> bool {
1061 true
1062 }
1063
1064 fn enable_pred_succ(&mut self) {}
1065
1066 fn predecessor(&'a self, value: usize) -> Self::OneIter {
1067 if self.is_empty() {
1068 return Self::OneIter::empty_iter(self);
1069 }
1070
1071 // Find the last value with the same high part, if it exists.
1072 let parts = self.split(cmp::min(value, self.len() - 1));
1073 let mut pos = self.upper_bound(parts.high);
1074 if pos.low == 0 {
1075 return Self::OneIter::empty_iter(self);
1076 }
1077 pos.high -= 1; pos.low -= 1;
1078
1079 // Iterate backward over the values with the same high part until we find
1080 // a value no greater than `value` or we run out of such values.
1081 while self.high.get(pos.high) && (self.low.get(pos.low) as usize) > parts.low {
1082 if pos.low == 0 {
1083 return Self::OneIter::empty_iter(self);
1084 }
1085 pos.high -= 1; pos.low -= 1;
1086 }
1087
1088 // The predecessor has a lower high part, so we continue iterating until we find it.
1089 while !self.high.get(pos.high) {
1090 pos.high -= 1;
1091 }
1092
1093 Self::OneIter {
1094 parent: self,
1095 next: pos,
1096 limit: Pos { high: self.high.len(), low: self.low.len(), },
1097 }
1098 }
1099
1100 fn successor(&'a self, value: usize) -> Self::OneIter {
1101 if value >= self.len() {
1102 return Self::OneIter::empty_iter(self);
1103 }
1104
1105 // Find the first value with the same high part, if it exists.
1106 let parts = self.split(value);
1107 let mut pos = self.lower_bound(parts.high);
1108
1109 // Iterate forward over the values with the same high part until we find
1110 // a value no less than `value` or we run out of such values.
1111 while pos.high < self.high.len() && self.high.get(pos.high) {
1112 if (self.low.get(pos.low) as usize) >= parts.low {
1113 return Self::OneIter {
1114 parent: self,
1115 next: pos,
1116 limit: Pos { high: self.high.len(), low: self.low.len(), },
1117 };
1118 }
1119 pos.high += 1; pos.low += 1;
1120 }
1121
1122 // The successor has a greater high part, so we continue iterating until we find it.
1123 while pos.high < self.high.len() {
1124 if self.high.get(pos.high) {
1125 return Self::OneIter {
1126 parent: self,
1127 next: pos,
1128 limit: Pos { high: self.high.len(), low: self.low.len(), },
1129 };
1130 }
1131 pos.high += 1;
1132 }
1133
1134 Self::OneIter::empty_iter(self)
1135 }
1136}
1137
1138//-----------------------------------------------------------------------------
1139
1140impl Serialize for SparseVector {
1141 fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1142 self.len.serialize(writer)?;
1143 Ok(())
1144 }
1145
1146 fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1147 self.high.serialize(writer)?;
1148 self.low.serialize(writer)?;
1149 Ok(())
1150 }
1151
1152 fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
1153 let len = usize::load(reader)?;
1154 let mut high = BitVector::load(reader)?;
1155 let low = IntVector::load(reader)?;
1156
1157 // Enable support structures, because the data may be from a library that does not know
1158 // how to build them.
1159 high.enable_select();
1160 high.enable_select_zero();
1161
1162 // Sanity checks.
1163 if low.len() != high.count_ones() {
1164 return Err(Error::new(ErrorKind::InvalidData, "Inconsistent number of set bits"));
1165 }
1166 if high.len() != low.len() + SparseBuilder::get_buckets(len, low.width()){
1167 return Err(Error::new(ErrorKind::InvalidData, "Invalid number of buckets"));
1168 }
1169
1170 let result = SparseVector {
1171 len, high, low,
1172 };
1173 Ok(result)
1174 }
1175
1176 fn size_in_elements(&self) -> usize {
1177 self.len.size_in_elements() +
1178 self.high.size_in_elements() +
1179 self.low.size_in_elements()
1180 }
1181}
1182
1183//-----------------------------------------------------------------------------