simple_sds_sbwt/
sparse_vector.rs

1//! An Elias-Fano encoded array supporting rank, select, and related queries.
2//!
3//! This structure is equivalent to `sd_vector` in [SDSL](https://github.com/simongog/sdsl-lite).
4//! It is also known in the literature as sdarray:
5//!
6//! > Okanohara, Sadakane: Practical Entropy-Compressed Rank/Select Dictionary.  
7//! > Proc. ALENEX 2007.  
8//! > DOI: [10.1137/1.9781611972870.6](https://doi.org/10.1137/1.9781611972870.6)
9//!
10//! The rule for selecting the number of low bits and buckets is from:
11//!
12//! > Ma, Puglisi, Raman, Zhukova: On Elias-Fano for Rank Queries in FM-Indexes.  
13//! > Proc. DCC 2021.  
14//! > DOI: [10.1109/DCC50243.2021.00030](https://doi.org/10.1109/DCC50243.2021.00030)
15//!
16//! Assume that we have a bitvector of length `n` with `m` set bits, with `m` much smaller than `n`.
17//! Let `w ≈ log(n) - log(m)`.
18//! In the integer array interpretation (see [`BitVec`]), we take the low `w` bits from each value and store them in an [`IntVector`].
19//! We place the values in buckets by the remaining high bits.
20//! A [`BitVector`] encodes the number of values in each bucket in unary.
21//! If there are `k >= 0` values in a bucket, the bitvector will contain `k` set bits followed by an unset bit.
22//! Then
23//!
24//! > `select(i) = low[i] + ((high.select(i) - i) << w)`.
25//!
26//! Rank, predecessor, and successor queries use `select_zero` on `high` followed by a linear scan.
27//!
28//! The `select_zero` implementation is based on finding the right run of unset bits using binary search with `select`.
29//! It is not particularly efficient.
30//!
31//! We can also support multisets that contain duplicate values (in the integer array interpretation).
32//! Rank/select queries for unset bits do not work correctly with multisets.
33
34use crate::bit_vector::BitVector;
35use crate::int_vector::IntVector;
36use crate::ops::{Vector, Access, BitVec, Rank, Select, PredSucc, SelectZero};
37use crate::raw_vector::{RawVector, AccessRaw};
38use crate::serialize::Serialize;
39use crate::bits;
40
41use std::convert::TryFrom;
42use std::io::{Error, ErrorKind};
43use std::iter::FusedIterator;
44use std::{cmp, io};
45
46#[cfg(test)]
47mod tests;
48
49//-----------------------------------------------------------------------------
50
51/// An immutable Elias-Fano encoded bitvector supporting, rank, select, and related queries.
52///
53/// This structure should be used for sparse bitvectors, where frequency of set bits is low.
54/// For dense bitvectors or when [`SelectZero`] is needed, [`BitVector`] is usually a better choice.
55/// Because most queries require support structures for one of the components, the bitvector itself is immutable.
56/// The maximum length of the vector is approximately [`usize::MAX`] bits.
57///
58/// Conversions between various [`BitVec`] types are possible using the [`From`] trait.
59///
60/// `SparseVector` supports partial multiset semantics.
61/// A multiset bitvector is one that contains duplicate values in the integer array interpretation.
62/// Queries that operate on present values work correctly with a multiset, while [`Rank::rank_zero`] and [`SelectZero`] do not.
63/// Multiset vectors can be built with [`SparseBuilder::multiset`] and [`SparseVector::try_from_iter`].
64///
65/// `SparseVector` implements the following `simple_sds` traits:
66/// * Basic functionality: [`BitVec`]
67/// * Queries and operations: [`Rank`], [`Select`], [`SelectZero`], [`PredSucc`]
68/// * Serialization: [`Serialize`]
69///
70/// # Examples
71///
72/// ```
73/// use simple_sds_sbwt::ops::{BitVec, Rank, Select, SelectZero, PredSucc};
74/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
75/// use std::convert::TryFrom;
76///
77/// let mut builder = SparseBuilder::new(137, 4).unwrap();
78/// builder.set(1); builder.set(33); builder.set(95); builder.set(123);
79/// let sv = SparseVector::try_from(builder).unwrap();
80///
81/// // BitVec
82/// assert_eq!(sv.len(), 137);
83/// assert!(!sv.is_empty());
84/// assert_eq!(sv.count_ones(), 4);
85/// assert_eq!(sv.count_zeros(), 133);
86/// assert!(sv.get(33));
87/// assert!(!sv.get(34));
88/// for (index, value) in sv.iter().enumerate() {
89///     assert_eq!(value, sv.get(index));
90/// }
91///
92/// // Rank
93/// assert!(sv.supports_rank());
94/// assert_eq!(sv.rank(33), 1);
95/// assert_eq!(sv.rank(34), 2);
96/// assert_eq!(sv.rank_zero(65), 63);
97///
98/// // Select
99/// assert!(sv.supports_select());
100/// assert_eq!(sv.select(1), Some(33));
101/// let mut iter = sv.select_iter(2);
102/// assert_eq!(iter.next(), Some((2, 95)));
103/// assert_eq!(iter.next(), Some((3, 123)));
104/// assert!(iter.next().is_none());
105/// let v: Vec<(usize, usize)> = sv.one_iter().collect();
106/// assert_eq!(v, vec![(0, 1), (1, 33), (2, 95), (3, 123)]);
107///
108/// // SelectZero
109/// assert!(sv.supports_select_zero());
110/// assert_eq!(sv.select_zero(35), Some(37));
111/// let mut iter = sv.select_zero_iter(92);
112/// assert_eq!(iter.next(), Some((92, 94)));
113/// assert_eq!(iter.next(), Some((93, 96)));
114///
115/// // PredSucc
116/// assert!(sv.supports_pred_succ());
117/// assert!(sv.predecessor(0).next().is_none());
118/// assert_eq!(sv.predecessor(1).next(), Some((0, 1)));
119/// assert_eq!(sv.predecessor(2).next(), Some((0, 1)));
120/// assert_eq!(sv.successor(122).next(), Some((3, 123)));
121/// assert_eq!(sv.successor(123).next(), Some((3, 123)));
122/// assert!(sv.successor(124).next().is_none());
123/// ```
124///
125/// # Notes
126///
127/// * `SparseVector` never panics from I/O errors.
128/// * All `SparseVector` queries are always enabled without additional support structures.
129#[derive(Clone, Debug, PartialEq, Eq)]
130pub struct SparseVector {
131    len: usize,
132    high: BitVector,
133    low: IntVector,
134}
135
136// Bitvector index encoded as offsets in `high` and `low`.
137#[derive(Clone, Copy, Debug, PartialEq, Eq)]
138struct Pos {
139    high: usize,
140    low: usize,
141}
142
143// Bitvector index encoded as high and low parts.
144#[derive(Clone, Copy, Debug, PartialEq, Eq)]
145struct Parts {
146    high: usize,
147    low: usize,
148}
149
150impl SparseVector {
151    // Stop binary search in `select_zero` when there are at most this many runs left.
152    const BINARY_SEARCH_THRESHOLD: usize = 16;
153
154    /// Returns a copy of the source bitvector as `SparseVector`.
155    ///
156    /// The copy is created by iterating over the set bits using [`Select::one_iter`].
157    /// [`From`] implementations from other bitvector types should generally use this function.
158    ///
159    /// # Examples
160    ///
161    /// ```
162    /// use simple_sds_sbwt::bit_vector::BitVector;
163    /// use simple_sds_sbwt::ops::BitVec;
164    /// use simple_sds_sbwt::sparse_vector::SparseVector;
165    /// use std::iter::FromIterator;
166    ///
167    /// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
168    /// let bv = BitVector::from_iter(source);
169    /// let sv = SparseVector::copy_bit_vec(&bv);
170    /// assert_eq!(sv.len(), bv.len());
171    /// assert_eq!(sv.count_ones(), bv.count_ones());
172    /// assert!(!sv.is_multiset());
173    /// ```
174    pub fn copy_bit_vec<'a, T: BitVec<'a> + Select<'a>>(source: &'a T) -> Self {
175        let mut builder = SparseBuilder::new(source.len(), source.count_ones()).unwrap();
176        for (_, index) in source.one_iter() {
177            unsafe { builder.set_unchecked(index); }
178        }
179        SparseVector::try_from(builder).unwrap()
180    }
181
182    /// Builds a vector from the values in the iterator using multiset semantics.
183    ///
184    /// Returns an error message if the values are not sorted.
185    /// Universe size is set to be barely large enough for the values.
186    ///
187    /// # Examples
188    ///
189    /// ```
190    /// use simple_sds_sbwt::sparse_vector::SparseVector;
191    /// use simple_sds_sbwt::ops::{BitVec, Select};
192    ///
193    /// let source: Vec<usize> = vec![3, 4, 4, 7, 11, 19];
194    /// let sv = SparseVector::try_from_iter(source.iter().cloned()).unwrap();
195    /// assert_eq!(sv.len(), 20);
196    /// assert_eq!(sv.count_ones(), source.len());
197    /// assert!(sv.is_multiset());
198    ///
199    /// for (index, value) in sv.one_iter() {
200    ///     assert_eq!(value, source[index]);
201    /// }
202    /// ```
203    pub fn try_from_iter<T: Iterator<Item = usize> + DoubleEndedIterator + ExactSizeIterator>(iter: T) -> Result<SparseVector, &'static str> {
204        let mut iter = iter;
205        let (ones, _) = iter.size_hint();
206        let universe = if let Some(pos) = iter.next_back() { pos + 1 } else { 0 };
207        let mut builder = SparseBuilder::multiset(universe, ones);
208        for pos in iter {
209            builder.try_set(pos)?;
210        }
211        if universe > 0 {
212            builder.try_set(universe - 1)?;
213        }
214        SparseVector::try_from(builder)
215    }
216
217    /// Returns `true` if the vector is a multiset (contains duplicate values).
218    ///
219    /// This method is somewhat expensive, as it iterates over the vector.
220    pub fn is_multiset(&self) -> bool {
221        let mut prev = self.len();
222        for (_, value) in self.one_iter() {
223            if value == prev {
224                return true;
225            }
226            prev = value;
227        }
228        false
229    }
230
231    // Split a bitvector index into high and low parts.
232    fn split(&self, index: usize) -> Parts {
233        Parts {
234            high: index >> self.low.width(),
235            low: index & unsafe { bits::low_set_unchecked(self.low.width()) as usize },
236        }
237    }
238
239    // Get (rank, bitvector index) from the offsets in `high` and `low`.
240    fn combine(&self, pos: Pos) -> (usize, usize) {
241        (pos.low, ((pos.high - pos.low) << self.low.width()) + (self.low.get(pos.low) as usize))
242    }
243
244    // Get the offsets in `high` and `low` for the set bit of the given rank.
245    fn pos(&self, rank: usize) -> Pos {
246        Pos {
247            high: self.high.select(rank).unwrap(),
248            low: rank,
249        }
250    }
251
252    // Get a `Pos` that points to the first value with this high part or to the following
253    // unset bit if no such values exist.
254    fn lower_bound(&self, high_part: usize) -> Pos {
255        if high_part == 0 {
256            Pos { high: 0, low: 0, }
257        } else {
258            let high_offset = self.high.select_zero(high_part - 1).unwrap() + 1;
259            Pos { high: high_offset, low: high_offset - high_part, }
260        }
261    }
262
263    // Get a `Pos` that points to the unset bit after the values with the this high part.
264    fn upper_bound(&self, high_part: usize) -> Pos {
265        let high_offset = self.high.select_zero(high_part).unwrap();
266        Pos { high: high_offset, low: high_offset - high_part, }
267    }
268
269    // Returns (run rank, one_iter past the run) for the run of 0s that contains
270    // unset bit of the given rank.
271    fn find_zero_run(&self, rank: usize) -> (usize, OneIter) {
272        let mut low = 0;
273        let mut high = self.count_ones();
274        let mut result = (0, self.one_iter());
275
276        // Invariant: `self.rank_zero(high) > rank`.
277        while high - low > Self::BINARY_SEARCH_THRESHOLD {
278            let mid = low + (high - low) / 2;
279            let mut iter = self.select_iter(mid);
280            let (_, mid_pos) = iter.next().unwrap();
281            if mid_pos - mid <= rank {
282                result = (mid + 1, iter);
283                low = mid + 1;
284            } else {
285                high = mid;
286            }
287        }
288
289        // Once we have only a few runs left, a linear scan is faster than
290        // `high.select()`.
291        let mut iter = result.1.clone();
292        while let Some((mid, mid_pos)) = iter.next() {
293            if mid_pos - mid <= rank {
294                result = (mid + 1, iter.clone());
295            } else {
296                break;
297            }
298        }
299
300        result
301    }
302}
303
304//-----------------------------------------------------------------------------
305
306/// Space-efficient [`SparseVector`] construction.
307///
308/// A `SparseBuilder` allocates the data structures based on universe size (bitvector length) and the number of set bits.
309/// The set bits must then be indicated in order using [`SparseBuilder::set`] or the [`Extend`] trait.
310/// Once the builder is full, it can be converted into a [`SparseVector`] using the [`TryFrom`] trait.
311/// The conversion will not fail if the builder is full.
312///
313/// Setting a bit `i` fails if the builder is full or the index is too small (`i < self.next_index()`) or too large (`i > self.universe()`).
314/// [`Extend::extend`] will panic in such situations.
315///
316/// # Examples
317///
318/// ```
319/// use simple_sds_sbwt::ops::BitVec;
320/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
321/// use std::convert::TryFrom;
322///
323/// let mut builder = SparseBuilder::new(300, 5).unwrap();
324/// assert_eq!(builder.len(), 0);
325/// assert_eq!(builder.capacity(), 5);
326/// assert_eq!(builder.universe(), 300);
327/// assert_eq!(builder.next_index(), 0);
328/// assert!(!builder.is_full());
329/// assert!(!builder.is_multiset());
330///
331/// builder.set(12);
332/// assert_eq!(builder.len(), 1);
333/// assert_eq!(builder.next_index(), 13);
334///
335/// // This will return an error because the index is too small.
336/// let _ = builder.try_set(10);
337/// assert_eq!(builder.len(), 1);
338/// assert_eq!(builder.next_index(), 13);
339///
340/// let v: Vec<usize> = vec![24, 48, 96, 192];
341/// builder.extend(v);
342/// assert_eq!(builder.len(), 5);
343/// assert!(builder.is_full());
344///
345/// let sv = SparseVector::try_from(builder).unwrap();
346/// assert_eq!(sv.len(), 300);
347/// assert_eq!(sv.count_ones(), 5);
348/// ```
349#[derive(Clone, Debug)]
350pub struct SparseBuilder {
351    data: SparseVector,
352    // We need a mutable bitvector during construction.
353    high: RawVector,
354    // Number of bits already set.
355    len: usize,
356    // The first index that can be set.
357    next: usize,
358    // `0` if we are building a multiset, `1` if not.
359    increment: usize,
360}
361
362impl SparseBuilder {
363    /// Returns an empty `SparseBuilder` without multiset semantics.
364    ///
365    /// Returns [`Err`] if `ones > universe`.
366    ///
367    /// # Arguments
368    ///
369    /// * `universe`: Universe size or length of the bitvector.
370    /// * `ones`: Number of bits that will be set in the bitvector.
371    pub fn new(universe: usize, ones: usize) -> Result<SparseBuilder, &'static str> {
372        if ones > universe {
373            return Err("Number of set bits is greater than universe size");
374        }
375
376        let (width, high_len) = Self::get_params(universe, ones);
377        let low = IntVector::with_len(ones, width, 0).unwrap();
378        let data = SparseVector {
379            len: universe,
380            high: BitVector::from(RawVector::new()),
381            low,
382        };
383
384        let high = RawVector::with_len(high_len, false);
385        Ok(SparseBuilder {
386            data,
387            high,
388            len: 0,
389            next: 0,
390            increment: 1,
391        })
392    }
393
394    /// Returns an empty `SparseBuilder` with multiset semantics.
395    ///
396    /// # Arguments
397    ///
398    /// * `universe`: Universe size or length of the bitvector.
399    /// * `ones`: Number of bits that will be set in the bitvector.
400    ///
401    /// # Examples
402    ///
403    /// ```
404    /// use simple_sds_sbwt::ops::BitVec;
405    /// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
406    /// use std::convert::TryFrom;
407    ///
408    /// let mut builder = SparseBuilder::multiset(120, 3);
409    /// assert_eq!(builder.capacity(), 3);
410    /// assert_eq!(builder.universe(), 120);
411    /// assert!(builder.is_multiset());
412    ///
413    /// builder.set(12);
414    /// builder.set(24);
415    /// builder.set(24);
416    /// assert!(builder.is_full());
417    ///
418    /// let sv = SparseVector::try_from(builder).unwrap();
419    /// assert_eq!(sv.len(), 120);
420    /// assert_eq!(sv.count_ones(), 3);
421    /// assert!(sv.is_multiset());
422    /// ```
423    pub fn multiset(universe: usize, ones: usize) -> SparseBuilder {
424        let (width, high_len) = Self::get_params(universe, ones);
425        let low = IntVector::with_len(ones, width, 0).unwrap();
426        let data = SparseVector {
427            len: universe,
428            high: BitVector::from(RawVector::new()),
429            low,
430        };
431
432        let high = RawVector::with_len(high_len, false);
433        SparseBuilder {
434            data,
435            high,
436            len: 0,
437            next: 0,
438            increment: 0,
439        }
440    }
441
442    /// Returns `true` if the builder is using multiset semantics.
443    pub fn is_multiset(&self) -> bool {
444        self.increment == 0
445    }
446
447    // Returns `(low.width(), high.len())`. Now works with overfull multisets as well.
448    fn get_params(universe: usize, ones: usize) -> (usize, usize) {
449        let mut low_width: usize = 1;
450        if ones > 0 && ones <= universe {
451            let ideal_width = ((universe as f64 * 2.0_f64.ln()) / (ones as f64)).log2();
452            low_width = ideal_width.max(1.0).round() as usize;
453        }
454        let buckets = Self::get_buckets(universe, low_width);
455        (low_width, ones + buckets)
456    }
457
458    // Returns `high.len()` for the given `universe` and `low.width()`.
459    fn get_buckets(universe: usize, low_width: usize) -> usize {
460        let mut buckets = if low_width < bits::WORD_BITS { universe >> low_width } else { 0 };
461        if universe & (bits::low_set(low_width) as usize) != 0 {
462            buckets += 1;
463        }
464        buckets
465    }
466
467    /// Returns the number of bits that have already been set.
468    pub fn len(&self) -> usize {
469        self.len
470    }
471
472    /// Returns the number of bits that can be set.
473    pub fn capacity(&self) -> usize {
474        self.data.count_ones()
475    }
476
477    /// Returns the universe size or the length of the bitvector.
478    pub fn universe(&self) -> usize {
479        self.data.len()
480    }
481
482    /// Returns the smallest index in the bitvector that can still be set.
483    pub fn next_index(&self) -> usize {
484        self.next
485    }
486
487    /// Returns `true` if all bits that can be set have been set.
488    pub fn is_full(&self) -> bool {
489        self.len() == self.capacity()
490    }
491
492    /// Returns `true` if no bits have been set.
493    ///
494    /// Keeps Clippy happy.
495    pub fn is_empty(&self) -> bool {
496        self.len() == 0
497    }
498
499    /// Sets the specified bit in the bitvector.
500    ///
501    /// # Panics
502    ///
503    /// Panics if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
504    pub fn set(&mut self, index: usize) {
505        self.try_set(index).unwrap();
506    }
507
508    /// Unsafe version of [`SparseBuilder::set`] without sanity checks.
509    ///
510    /// # Safety
511    ///
512    /// Behavior is undefined if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
513    pub unsafe fn set_unchecked(&mut self, index: usize) {
514        let parts = self.data.split(index);
515        self.high.set_bit(parts.high + self.len, true);
516        self.data.low.set(self.len, parts.low as u64);
517        self.len += 1; self.next = index + self.increment;
518    }
519
520    /// Tries to set the specified bit in the bitvector.
521    ///
522    /// Returns [`Err`] if the builder is full, if `index < self.next_index()`, or if `index >= self.universe()`.
523    pub fn try_set(&mut self, index: usize) -> Result<(), &'static str> {
524        if self.is_full() {
525            return Err("The builder is full");
526        }
527        if index < self.next_index() {
528            if self.increment == 0 {
529                return Err("Index must be >= previous set position");
530            } else {
531                return Err("Index must be > previous set position");
532            }
533        }
534        if index >= self.universe() {
535            return Err("Index is larger than universe size");
536        }
537        unsafe { self.set_unchecked(index); }
538        Ok(())
539    }
540}
541
542impl Extend<usize> for SparseBuilder {
543    fn extend<T: IntoIterator<Item = usize>>(&mut self, iter: T) {
544        for index in iter {
545            self.set(index);
546        }
547    }
548}
549
550impl TryFrom<SparseBuilder> for SparseVector {
551    type Error = &'static str;
552
553    fn try_from(builder: SparseBuilder) -> Result<Self, Self::Error> {
554        let mut builder = builder;
555        if !builder.is_full() {
556            return Err("The builder is not full");
557        }
558        builder.data.high = BitVector::from(builder.high);
559        builder.data.high.enable_select();
560        builder.data.high.enable_select_zero();
561        Ok(builder.data)
562    }
563}
564
565//-----------------------------------------------------------------------------
566
567/// A read-only iterator over [`SparseVector`].
568///
569/// The type of `Item` is [`bool`].
570///
571/// # Examples
572///
573/// ```
574/// use simple_sds_sbwt::ops::BitVec;
575/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
576/// use std::convert::TryFrom;
577///
578/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
579/// let ones = source.iter().filter(|&b| *b).count();
580/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
581/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
582///     builder.set(index);
583/// }
584/// let sv = SparseVector::try_from(builder).unwrap();
585///
586/// assert_eq!(sv.iter().len(), source.len());
587/// for (index, value) in sv.iter().enumerate() {
588///     assert_eq!(source[index], value);
589/// }
590/// ```
591#[derive(Clone, Debug)]
592pub struct Iter<'a> {
593    parent: OneIter<'a>,
594    // The first index we have not visited.
595    next: usize,
596    // The first set bit we have not visited.
597    next_set: Option<usize>,
598    // The first index we should not visit.
599    limit: usize,
600    // The last set bit we have not visited.
601    last_set: Option<usize>,
602}
603
604impl<'a> Iterator for Iter<'a> {
605    type Item = bool;
606
607    fn next(&mut self) -> Option<Self::Item> {
608        if self.next >= self.limit {
609            return None;
610        }
611        match self.next_set {
612            Some(value) => {
613                if value == self.next {
614                    // We have to find the next unvisited (unique) value, and `last_set` is the initial candidate.
615                    self.next_set = self.last_set;
616                    // Skip duplicates until we find a new value or run out of values.
617                    for (_, index) in self.parent.by_ref() {
618                        if index > self.next {
619                            self.next_set = Some(index);
620                            break;
621                        }
622                    }
623                    self.next += 1;
624                    Some(true)
625                } else {
626                    self.next += 1;
627                    Some(false)
628                }
629            },
630            None => {
631                self.next += 1;
632                Some(false)
633            },
634        }
635    }
636
637    #[inline]
638    fn size_hint(&self) -> (usize, Option<usize>) {
639        let remaining = self.limit - self.next;
640        (remaining, Some(remaining))
641    }
642}
643
644impl<'a> DoubleEndedIterator for Iter<'a> {
645    fn next_back(&mut self) -> Option<Self::Item> {
646        if self.next >= self.limit {
647            return None;
648        }
649        self.limit -= 1;
650        match self.last_set {
651            Some(value) => {
652                if value == self.limit {
653                    // We have to find the previous unvisited (unique) value, and `next_set` is the initial candidate.
654                    self.last_set = self.next_set;
655                    // Skip duplicates until we find a new value or run out of values.
656                    while let Some((_, index)) = self.parent.next_back() {
657                        if index < self.limit {
658                            self.last_set = Some(index);
659                            break;
660                        }
661                    }
662                    Some(true)
663                } else {
664                    Some(false)
665                }
666            },
667            None => Some(false),
668        }
669    }
670}
671
672impl<'a> ExactSizeIterator for Iter<'a> {}
673
674impl<'a> FusedIterator for Iter<'a> {}
675
676//-----------------------------------------------------------------------------
677
678impl<'a> BitVec<'a> for SparseVector {
679    type Iter = Iter<'a>;
680
681    #[inline]
682    fn len(&self) -> usize {
683        self.len
684    }
685
686    #[inline]
687    fn count_ones(&self) -> usize {
688        self.low.len()
689    }
690
691    // Override the default implementation, because it may underflow with multisets.
692    #[inline]
693    fn count_zeros(&self) -> usize {
694        if self.count_ones() >= self.len() {
695            0
696        } else {
697            self.len() - self.count_ones()
698        }
699    }
700
701    fn get(&self, index: usize) -> bool {
702        // Find the first value with the same high part, if it exists.
703        let parts = self.split(index);
704        let mut pos = self.lower_bound(parts.high);
705
706        // Iterate forward over the values with the same high part until we find
707        // a value no less than `value` or we run out of such values.
708        while pos.high < self.high.len() && self.high.get(pos.high) {
709            let low = self.low.get(pos.low) as usize;
710            if low >= parts.low {
711                return low == parts.low;
712            }
713            pos.high += 1; pos.low += 1;
714        }
715
716        false
717    }
718
719    fn iter(&'a self) -> Self::Iter {
720        let mut one_iter = self.one_iter();
721        let next_set = if let Some((_, index)) = one_iter.next() {
722            Some(index)
723        } else {
724            None
725        };
726        let last_set = if let Some((_, index)) = one_iter.next_back() {
727            Some(index)
728        } else {
729            next_set
730        };
731        Self::Iter {
732            parent: one_iter,
733            next: 0,
734            next_set,
735            limit: self.len(),
736            last_set,
737        }
738    }
739}
740
741//-----------------------------------------------------------------------------
742
743impl<'a> Rank<'a> for SparseVector {
744    fn supports_rank(&self) -> bool {
745        true
746    }
747
748    fn enable_rank(&mut self) {}
749
750    fn rank(&self, index: usize) -> usize {
751        if index >= self.len() {
752            return self.count_ones();
753        }
754
755        // Find the last value with the same high part, if it exists.
756        let parts = self.split(index);
757        let mut pos = self.upper_bound(parts.high);
758        if pos.low == 0 {
759            return 0;
760        }
761        pos.high -= 1; pos.low -= 1;
762
763        // Iterate backward over the values with the same high part until we find
764        // as value lower than `index` or we run out of such values.
765        while self.high.get(pos.high) && (self.low.get(pos.low) as usize) >= parts.low {
766            if pos.low == 0 {
767                return 0;
768            }
769            pos.high -= 1; pos.low -= 1;
770        }
771
772        pos.low + 1
773    }
774}
775
776//-----------------------------------------------------------------------------
777
778/// An iterator over the set bits in [`SparseVector`].
779///
780/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
781/// This can be interpreted as:
782///
783/// * `(index, value)` or `(i, select(i))` in the integer array; or
784/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == true`.
785///
786/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
787/// Queries may create iterators in the middle of the bitvector.
788///
789/// # Examples
790///
791/// ```
792/// use simple_sds_sbwt::ops::{BitVec, Select};
793/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
794/// use std::convert::TryFrom;
795///
796/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
797/// let ones = source.iter().filter(|&b| *b).count();
798/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
799/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
800///     builder.set(index);
801/// }
802/// let sv = SparseVector::try_from(builder).unwrap();
803///
804/// let mut iter = sv.one_iter();
805/// assert_eq!(iter.len(), ones);
806/// assert_eq!(iter.next(), Some((0, 0)));
807/// assert_eq!(iter.next(), Some((1, 2)));
808/// assert_eq!(iter.next(), Some((2, 3)));
809/// assert_eq!(iter.next(), Some((3, 5)));
810/// assert_eq!(iter.next(), Some((4, 6)));
811/// assert!(iter.next().is_none());
812/// ```
813#[derive(Clone, Debug)]
814pub struct OneIter<'a> {
815    parent: &'a SparseVector,
816    // The first position we have not visited.
817    next: Pos,
818    // The first position we should not visit.
819    limit: Pos,
820}
821
822impl<'a> OneIter<'a> {
823    // Build an empty iterator for the parent bitvector.
824    fn empty_iter(parent: &'a SparseVector) -> OneIter<'a> {
825        OneIter {
826            parent,
827            next: Pos { high: parent.high.len(), low: parent.low.len(), },
828            limit: Pos { high: parent.high.len(), low: parent.low.len(), },
829        }
830    }
831}
832
833impl<'a> Iterator for OneIter<'a> {
834    type Item = (usize, usize);
835
836    fn next(&mut self) -> Option<Self::Item> {
837        if self.next.low >= self.limit.low {
838            None
839        } else {
840            while !self.parent.high.get(self.next.high) {
841                self.next.high += 1;
842            }
843            let result = self.parent.combine(self.next);
844            self.next.high += 1; self.next.low += 1;
845            Some(result)
846        }
847    }
848
849    #[inline]
850    fn size_hint(&self) -> (usize, Option<usize>) {
851        let remaining = self.limit.low - self.next.low;
852        (remaining, Some(remaining))
853    }
854}
855
856impl<'a> DoubleEndedIterator for OneIter<'a> {
857    fn next_back(&mut self) -> Option<Self::Item> {
858        if self.next.low >= self.limit.low {
859            None
860        } else {
861            self.limit.high -= 1; self.limit.low -= 1;
862            while !self.parent.high.get(self.limit.high) {
863                self.limit.high -= 1;
864            }
865            Some(self.parent.combine(self.limit))
866        }
867    }
868}
869
870impl<'a> ExactSizeIterator for OneIter<'a> {}
871
872impl<'a> FusedIterator for OneIter<'a> {}
873
874//-----------------------------------------------------------------------------
875
876/// An iterator over the unset bits in [`SparseVector`].
877///
878/// The type of `Item` is `(`[`usize`]`, `[`usize`]`)`.
879/// This can be interpreted as:
880///
881/// * `(index, value)` or `(i, select(i))` in the integer array of the complement; or
882/// * `(rank(j), j)` in the bit array with `j` such that `self.get(j) == false`.
883///
884/// Note that `index` is not always the index provided by [`Iterator::enumerate`].
885/// Queries may create iterators in the middle of the bitvector.
886///
887/// This iterator does not work correctly with multisets.
888///
889/// # Examples
890///
891/// ```
892/// use simple_sds_sbwt::ops::{BitVec, SelectZero};
893/// use simple_sds_sbwt::sparse_vector::{SparseVector, SparseBuilder};
894/// use std::convert::TryFrom;
895///
896/// let source: Vec<bool> = vec![true, false, true, true, false, true, true, false];
897/// let ones = source.iter().filter(|&b| *b).count();
898/// let mut builder = SparseBuilder::new(source.len(), ones).unwrap();
899/// for (index, _) in source.iter().enumerate().filter(|v| *v.1) {
900///     builder.set(index);
901/// }
902/// let sv = SparseVector::try_from(builder).unwrap();
903///
904/// let mut iter = sv.zero_iter();
905/// assert_eq!(iter.len(), source.len() - ones);
906/// assert_eq!(iter.next(), Some((0, 1)));
907/// assert_eq!(iter.next(), Some((1, 4)));
908/// assert_eq!(iter.next(), Some((2, 7)));
909/// assert!(iter.next().is_none());
910/// ```
911#[derive(Clone, Debug)]
912pub struct ZeroIter<'a> {
913    iter: OneIter<'a>,
914    // The position of the next one, or the length of the bitvector.
915    one_pos: usize,
916    // The first position we have not visited.
917    next: (usize, usize),
918    // The first position we should not visit.
919    limit: (usize, usize),
920}
921
922impl<'a> ZeroIter<'a> {
923    // Build an empty iterator for the parent bitvector.
924    fn empty_iter(parent: &'a SparseVector) -> ZeroIter<'a> {
925        ZeroIter {
926            iter: OneIter::empty_iter(parent),
927            one_pos: 0,
928            next: (0, 0),
929            limit: (0, 0),
930        }
931    }
932
933    // Go to the next run of zeros if necessary, assuming that we are not at the end.
934    fn next_run(&mut self) {
935        while self.next.1 >= self.one_pos {
936            self.next.1 = self.one_pos + 1;
937            self.one_pos = if let Some((_, pos)) = self.iter.next() { pos } else { self.limit.1 };
938        }
939    }
940}
941
942impl<'a> Iterator for ZeroIter<'a> {
943    type Item = (usize, usize);
944
945    fn next(&mut self) -> Option<Self::Item> {
946        if self.next.0 >= self.limit.0 {
947            None
948        } else {
949            self.next_run();
950            let result = self.next;
951            self.next.0 += 1;
952            self.next.1 += 1;
953            Some(result)
954        }
955    }
956
957    #[inline]
958    fn size_hint(&self) -> (usize, Option<usize>) {
959        let remaining = self.limit.0 - self.next.0;
960        (remaining, Some(remaining))
961    }
962}
963
964// TODO: DoubleEndedIterator?
965
966impl<'a> ExactSizeIterator for ZeroIter<'a> {}
967
968impl<'a> FusedIterator for ZeroIter<'a> {}
969
970//-----------------------------------------------------------------------------
971
972impl<'a> Select<'a> for SparseVector {
973    type OneIter = OneIter<'a>;
974
975    fn supports_select(&self) -> bool {
976        true
977    }
978
979    fn enable_select(&mut self) {}
980
981    fn one_iter(&'a self) -> Self::OneIter {
982        Self::OneIter {
983            parent: self,
984            next: Pos { high: 0, low: 0, },
985            limit: Pos { high: self.high.len(), low: self.low.len(), },
986        }
987    }
988
989    fn select(&'a self, rank: usize) -> Option<usize> {
990         if rank >= self.count_ones() {
991             None
992        } else {
993            Some(self.combine(self.pos(rank)).1)
994        }
995    }
996
997    fn select_iter(&'a self, rank: usize) -> Self::OneIter {
998         if rank >= self.count_ones() {
999             Self::OneIter::empty_iter(self)
1000        } else {
1001            Self::OneIter {
1002                parent: self,
1003                next: self.pos(rank),
1004                limit: Pos { high: self.high.len(), low: self.low.len(), },
1005            }
1006        }
1007    }
1008}
1009
1010//-----------------------------------------------------------------------------
1011
1012impl<'a> SelectZero<'a> for SparseVector {
1013    type ZeroIter = ZeroIter<'a>;
1014
1015    fn supports_select_zero(&self) -> bool {
1016        true
1017    }
1018
1019    fn enable_select_zero(&mut self) {}
1020
1021    fn zero_iter(&'a self) -> Self::ZeroIter {
1022        let mut iter = self.one_iter();
1023        let one_pos = if let Some((_, pos)) = iter.next() { pos } else { self.len() };
1024        ZeroIter {
1025            iter,
1026            one_pos,
1027            next: (0, 0),
1028            limit: (self.count_zeros(), self.len()),
1029        }
1030    }
1031
1032    fn select_zero(&'a self, rank: usize) -> Option<usize> {
1033        if rank >= self.count_zeros() {
1034            return None;
1035        }
1036        let (run_rank, _) = self.find_zero_run(rank);
1037        Some(run_rank + rank)
1038    }
1039
1040    fn select_zero_iter(&'a self, rank: usize) -> Self::ZeroIter {
1041        if rank >= self.count_zeros() {
1042            return Self::ZeroIter::empty_iter(self);
1043        }
1044        let (run_rank, mut iter) = self.find_zero_run(rank);
1045        let one_pos = if let Some((_, pos)) = iter.next() { pos } else { self.len() };
1046        ZeroIter {
1047            iter,
1048            one_pos,
1049            next: (rank, run_rank + rank),
1050            limit: (self.count_zeros(), self.len()),
1051        }
1052    }
1053}
1054
1055//-----------------------------------------------------------------------------
1056
1057impl<'a> PredSucc<'a> for SparseVector {
1058    type OneIter = OneIter<'a>;
1059
1060    fn supports_pred_succ(&self) -> bool {
1061        true
1062    }
1063
1064    fn enable_pred_succ(&mut self) {}
1065
1066    fn predecessor(&'a self, value: usize) -> Self::OneIter {
1067        if self.is_empty() {
1068            return Self::OneIter::empty_iter(self);
1069        }
1070
1071        // Find the last value with the same high part, if it exists.
1072        let parts = self.split(cmp::min(value, self.len() - 1));
1073        let mut pos = self.upper_bound(parts.high);
1074        if pos.low == 0 {
1075            return Self::OneIter::empty_iter(self);
1076        }
1077        pos.high -= 1; pos.low -= 1;
1078
1079        // Iterate backward over the values with the same high part until we find
1080        // a value no greater than `value` or we run out of such values.
1081        while self.high.get(pos.high) && (self.low.get(pos.low) as usize) > parts.low {
1082            if pos.low == 0 {
1083                return Self::OneIter::empty_iter(self);
1084            }
1085            pos.high -= 1; pos.low -= 1;
1086        }
1087
1088        // The predecessor has a lower high part, so we continue iterating until we find it.
1089        while !self.high.get(pos.high) {
1090            pos.high -= 1;
1091        }
1092
1093        Self::OneIter {
1094            parent: self,
1095            next: pos,
1096            limit: Pos { high: self.high.len(), low: self.low.len(), },
1097        }
1098    }
1099
1100    fn successor(&'a self, value: usize) -> Self::OneIter {
1101        if value >= self.len() {
1102            return Self::OneIter::empty_iter(self);
1103        }
1104
1105        // Find the first value with the same high part, if it exists.
1106        let parts = self.split(value);
1107        let mut pos = self.lower_bound(parts.high);
1108
1109        // Iterate forward over the values with the same high part until we find
1110        // a value no less than `value` or we run out of such values.
1111        while pos.high < self.high.len() && self.high.get(pos.high) {
1112            if (self.low.get(pos.low) as usize) >= parts.low {
1113                return Self::OneIter {
1114                    parent: self,
1115                    next: pos,
1116                    limit: Pos { high: self.high.len(), low: self.low.len(), },
1117                };
1118            }
1119            pos.high += 1; pos.low += 1;
1120        }
1121
1122        // The successor has a greater high part, so we continue iterating until we find it.
1123        while pos.high < self.high.len() {
1124            if self.high.get(pos.high) {
1125                return Self::OneIter {
1126                    parent: self,
1127                    next: pos,
1128                    limit: Pos { high: self.high.len(), low: self.low.len(), },
1129                };
1130            }
1131            pos.high += 1;
1132        }
1133
1134        Self::OneIter::empty_iter(self)
1135    }
1136}
1137
1138//-----------------------------------------------------------------------------
1139
1140impl Serialize for SparseVector {
1141    fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1142        self.len.serialize(writer)?;
1143        Ok(())
1144    }
1145
1146    fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
1147        self.high.serialize(writer)?;
1148        self.low.serialize(writer)?;
1149        Ok(())
1150    }
1151
1152    fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
1153        let len = usize::load(reader)?;
1154        let mut high = BitVector::load(reader)?;
1155        let low = IntVector::load(reader)?;
1156
1157        // Enable support structures, because the data may be from a library that does not know
1158        // how to build them.
1159        high.enable_select();
1160        high.enable_select_zero();
1161
1162        // Sanity checks.
1163        if low.len() != high.count_ones() {
1164            return Err(Error::new(ErrorKind::InvalidData, "Inconsistent number of set bits"));
1165        }
1166        if high.len() != low.len() + SparseBuilder::get_buckets(len, low.width()){
1167            return Err(Error::new(ErrorKind::InvalidData, "Invalid number of buckets"));
1168        }
1169
1170        let result = SparseVector {
1171            len, high, low,
1172        };
1173        Ok(result)
1174    }
1175
1176    fn size_in_elements(&self) -> usize {
1177        self.len.size_in_elements() +
1178        self.high.size_in_elements() +
1179        self.low.size_in_elements()
1180    }
1181}
1182
1183//-----------------------------------------------------------------------------