simple_sds_sbwt/rl_vector/
index.rs

1//! An index for speeding up rank/select queries in [`RLVector`].
2
3use super::*;
4
5use std::ops::Range;
6
7//-----------------------------------------------------------------------------
8
9/// An index for narrowing down a range before binary search.
10///
11/// `SampleIndex` takes a strictly increasing sequence of `n` values in `0..universe`.
12/// The first value must be `0`.
13/// The index chooses a divisor that partitions the universe into approximately `n / 8` ranges.
14/// Each range `i` contains the values in range `(i * divisor)..((i + 1) * divisor)`.
15/// Given a value `x`, we can restrict the binary search to the range `self.range(x)` of values.
16///
17/// # Examples
18///
19/// ```
20/// use simple_sds_sbwt::int_vector::IntVector;
21/// use simple_sds_sbwt::ops::{Vector, Access};
22/// use simple_sds_sbwt::rl_vector::index::SampleIndex;
23///
24/// let values: Vec<usize> = vec![0, 33, 124, 131, 224, 291, 322, 341, 394, 466, 501];
25/// let index = SampleIndex::new(values.iter().copied(), 540);
26///
27/// let range = index.range(300);
28/// assert!(values[range.start] <= 300);
29/// let upper_bound = *values.get(range.end).unwrap_or(&540);
30/// assert!(upper_bound > 300);
31/// ```
32///
33/// # Notes
34///
35/// * This is a simple support structure not intended to be serialized.
36#[derive(Clone, Debug, PartialEq, Eq)]
37pub struct SampleIndex {
38    num_values: usize,
39    divisor: usize,
40    samples: IntVector,
41}
42
43impl SampleIndex {
44    /// Ratio of number of values to number of samples.
45    pub const RATIO: usize = 8;
46
47    /// Returns a `SampleIndex` for the given values.
48    ///
49    /// # Arguments
50    ///
51    /// * `values`: A strictly increasing sequence of values starting from `0`.
52    /// * `universe`: Universe size. The values must be in range `0..universe`.
53    ///
54    /// # Panics
55    ///
56    /// Panics if the first value is not `0`, the sequence of values is not strictly increasing, or if universe size is too small for the values.
57    pub fn new<T: Iterator<Item = usize> + ExactSizeIterator>(iter: T, universe: usize) -> Self {
58        let mut iter = iter;
59        let len = iter.len();
60        if len == 0 || universe == 0 {
61            return SampleIndex {
62                num_values: 0,
63                divisor: usize::MAX,
64                samples: IntVector::with_len(1, 1, 0).unwrap(),
65            }
66        }
67
68        let (num_samples, divisor) = Self::parameters(len, universe);
69        let width = bits::bit_len((len - 1) as u64);
70        let mut samples = IntVector::with_len(num_samples, width, 0).unwrap();
71
72        // Invariant: `values[samples[i]] <= i * divisor` and `values[samples[i] + 1] > i * divisor`.
73        let mut offset = 0;
74        let mut prev = iter.next().unwrap();
75        let mut next = iter.next();
76        assert_eq!(prev, 0, "SampleIndex::new(): The initial value must be 0");
77        for sample in 1..samples.len() {
78            let threshold = sample * divisor;
79            while next.is_some() {
80                let value = next.unwrap();
81                if value > threshold {
82                    break;
83                }
84                assert!(prev < value, "SampleIndex::new(): The values must be strictly increasing");
85                offset += 1;
86                prev = value;
87                next = iter.next();
88            }
89            samples.set(sample, offset as u64);
90        }
91        assert!(universe > prev, "SampleIndex::new(): Universe size ({}) must be larger than the last value ({})", universe, prev);
92
93        SampleIndex {
94            num_values: len,
95            divisor,
96            samples,
97        }
98    }
99
100    /// Returns a range that will contain the given value if it is present in the values.
101    ///
102    /// If `value < universe`, the range guarantees the following invariants:
103    ///
104    /// * `values[range.start] <= value`
105    /// * `values.get(range.end).unwrap_or(universe) > value`
106    pub fn range(&self, value: usize) -> Range<usize> {
107        let offset = value / self.divisor;
108        let start = self.samples.get_or(offset, self.num_values as u64) as usize;
109        let mut limit = self.samples.get_or(offset + 1, self.num_values as u64) as usize;
110        // We need + 1 because the next sample may be too small for some values in the range.
111        if limit < self.num_values {
112            limit += 1;
113        }
114        start..limit
115    }
116
117    // Returns `(samples, divisor)` such that `i / divisor` for `i in 0..universe` maps evenly to `0..samples`.
118    fn parameters(values: usize, universe: usize) -> (usize, usize) {
119        let num_samples = bits::div_round_up(values, Self::RATIO);
120        let divisor = bits::div_round_up(universe, num_samples);
121        let num_samples = bits::div_round_up(universe, divisor);
122        (num_samples, divisor)
123    }
124}
125
126//-----------------------------------------------------------------------------
127
128#[cfg(test)]
129mod tests {
130    use super::*;
131
132    use rand::Rng;
133    use rand::rngs::ThreadRng;
134
135    fn check_parameters(values: usize, universe: usize) {
136        let (samples, divisor) = SampleIndex::parameters(values, universe);
137        assert!((samples - 1) * divisor < universe, "Divisor {} is too large with {} samples and universe size {}", divisor, samples, universe);
138        assert!(samples * divisor >= universe, "Divisor {} is too small with {} samples and universe size {}", divisor, samples, universe);
139        assert_eq!((universe - 1) / divisor, samples - 1, "Last value {} does not map to to last sample {} with {} values", universe - 1, samples - 1, values);
140    }
141
142    fn generate_values(universe: usize, density: usize, rng: &mut ThreadRng) -> Vec<usize> {
143        let mut values: Vec<usize> = Vec::new();
144        let mut last = 0;
145        while last < universe {
146            values.push(last);
147            last += 1 + rng.gen_range(0..density);
148        }
149        values
150    }
151
152    #[test]
153    fn parameters() {
154        let mut rng = rand::thread_rng();
155        let mut values: Vec<usize> = (1..10).collect();
156        for _ in 0..100 {
157            values.push(rng.gen_range(1..10000));
158        }
159
160        for v in values {
161            let mut universe: Vec<usize> = vec![v, v + 1, v * SampleIndex::RATIO - 1, v * SampleIndex::RATIO, v * SampleIndex::RATIO + 1];
162            for _ in 0..10 {
163                universe.push(v * rng.gen_range(1..10) + rng.gen_range(0..v));
164            }
165            for u in universe {
166                check_parameters(v, u);
167            }
168        }
169    }
170
171    #[test]
172    fn sample_index() {
173        let mut rng = rand::thread_rng();
174        for _ in 0..10 {
175            let universe = rng.gen_range(10_000..100_000);
176            let density = rng.gen_range(2..1000);
177            let values = generate_values(universe, density, &mut rng);
178            let index = SampleIndex::new(values.iter().copied(), universe);
179            for _ in 0..1000 {
180                let query = rng.gen_range(0..universe);
181                let range = index.range(query);
182                let start = values[range.start];
183                assert!(start <= query, "Range start values[{}] = {} is too large for query {} (universe size {}, density {})", range.start, start, query, universe, density);
184                let end = if range.end >= values.len() { universe } else { values[range.end] };
185                assert!(end > query, "Range end values[{}] = {} is too small for query {} (universe size {}, density {})", range.end, end, query, universe, density);
186            }
187        }
188    }
189}
190
191//-----------------------------------------------------------------------------