vers_vecs/wavelet/mod.rs
1//! This module contains the implementation of a [wavelet matrix].
2//! The wavelet matrix is a data structure that encodes a sequence of `n` `k`-bit words
3//! into a matrix of bit vectors, allowing for fast rank and select queries on the encoded sequence.
4//!
5//! This implementation further supports quantile queries, and range-predecessor and -successor queries.
6//! All operations are `O(k)` time complexity.
7//!
8//! See the struct documentation for more information.
9//!
10//! [wavelet matrix]: WaveletMatrix
11
12use crate::util::impl_vector_iterator;
13use crate::{BitVec, RsVec};
14use std::mem;
15use std::ops::Range;
16
17/// A wavelet matrix implementation implemented as described in
18/// [Navarro and Claude, 2021](http://dx.doi.org/10.1007/978-3-642-34109-0_18).
19/// The implementation is designed to allow for extremely large alphabet sizes, without
20/// sacrificing performance for small alphabets.
21///
22/// There are two constructor algorithms available:
23/// - [`from_bit_vec`] and [`from_slice`] construct the wavelet matrix by repeatedly sorting the elements.
24/// These constructors have linear space overhead and run in `O(kn * log n)` time complexity.
25/// - [`from_bit_vec_pc`] and [`from_slice_pc`] construct the wavelet matrix by counting the
26/// prefixes of the elements. These constructors have a space complexity of `O(2^k)` and run
27/// in `O(kn)`, which makes this constructor preferable for large sequences over small alphabets.
28///
29/// They encode a sequence of `n` `k`-bit words into a wavelet matrix which supports constant-time
30/// rank and select queries on elements of its `k`-bit alphabet.
31/// All query functions are mirrored for both `BitVec` and `u64` query elements, so
32/// if `k <= 64`, no heap allocation is needed for the query element.
33///
34/// Other than rank and select queries, the matrix also supports quantile queries (range select i), and
35/// range-predecessor and -successor queries, all of which are loosely based on
36/// [Külekci and Thankachan](https://doi.org/10.1016/j.jda.2017.01.002) with better time complexity.
37///
38/// All operations implemented on the matrix are `O(k)` time complexity.
39/// The space complexity of the wavelet matrix is `O(n * k)` with a small linear overhead
40/// (see [`RsVec`]).
41///
42/// # Examples
43/// ```
44/// use vers_vecs::{BitVec, WaveletMatrix};
45///
46/// // pack elements from a 3-bit alphabet into a bit vector and construct a wavelet matrix from them
47/// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
48/// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
49///
50/// // query the wavelet matrix
51/// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
52/// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
53///
54// // rank and select queries
55/// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
56/// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
57/// assert_eq!(wavelet_matrix.select_u64(0, 7), Some(5));
58///
59/// // statistics
60/// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
61/// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 3), Some(2));
62/// ```
63///
64/// [`RsVec`]: RsVec
65#[derive(Clone, Debug)]
66#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
67pub struct WaveletMatrix {
68 data: Box<[RsVec]>,
69}
70
71impl WaveletMatrix {
72 /// Generic constructor that constructs the wavelet matrix by repeatedly sorting the elements.
73 /// The runtime complexity is `O(kn * log n)`.
74 ///
75 /// # Parameters
76 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 1 << 16.
77 /// - `num_elements`: The number of elements in the sequence.
78 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
79 #[inline(always)] // should get rid of closures in favor of static calls
80 fn permutation_sorting<LOOKUP: Fn(usize, usize) -> u64>(
81 bits_per_element: u16,
82 num_elements: usize,
83 bit_lookup: LOOKUP,
84 ) -> Self {
85 let element_len = bits_per_element as usize;
86
87 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
88
89 // insert the first bit of each word into the first bit vector
90 // for each following level, insert the next bit of each word into the next bit vector
91 // sorted stably by the previous bit vector
92 let mut permutation = (0..num_elements).collect::<Vec<_>>();
93 let mut next_permutation = vec![0; num_elements];
94
95 for (level, data) in data.iter_mut().enumerate() {
96 let mut total_zeros = 0;
97 for (i, p) in permutation.iter().enumerate() {
98 if bit_lookup(*p, element_len - level - 1) == 0 {
99 total_zeros += 1;
100 } else {
101 data.set(i, 1).unwrap();
102 }
103 }
104
105 // scan through the generated bit array and move the elements to the correct position
106 // for the next permutation
107 if level < element_len - 1 {
108 let mut zero_boundary = 0;
109 let mut one_boundary = total_zeros;
110 for (i, p) in permutation.iter().enumerate() {
111 if data.get_unchecked(i) == 0 {
112 next_permutation[zero_boundary] = *p;
113 zero_boundary += 1;
114 } else {
115 next_permutation[one_boundary] = *p;
116 one_boundary += 1;
117 }
118 }
119
120 mem::swap(&mut permutation, &mut next_permutation);
121 }
122 }
123
124 Self {
125 data: data.into_iter().map(BitVec::into).collect(),
126 }
127 }
128
129 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
130 /// The constructor runs in `O(kn * log n)` time complexity.
131 ///
132 /// # Parameters
133 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
134 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
135 /// significant bit.
136 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
137 ///
138 /// # Panics
139 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element.
140 #[must_use]
141 pub fn from_bit_vec(bit_vec: &BitVec, bits_per_element: u16) -> Self {
142 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
143 let num_elements = bit_vec.len() / bits_per_element as usize;
144 Self::permutation_sorting(bits_per_element, num_elements, |element, bit| {
145 bit_vec.get_unchecked(element * bits_per_element as usize + bit)
146 })
147 }
148
149 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
150 /// The constructor runs in `O(kn * log n)` time complexity.
151 ///
152 /// # Parameters
153 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
154 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
155 ///
156 /// # Panics
157 /// Panics if the number of bits per element exceeds 64.
158 #[must_use]
159 pub fn from_slice(sequence: &[u64], bits_per_element: u16) -> Self {
160 assert!(
161 bits_per_element <= 64,
162 "The number of bits per element cannot exceed 64."
163 );
164 Self::permutation_sorting(bits_per_element, sequence.len(), |element, bit| {
165 (sequence[element] >> bit) & 1
166 })
167 }
168
169 /// Generic constructor that constructs the wavelet matrix by counting the prefixes of the elements.
170 /// The runtime complexity is `O(kn)`.
171 /// This constructor is only recommended for small alphabets.
172 ///
173 /// # Parameters
174 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 64.
175 /// - `num_elements`: The number of elements in the sequence.
176 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
177 /// - `element_lookup`: A closure that returns the `element`-th word.
178 #[inline(always)] // should get rid of closures in favor of static calls
179 fn prefix_counting<LOOKUP: Fn(usize, usize) -> u64, ELEMENT: Fn(usize) -> u64>(
180 bits_per_element: u16,
181 num_elements: usize,
182 bit_lookup: LOOKUP,
183 element_lookup: ELEMENT,
184 ) -> Self {
185 let element_len = bits_per_element as usize;
186 let mut histogram = vec![0usize; 1 << bits_per_element];
187 let mut borders = vec![0usize; 1 << bits_per_element];
188 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
189
190 for i in 0..num_elements {
191 histogram[element_lookup(i) as usize] += 1;
192 data[0].set_unchecked(i, bit_lookup(i, element_len - 1));
193 }
194
195 for level in (1..element_len).rev() {
196 // combine histograms of prefixes
197 for h in 0..1 << level {
198 histogram[h] = histogram[2 * h] + histogram[2 * h + 1];
199 }
200
201 // compute borders of current level, using bit reverse patterns because of the weird
202 // node ordering in wavelet matrices
203 borders[0] = 0;
204 for h in 1usize..1 << level {
205 let h_minus_1 = (h - 1).reverse_bits() >> (64 - level);
206 borders[h.reverse_bits() >> (64 - level)] =
207 borders[h_minus_1] + histogram[h_minus_1];
208 }
209
210 for i in 0..num_elements {
211 let bit = bit_lookup(i, element_len - level - 1);
212 data[level].set_unchecked(
213 borders[element_lookup(i) as usize >> (element_len - level)],
214 bit,
215 );
216 borders[element_lookup(i) as usize >> (element_len - level)] += 1;
217 }
218 }
219
220 Self {
221 data: data.into_iter().map(BitVec::into).collect(),
222 }
223 }
224
225 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
226 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
227 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
228 /// so it is only recommended for small alphabets.
229 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
230 ///
231 /// # Parameters
232 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
233 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
234 /// significant bit.
235 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
236 ///
237 /// # Panics
238 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element,
239 /// or if the number of bits per element exceeds 64.
240 ///
241 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
242 /// [`from_slice`]: WaveletMatrix::from_slice
243 #[must_use]
244 pub fn from_bit_vec_pc(bit_vec: &BitVec, bits_per_element: u16) -> Self {
245 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
246 assert!(
247 bits_per_element <= 64,
248 "The number of bits per element cannot exceed 64."
249 );
250 let num_elements = bit_vec.len() / bits_per_element as usize;
251 Self::prefix_counting(
252 bits_per_element,
253 num_elements,
254 |element, bit| bit_vec.get_unchecked(element * bits_per_element as usize + bit),
255 |element| {
256 bit_vec.get_bits_unchecked(
257 element * bits_per_element as usize,
258 bits_per_element as usize,
259 )
260 },
261 )
262 }
263
264 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
265 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
266 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
267 /// so it is only recommended for small alphabets.
268 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
269 ///
270 /// # Parameters
271 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
272 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
273 ///
274 /// # Panics
275 /// Panics if the number of bits per element exceeds 64.
276 ///
277 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
278 /// [`from_slice`]: WaveletMatrix::from_slice
279 #[must_use]
280 pub fn from_slice_pc(sequence: &[u64], bits_per_element: u16) -> Self {
281 assert!(
282 bits_per_element <= 64,
283 "The number of bits per element cannot exceed 64."
284 );
285 Self::prefix_counting(
286 bits_per_element,
287 sequence.len(),
288 |element, bit| (sequence[element] >> bit) & 1,
289 |element| sequence[element],
290 )
291 }
292
293 /// Generic function to read a value from the wavelet matrix and consume it with a closure.
294 /// The function is used by the `get_value` and `get_u64` functions, deduplicating code.
295 #[inline(always)]
296 fn reconstruct_value_unchecked<F: FnMut(u64)>(&self, mut i: usize, mut target_func: F) {
297 for level in 0..self.bits_per_element() {
298 let bit = self.data[level].get_unchecked(i);
299 target_func(bit);
300 if bit == 0 {
301 i = self.data[level].rank0(i);
302 } else {
303 i = self.data[level].rank0 + self.data[level].rank1(i);
304 }
305 }
306 }
307
308 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
309 /// The `k`-bit word is returned as a `BitVec`.
310 /// The first element of the bit vector is the least significant bit.
311 ///
312 /// Returns `None` if the index is out of bounds.
313 ///
314 /// # Example
315 /// ```
316 /// use vers_vecs::{BitVec, WaveletMatrix};
317 ///
318 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
319 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
320 ///
321 /// assert_eq!(wavelet_matrix.get_value(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
322 /// assert_eq!(wavelet_matrix.get_value(1), Some(BitVec::pack_sequence_u8(&[4], 3)));
323 /// assert_eq!(wavelet_matrix.get_value(100), None);
324 /// ```
325 #[must_use]
326 pub fn get_value(&self, i: usize) -> Option<BitVec> {
327 if self.data.is_empty() || i >= self.data[0].len() {
328 None
329 } else {
330 Some(self.get_value_unchecked(i))
331 }
332 }
333
334 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
335 /// The `k`-bit word is returned as a `BitVec`.
336 /// The first element of the bit vector is the least significant bit.
337 /// This function does not perform bounds checking.
338 /// Use [`get_value`] for a checked version.
339 ///
340 /// # Panics
341 /// May panic if the index is out of bounds. May instead return an empty bit vector.
342 ///
343 /// [`get_value`]: WaveletMatrix::get_value
344 #[must_use]
345 pub fn get_value_unchecked(&self, i: usize) -> BitVec {
346 let mut value = BitVec::from_zeros(self.bits_per_element());
347 let mut level = self.bits_per_element() - 1;
348 self.reconstruct_value_unchecked(i, |bit| {
349 value.set_unchecked(level, bit);
350 level = level.saturating_sub(1);
351 });
352 value
353 }
354
355 /// Get the `i`-th element of the encoded sequence as a `u64`.
356 /// The `u64` is constructed from the `k`-bit word stored in the wavelet matrix.
357 ///
358 /// Returns `None` if the index is out of bounds, or if the number of bits per element exceeds 64.
359 ///
360 /// # Example
361 /// ```
362 /// use vers_vecs::{BitVec, WaveletMatrix};
363 ///
364 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
365 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
366 ///
367 /// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
368 /// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
369 /// assert_eq!(wavelet_matrix.get_u64(100), None);
370 /// ```
371 #[must_use]
372 pub fn get_u64(&self, i: usize) -> Option<u64> {
373 if self.bits_per_element() > 64 || self.data.is_empty() || i >= self.data[0].len() {
374 None
375 } else {
376 Some(self.get_u64_unchecked(i))
377 }
378 }
379
380 /// Get the `i`-th element of the encoded sequence as a `u64` numeral.
381 /// The element is encoded in the lowest `k` bits of the numeral.
382 /// If the number of bits per element exceeds 64, the value is truncated.
383 /// This function does not perform bounds checking.
384 /// Use [`get_u64`] for a checked version.
385 ///
386 /// # Panic
387 /// May panic if the value of `i` is out of bounds. May instead return 0.
388 ///
389 /// [`get_u64`]: WaveletMatrix::get_u64
390 #[must_use]
391 pub fn get_u64_unchecked(&self, i: usize) -> u64 {
392 let mut value = 0;
393 self.reconstruct_value_unchecked(i, |bit| {
394 value <<= 1;
395 value |= bit;
396 });
397 value
398 }
399
400 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
401 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
402 /// where the least significant bit is the first element, and `k` is the number of bits per element
403 /// in the wavelet matrix.
404 ///
405 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
406 /// `k`-bit word.
407 /// Use [`rank_range`] for a checked version.
408 ///
409 /// # Panics
410 /// May panic if the `range` is out of bounds,
411 /// or if the number of bits in `symbol` is lower than `k`.
412 /// May instead return 0.
413 ///
414 /// [`BitVec`]: BitVec
415 /// [`rank_range`]: WaveletMatrix::rank_range
416 #[must_use]
417 pub fn rank_range_unchecked(&self, mut range: Range<usize>, symbol: &BitVec) -> usize {
418 for (level, data) in self.data.iter().enumerate() {
419 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
420 range.start = data.rank0(range.start);
421 range.end = data.rank0(range.end);
422 } else {
423 range.start = data.rank0 + data.rank1(range.start);
424 range.end = data.rank0 + data.rank1(range.end);
425 }
426 }
427
428 range.end - range.start
429 }
430
431 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
432 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
433 /// where the least significant bit is the first element, and `k` is the number of bits per element
434 /// in the wavelet matrix.
435 ///
436 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
437 /// but since it is exclusive, it may be equal to the length),
438 /// or if the number of bits in `symbol` is not equal to `k`.
439 ///
440 /// # Example
441 /// ```
442 /// use vers_vecs::{BitVec, WaveletMatrix};
443 ///
444 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
445 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
446 ///
447 /// assert_eq!(wavelet_matrix.rank_range(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
448 /// assert_eq!(wavelet_matrix.rank_range(2..4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
449 /// ```
450 ///
451 /// [`BitVec`]: BitVec
452 #[must_use]
453 pub fn rank_range(&self, range: Range<usize>, symbol: &BitVec) -> Option<usize> {
454 if range.start >= self.len()
455 || range.end > self.len()
456 || symbol.len() != self.bits_per_element()
457 {
458 None
459 } else {
460 Some(self.rank_range_unchecked(range, symbol))
461 }
462 }
463
464 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
465 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
466 /// where k is less than or equal to 64.
467 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
468 ///
469 /// This method does not perform bounds checking, nor does it check if the elements of the
470 /// wavelet matrix can be represented in a u64 numeral.
471 /// Use [`rank_range_u64`] for a checked version.
472 ///
473 /// # Panics
474 /// May panic if the `range` is out of bounds.
475 /// May instead return 0.
476 /// If the number of bits in wavelet matrix elements exceed `64`, the behavior is
477 /// platform-dependent.
478 ///
479 /// [`rank_range_u64`]: WaveletMatrix::rank_range_u64
480 #[must_use]
481 pub fn rank_range_u64_unchecked(&self, mut range: Range<usize>, symbol: u64) -> usize {
482 for (level, data) in self.data.iter().enumerate() {
483 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
484 range.start = data.rank0(range.start);
485 range.end = data.rank0(range.end);
486 } else {
487 range.start = data.rank0 + data.rank1(range.start);
488 range.end = data.rank0 + data.rank1(range.end);
489 }
490 }
491
492 range.end - range.start
493 }
494
495 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
496 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
497 /// where k is less than or equal to 64.
498 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
499 ///
500 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
501 /// but since it is exclusive, it may be equal to the length),
502 /// or if the number of bits in the wavelet matrix elements exceed `64`.
503 ///
504 /// # Example
505 /// ```
506 /// use vers_vecs::{BitVec, WaveletMatrix};
507 ///
508 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
509 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
510 ///
511 /// assert_eq!(wavelet_matrix.rank_range_u64(0..3, 4), Some(2));
512 /// assert_eq!(wavelet_matrix.rank_range_u64(2..4, 4), Some(1));
513 /// ```
514 #[must_use]
515 pub fn rank_range_u64(&self, range: Range<usize>, symbol: u64) -> Option<usize> {
516 if range.start >= self.len() || range.end > self.len() || self.bits_per_element() > 64 {
517 None
518 } else {
519 Some(self.rank_range_u64_unchecked(range, symbol))
520 }
521 }
522
523 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
524 /// `offset`-th and `i`-th element (exclusive).
525 /// This is equivalent to ```rank_range_unchecked(offset..i, symbol)```.
526 /// The interval is half-open, meaning ```rank_offset_unchecked(0, 0, symbol)``` returns 0.
527 ///
528 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
529 /// where the least significant bit is the first element, and `k` is the number of bits per element
530 /// in the wavelet matrix.
531 ///
532 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
533 /// `k`-bit word.
534 /// Use [`rank_offset`] for a checked version.
535 ///
536 /// # Panics
537 /// May panic if `offset` is out of bounds,
538 /// or if `offset + i` is larger than the length of the encoded sequence,
539 /// or if `offset` is greater than `i`,
540 /// or if the number of bits in `symbol` is lower than `k`.
541 /// May instead return 0.
542 ///
543 /// [`BitVec`]: BitVec
544 /// [`rank_offset`]: WaveletMatrix::rank_offset
545 #[must_use]
546 pub fn rank_offset_unchecked(&self, offset: usize, i: usize, symbol: &BitVec) -> usize {
547 self.rank_range_unchecked(offset..i, symbol)
548 }
549
550 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
551 /// `offset`-th and `i`-th element (exclusive).
552 /// This is equivalent to ``rank_range(offset..i, symbol)``.
553 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0,
554 /// because the interval is empty.
555 ///
556 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
557 /// where the least significant bit is the first element, and `k` is the number of bits per element
558 /// in the wavelet matrix.
559 ///
560 /// Returns `None` if `offset` is out of bounds,
561 /// or if `i` is larger than the length of the encoded sequence,
562 /// or if `offset` is greater than `i`,
563 /// or if the number of bits in `symbol` is not equal to `k`.
564 /// `i` may equal the length of the encoded sequence,
565 /// which will return the number of occurrences of the symbol up to the end of the sequence.
566 ///
567 /// # Example
568 /// ```
569 /// use vers_vecs::{BitVec, WaveletMatrix};
570 ///
571 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
572 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
573 ///
574 /// assert_eq!(wavelet_matrix.rank_offset(0, 3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
575 /// assert_eq!(wavelet_matrix.rank_offset(2, 4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
576 /// ```
577 ///
578 /// [`BitVec`]: BitVec
579 #[must_use]
580 pub fn rank_offset(&self, offset: usize, i: usize, symbol: &BitVec) -> Option<usize> {
581 if offset > i
582 || offset >= self.len()
583 || i > self.len()
584 || symbol.len() != self.bits_per_element()
585 {
586 None
587 } else {
588 Some(self.rank_offset_unchecked(offset, i, symbol))
589 }
590 }
591
592 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
593 /// `offset`-th and `i`-th element (exclusive).
594 /// This is equivalent to ``rank_range(offset..i, symbol)``.
595 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
596 ///
597 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
598 /// where k is less than or equal to 64.
599 ///
600 /// This method does not perform bounds checking, nor does it check if the elements of the
601 /// wavelet matrix can be represented in a u64 numeral.
602 /// Use [`rank_offset_u64`] for a checked version.
603 ///
604 /// # Panics
605 /// May panic if `offset` is out of bounds,
606 /// or if `i` is larger than the length of the encoded sequence,
607 /// or if `offset` is greater than `i`,
608 /// or if the number of bits in wavelet matrix elements exceed `64`.
609 /// May instead return 0.
610 ///
611 /// [`rank_offset_u64`]: WaveletMatrix::rank_offset_u64
612 #[must_use]
613 pub fn rank_offset_u64_unchecked(&self, offset: usize, i: usize, symbol: u64) -> usize {
614 self.rank_range_u64_unchecked(offset..i, symbol)
615 }
616
617 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
618 /// `offset`-th and `i`-th element (exclusive).
619 /// This is equivalent to ``rank_range(offset..i, symbol)``.
620 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
621 ///
622 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
623 /// where k is less than or equal to 64.
624 ///
625 /// Returns `None` if `offset` is out of bounds,
626 /// or if `i` is larger than the length of the encoded sequence,
627 /// or if `offset` is greater than `i`,
628 /// or if the number of bits in the wavelet matrix elements exceed `64`.
629 /// `i` may equal the length of the encoded sequence,
630 /// which will return the number of occurrences of the symbol up to the end of the sequence.
631 ///
632 /// # Example
633 /// ```
634 /// use vers_vecs::{BitVec, WaveletMatrix};
635 ///
636 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
637 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
638 ///
639 /// assert_eq!(wavelet_matrix.rank_offset_u64(0, 3, 4), Some(2));
640 /// assert_eq!(wavelet_matrix.rank_offset_u64(2, 4, 4), Some(1));
641 /// ```
642 #[must_use]
643 pub fn rank_offset_u64(&self, offset: usize, i: usize, symbol: u64) -> Option<usize> {
644 if offset > i || offset >= self.len() || i > self.len() || self.bits_per_element() > 64 {
645 None
646 } else {
647 Some(self.rank_offset_u64_unchecked(offset, i, symbol))
648 }
649 }
650
651 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
652 /// element (exclusive).
653 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
654 /// where the least significant bit is the first element, and `k` is the number of bits per element
655 /// in the wavelet matrix.
656 ///
657 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
658 /// `k`-bit word.
659 /// Use [`rank`] for a checked version.
660 ///
661 /// # Panics
662 /// May panic if `i` is out of bounds, or if the number of bits in `symbol` is lower than `k`.
663 /// May instead return 0.
664 /// If the number of bits in `symbol` exceeds `k`, the remaining bits are ignored.
665 ///
666 /// [`BitVec`]: BitVec
667 /// [`rank`]: WaveletMatrix::rank
668 #[must_use]
669 pub fn rank_unchecked(&self, i: usize, symbol: &BitVec) -> usize {
670 self.rank_range_unchecked(0..i, symbol)
671 }
672
673 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
674 /// element (exclusive).
675 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
676 /// where the least significant bit is the first element, and `k` is the number of bits per element
677 /// in the wavelet matrix.
678 ///
679 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
680 /// since it is exclusive, it may be equal to the length),
681 /// or if the number of bits in `symbol` is not equal to `k`.
682 ///
683 /// # Example
684 /// ```
685 /// use vers_vecs::{BitVec, WaveletMatrix};
686 ///
687 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
688 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
689 ///
690 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
691 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[1], 3)), Some(1));
692 /// ```
693 ///
694 /// [`BitVec`]: BitVec
695 #[must_use]
696 pub fn rank(&self, i: usize, symbol: &BitVec) -> Option<usize> {
697 if i > self.len() || symbol.len() != self.bits_per_element() {
698 None
699 } else {
700 Some(self.rank_range_unchecked(0..i, symbol))
701 }
702 }
703
704 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
705 /// element (exclusive).
706 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
707 /// where k is less than or equal to 64.
708 ///
709 /// This method does not perform bounds checking, nor does it check if the elements of the
710 /// wavelet matrix can be represented in a u64 numeral.
711 /// Use [`rank_u64`] for a checked version.
712 ///
713 /// # Panics
714 /// May panic if `i` is out of bounds,
715 /// or if the number of bits in wavelet matrix elements exceed `64`.
716 /// May instead return 0.
717 ///
718 /// [`rank_u64`]: WaveletMatrix::rank_u64
719 #[must_use]
720 pub fn rank_u64_unchecked(&self, i: usize, symbol: u64) -> usize {
721 self.rank_range_u64_unchecked(0..i, symbol)
722 }
723
724 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
725 /// element (exclusive).
726 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
727 /// where k is less than or equal to 64.
728 ///
729 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
730 /// since it is exclusive, it may be equal to the length),
731 /// or if the number of bits in the wavelet matrix elements exceed `64`.
732 ///
733 /// # Example
734 /// ```
735 /// use vers_vecs::{BitVec, WaveletMatrix};
736 ///
737 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
738 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
739 ///
740 /// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
741 /// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
742 /// ```
743 #[must_use]
744 pub fn rank_u64(&self, i: usize, symbol: u64) -> Option<usize> {
745 if i > self.len() || self.bits_per_element() > 64 {
746 None
747 } else {
748 Some(self.rank_range_u64_unchecked(0..i, symbol))
749 }
750 }
751
752 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
753 /// starting from the `offset`-th element.
754 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
755 /// where the least significant bit is the first element, and `k` is the number of bits per element
756 /// in the wavelet matrix.
757 ///
758 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
759 /// `k`-bit word.
760 /// Use [`select_offset`] for a checked version.
761 ///
762 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
763 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
764 ///
765 /// # Panics
766 /// May panic if the `offset` is out of bounds,
767 /// or if the number of bits in `symbol` is lower than `k`.
768 /// May instead return the length of the encoded sequence.
769 ///
770 /// [`BitVec`]: BitVec
771 /// [`select_offset`]: WaveletMatrix::select_offset
772 #[must_use]
773 pub fn select_offset_unchecked(&self, offset: usize, rank: usize, symbol: &BitVec) -> usize {
774 let mut range_start = offset;
775
776 for (level, data) in self.data.iter().enumerate() {
777 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
778 range_start = data.rank0(range_start);
779 } else {
780 range_start = data.rank0 + data.rank1(range_start);
781 }
782 }
783
784 let mut range_end = range_start + rank;
785
786 for (level, data) in self.data.iter().enumerate().rev() {
787 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
788 range_end = data.select0(range_end);
789 } else {
790 range_end = data.select1(range_end - data.rank0);
791 }
792 }
793
794 range_end
795 }
796
797 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
798 /// starting from the `offset`-th element.
799 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
800 /// where the least significant bit is the first element, and `k` is the number of bits per element
801 /// in the wavelet matrix.
802 ///
803 /// Returns `None` if `offset` is out of bounds, or if the number of bits in `symbol` is not equal to `k`,
804 /// or if the `rank`-th occurrence of the `symbol` does not exist.
805 ///
806 /// # Example
807 /// ```
808 /// use vers_vecs::{BitVec, WaveletMatrix};
809 ///
810 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
811 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
812 ///
813 /// assert_eq!(wavelet_matrix.select_offset(0, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
814 /// assert_eq!(wavelet_matrix.select_offset(0, 1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
815 /// assert_eq!(wavelet_matrix.select_offset(2, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
816 /// assert_eq!(wavelet_matrix.select_offset(2, 1, &BitVec::pack_sequence_u8(&[4], 3)), None);
817 /// ```
818 ///
819 /// [`BitVec`]: BitVec
820 #[must_use]
821 pub fn select_offset(&self, offset: usize, rank: usize, symbol: &BitVec) -> Option<usize> {
822 if offset >= self.len() || symbol.len() != self.bits_per_element() {
823 None
824 } else {
825 let idx = self.select_offset_unchecked(offset, rank, symbol);
826 if idx < self.len() {
827 Some(idx)
828 } else {
829 None
830 }
831 }
832 }
833
834 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
835 /// starting from the `offset`-th element.
836 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
837 /// where k is less than or equal to 64.
838 ///
839 /// This method does not perform bounds checking, nor does it check if the elements of the
840 /// wavelet matrix can be represented in a u64 numeral.
841 /// Use [`select_offset_u64`] for a checked version.
842 ///
843 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
844 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
845 ///
846 /// # Panics
847 /// May panic if the `offset` is out of bounds,
848 /// or if the number of bits in wavelet matrix elements exceed `64`.
849 /// May instead return the length of the encoded sequence.
850 ///
851 /// [`select_offset_u64`]: WaveletMatrix::select_offset_u64
852 #[must_use]
853 pub fn select_offset_u64_unchecked(&self, offset: usize, rank: usize, symbol: u64) -> usize {
854 let mut range_start = offset;
855
856 for (level, data) in self.data.iter().enumerate() {
857 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
858 range_start = data.rank0(range_start);
859 } else {
860 range_start = data.rank0 + data.rank1(range_start);
861 }
862 }
863
864 let mut range_end = range_start + rank;
865
866 for (level, data) in self.data.iter().enumerate().rev() {
867 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
868 range_end = data.select0(range_end);
869 } else {
870 range_end = data.select1(range_end - data.rank0);
871 }
872 }
873
874 range_end
875 }
876
877 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
878 /// starting from the `offset`-th element.
879 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
880 /// where k is less than or equal to 64.
881 ///
882 /// Returns `None` if `offset` is out of bounds, or if the number of bits in the wavelet matrix
883 /// elements exceed `64`, or if the `rank`-th occurrence of the `symbol` does not exist.
884 ///
885 /// # Example
886 /// ```
887 /// use vers_vecs::{BitVec, WaveletMatrix};
888 ///
889 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
890 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
891 ///
892 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 0, 4), Some(1));
893 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 1, 4), Some(2));
894 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 0, 4), Some(2));
895 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 1, 4), None);
896 /// ```
897 #[must_use]
898 pub fn select_offset_u64(&self, offset: usize, rank: usize, symbol: u64) -> Option<usize> {
899 if offset >= self.len() || self.bits_per_element() > 64 {
900 None
901 } else {
902 let idx = self.select_offset_u64_unchecked(offset, rank, symbol);
903 if idx < self.len() {
904 Some(idx)
905 } else {
906 None
907 }
908 }
909 }
910
911 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
912 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
913 /// where the least significant bit is the first element, and `k` is the number of bits per element
914 /// in the wavelet matrix.
915 ///
916 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
917 /// `k`-bit word.
918 /// Use [`select`] for a checked version.
919 ///
920 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
921 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
922 ///
923 /// # Panics
924 /// May panic if the number of bits in `symbol` is not equal to `k`.
925 /// May instead return the length of the encoded sequence.
926 ///
927 /// [`BitVec`]: BitVec
928 /// [`select`]: WaveletMatrix::select
929 #[must_use]
930 pub fn select_unchecked(&self, rank: usize, symbol: &BitVec) -> usize {
931 self.select_offset_unchecked(0, rank, symbol)
932 }
933
934 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
935 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
936 /// where the least significant bit is the first element, and `k` is the number of bits per element
937 /// in the wavelet matrix.
938 ///
939 /// Returns `None` if the number of bits in `symbol` is not equal to `k`,
940 /// or if the `rank`-th occurrence of the `symbol` does not exist.
941 ///
942 /// # Example
943 /// ```
944 /// use vers_vecs::{BitVec, WaveletMatrix};
945 ///
946 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
947 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
948 ///
949 /// assert_eq!(wavelet_matrix.select(0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
950 /// assert_eq!(wavelet_matrix.select(1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
951 /// ```
952 ///
953 /// [`BitVec`]: BitVec
954 #[must_use]
955 pub fn select(&self, rank: usize, symbol: &BitVec) -> Option<usize> {
956 if symbol.len() == self.bits_per_element() {
957 let idx = self.select_unchecked(rank, symbol);
958 if idx < self.len() {
959 Some(idx)
960 } else {
961 None
962 }
963 } else {
964 None
965 }
966 }
967
968 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
969 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
970 /// where k is less than or equal to 64.
971 ///
972 /// This method does not perform bounds checking, nor does it check if the elements of the
973 /// wavelet matrix can be represented in a u64 numeral.
974 /// Use [`select_u64`] for a checked version.
975 ///
976 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
977 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
978 ///
979 /// # Panics
980 /// May panic if the number of bits in wavelet matrix elements exceed `64`.
981 /// May instead return the length of the encoded sequence.
982 ///
983 /// [`select_u64`]: WaveletMatrix::select_u64
984 #[must_use]
985 pub fn select_u64_unchecked(&self, rank: usize, symbol: u64) -> usize {
986 self.select_offset_u64_unchecked(0, rank, symbol)
987 }
988
989 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
990 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
991 /// where k is less than or equal to 64.
992 ///
993 /// Returns `None` if the number of bits in the wavelet matrix elements exceed `64`,
994 /// or if the `rank`-th occurrence of the `symbol` does not exist.
995 ///
996 /// # Example
997 /// ```
998 /// use vers_vecs::{BitVec, WaveletMatrix};
999 ///
1000 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1001 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1002 ///
1003 /// assert_eq!(wavelet_matrix.select_u64(0, 4), Some(1));
1004 /// assert_eq!(wavelet_matrix.select_u64(1, 4), Some(2));
1005 /// ```
1006 #[must_use]
1007 pub fn select_u64(&self, rank: usize, symbol: u64) -> Option<usize> {
1008 if self.bits_per_element() > 64 {
1009 None
1010 } else {
1011 let idx = self.select_u64_unchecked(rank, symbol);
1012 if idx < self.len() {
1013 Some(idx)
1014 } else {
1015 None
1016 }
1017 }
1018 }
1019
1020 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1021 /// where `k = 0` returns the smallest element.
1022 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1023 /// The `k`-th smallest element is returned as a `BitVec`,
1024 /// where the least significant bit is the first element.
1025 ///
1026 /// This method does not perform bounds checking.
1027 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1028 /// Use [`quantile`] for a checked version.
1029 ///
1030 /// # Panics
1031 /// May panic if the `range` is out of bounds. May instead return an empty bit vector.
1032 ///
1033 /// [`quantile`]: WaveletMatrix::quantile
1034 #[must_use]
1035 pub fn quantile_unchecked(&self, range: Range<usize>, k: usize) -> BitVec {
1036 let result = BitVec::from_zeros(self.bits_per_element());
1037
1038 self.partial_quantile_search_unchecked(range, k, 0, result)
1039 }
1040
1041 /// Internal function to reuse the quantile code for the predecessor and successor search.
1042 /// This function performs the quantile search starting at the given level with the given prefix.
1043 ///
1044 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1045 /// and that the prefix is a valid prefix for the given level.
1046 #[inline(always)]
1047 fn partial_quantile_search_unchecked(
1048 &self,
1049 mut range: Range<usize>,
1050 mut k: usize,
1051 start_level: usize,
1052 mut prefix: BitVec,
1053 ) -> BitVec {
1054 debug_assert!(prefix.len() == self.bits_per_element());
1055 debug_assert!(!range.is_empty());
1056 debug_assert!(range.end <= self.len());
1057
1058 for (level, data) in self.data.iter().enumerate().skip(start_level) {
1059 let zeros_start = data.rank0(range.start);
1060 let zeros_end = data.rank0(range.end);
1061 let zeros = zeros_end - zeros_start;
1062
1063 // if k < zeros, the element is among the zeros
1064 if k < zeros {
1065 range.start = zeros_start;
1066 range.end = zeros_end;
1067 } else {
1068 // the element is among the ones, so we set the bit to 1, and move the range
1069 // into the 1-partition of the next level
1070 prefix.set_unchecked((self.bits_per_element() - 1) - level, 1);
1071 k -= zeros;
1072 range.start = data.rank0 + (range.start - zeros_start); // range.start - zeros_start is the rank1 of range.start
1073 range.end = data.rank0 + (range.end - zeros_end); // same here
1074 }
1075 }
1076
1077 prefix
1078 }
1079
1080 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1081 /// where `k = 0` returns the smallest element.
1082 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1083 /// The `k`-th smallest element is returned as a `BitVec`,
1084 /// where the least significant bit is the first element.
1085 ///
1086 /// Returns `None` if the `range` is out of bounds, or if `k` is greater than the size of the range.
1087 ///
1088 /// # Example
1089 /// ```
1090 /// use vers_vecs::{BitVec, WaveletMatrix};
1091 ///
1092 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1093 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1094 ///
1095 /// assert_eq!(wavelet_matrix.quantile(0..3, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1096 /// assert_eq!(wavelet_matrix.quantile(0..3, 1), Some(BitVec::pack_sequence_u8(&[4], 3)));
1097 /// assert_eq!(wavelet_matrix.quantile(1..4, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1098 /// ```
1099 #[must_use]
1100 pub fn quantile(&self, range: Range<usize>, k: usize) -> Option<BitVec> {
1101 if range.start >= self.len() || range.end > self.len() || k >= range.end - range.start {
1102 None
1103 } else {
1104 Some(self.quantile_unchecked(range, k))
1105 }
1106 }
1107
1108 /// Get the `i`-th smallest element in the entire wavelet matrix.
1109 /// The `i`-th smallest element is returned as a `BitVec`,
1110 /// where the least significant bit is the first element.
1111 ///
1112 /// This method does not perform bounds checking.
1113 /// Use [`get_sorted`] for a checked version.
1114 ///
1115 /// # Panics
1116 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1117 #[must_use]
1118 pub fn get_sorted_unchecked(&self, i: usize) -> BitVec {
1119 self.quantile_unchecked(0..self.len(), i)
1120 }
1121
1122 /// Get the `i`-th smallest element in the wavelet matrix.
1123 /// The `i`-th smallest element is returned as a `BitVec`,
1124 /// where the least significant bit is the first element.
1125 /// This method call is equivalent to `self.quantile(0..self.len(), i)`.
1126 ///
1127 /// Returns `None` if the `i` is out of bounds.
1128 ///
1129 /// # Example
1130 /// ```
1131 /// use vers_vecs::{BitVec, WaveletMatrix};
1132 ///
1133 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1134 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1135 ///
1136 /// assert_eq!(wavelet_matrix.get_sorted(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1137 /// assert_eq!(wavelet_matrix.get_sorted(1), Some(BitVec::pack_sequence_u8(&[1], 3)));
1138 /// assert_eq!(wavelet_matrix.get_sorted(2), Some(BitVec::pack_sequence_u8(&[2], 3)));
1139 /// ```
1140 #[must_use]
1141 pub fn get_sorted(&self, i: usize) -> Option<BitVec> {
1142 if i >= self.len() {
1143 None
1144 } else {
1145 Some(self.get_sorted_unchecked(i))
1146 }
1147 }
1148
1149 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1150 /// where `k = 0` returns the smallest element.
1151 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1152 /// The `k`-th smallest element is returned as a `u64` numeral.
1153 /// If the number of bits per element exceeds 64, the value is truncated.
1154 ///
1155 /// This method does not perform bounds checking.
1156 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1157 /// Use [`quantile_u64`] for a checked version.
1158 ///
1159 /// # Panics
1160 /// May panic if the `range` is out of bounds.
1161 /// May instead return 0.
1162 ///
1163 /// [`quantile_u64`]: WaveletMatrix::quantile_u64
1164 #[must_use]
1165 pub fn quantile_u64_unchecked(&self, range: Range<usize>, k: usize) -> u64 {
1166 self.partial_quantile_search_u64_unchecked(range, k, 0, 0)
1167 }
1168
1169 /// Internal function to reuse the quantile code for the predecessor and successor search.
1170 /// This function performs the quantile search starting at the given level with the given prefix.
1171 ///
1172 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1173 /// and that the prefix is a valid prefix for the given level (i.e. the prefix is not shifted
1174 /// to another level).
1175 #[inline(always)]
1176 fn partial_quantile_search_u64_unchecked(
1177 &self,
1178 mut range: Range<usize>,
1179 mut k: usize,
1180 start_level: usize,
1181 mut prefix: u64,
1182 ) -> u64 {
1183 debug_assert!(self.bits_per_element() <= 64);
1184 debug_assert!(!range.is_empty());
1185 debug_assert!(range.end <= self.len());
1186
1187 for data in self.data.iter().skip(start_level) {
1188 prefix <<= 1;
1189 let zeros_start = data.rank0(range.start);
1190 let zeros_end = data.rank0(range.end);
1191 let zeros = zeros_end - zeros_start;
1192
1193 if k < zeros {
1194 range.start = zeros_start;
1195 range.end = zeros_end;
1196 } else {
1197 prefix |= 1;
1198 k -= zeros;
1199 range.start = data.rank0 + (range.start - zeros_start);
1200 range.end = data.rank0 + (range.end - zeros_end);
1201 }
1202 }
1203
1204 prefix
1205 }
1206
1207 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1208 /// where `k = 0` returns the smallest element.
1209 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1210 /// The `k`-th smallest element is returned as a `u64` numeral.
1211 ///
1212 /// Returns `None` if the `range` is out of bounds, or if the number of bits per element exceeds 64,
1213 /// or if `k` is greater than the size of the range.
1214 ///
1215 /// # Example
1216 /// ```
1217 /// use vers_vecs::{BitVec, WaveletMatrix};
1218 ///
1219 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1220 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1221 ///
1222 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 0), Some(1));
1223 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 1), Some(4));
1224 /// assert_eq!(wavelet_matrix.quantile_u64(1..4, 0), Some(1));
1225 /// ```
1226 #[must_use]
1227 pub fn quantile_u64(&self, range: Range<usize>, k: usize) -> Option<u64> {
1228 if range.start >= self.len()
1229 || range.end > self.len()
1230 || self.bits_per_element() > 64
1231 || k >= range.end - range.start
1232 {
1233 None
1234 } else {
1235 Some(self.quantile_u64_unchecked(range, k))
1236 }
1237 }
1238
1239 /// Get the `i`-th smallest element in the wavelet matrix.
1240 /// The `i`-th smallest element is returned as a u64 numeral.
1241 ///
1242 /// If the number of bits per element exceeds 64, the value is truncated.
1243 ///
1244 /// This method does not perform bounds checking.
1245 /// Use [`get_sorted_u64`] for a checked version.
1246 ///
1247 /// # Panics
1248 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1249 ///
1250 /// [`get_sorted_u64`]: WaveletMatrix::get_sorted_u64
1251 #[must_use]
1252 pub fn get_sorted_u64_unchecked(&self, i: usize) -> u64 {
1253 self.quantile_u64_unchecked(0..self.len(), i)
1254 }
1255
1256 /// Get the `i`-th smallest element in the entire wavelet matrix.
1257 /// The `i`-th smallest element is returned as a u64 numeral.
1258 ///
1259 /// Returns `None` if the `i` is out of bounds, or if the number of bits per element exceeds 64.
1260 ///
1261 /// # Example
1262 /// ```
1263 /// use vers_vecs::{BitVec, WaveletMatrix};
1264 ///
1265 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1266 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1267 ///
1268 /// assert_eq!(wavelet_matrix.get_sorted_u64(0), Some(1));
1269 /// assert_eq!(wavelet_matrix.get_sorted_u64(1), Some(1));
1270 /// assert_eq!(wavelet_matrix.get_sorted_u64(2), Some(2));
1271 /// ```
1272 #[must_use]
1273 pub fn get_sorted_u64(&self, i: usize) -> Option<u64> {
1274 if i >= self.len() || self.bits_per_element() > 64 {
1275 None
1276 } else {
1277 Some(self.get_sorted_u64_unchecked(i))
1278 }
1279 }
1280
1281 /// Get the smallest element in the encoded sequence in the specified `range`.
1282 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1283 /// The smallest element is returned as a `BitVec`,
1284 ///
1285 /// This method does not perform bounds checking.
1286 /// Use [`range_min`] for a checked version.
1287 ///
1288 /// # Panics
1289 /// May panic if the `range` is out of bounds or if the range is empty.
1290 /// May instead return an empty bit vector.
1291 ///
1292 /// [`range_min`]: WaveletMatrix::range_min
1293 #[must_use]
1294 pub fn range_min_unchecked(&self, range: Range<usize>) -> BitVec {
1295 self.quantile_unchecked(range, 0)
1296 }
1297
1298 /// Get the smallest element in the encoded sequence in the specified `range`.
1299 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1300 /// The smallest element is returned as a `BitVec`,
1301 ///
1302 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1303 ///
1304 /// # Example
1305 /// ```
1306 /// use vers_vecs::{BitVec, WaveletMatrix};
1307 ///
1308 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1309 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1310 ///
1311 /// assert_eq!(wavelet_matrix.range_min(0..3), Some(BitVec::pack_sequence_u8(&[1], 3)));
1312 /// assert_eq!(wavelet_matrix.range_min(1..4), Some(BitVec::pack_sequence_u8(&[1], 3)));
1313 /// assert_eq!(wavelet_matrix.range_min(1..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1314 /// ```
1315 #[must_use]
1316 pub fn range_min(&self, range: Range<usize>) -> Option<BitVec> {
1317 self.quantile(range, 0)
1318 }
1319
1320 /// Get the smallest element in the encoded sequence in the specified `range`.
1321 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1322 /// The smallest element is returned as a `u64` numeral.
1323 /// If the number of bits per element exceeds 64, the value is truncated.
1324 ///
1325 /// This method does not perform bounds checking.
1326 /// Use [`range_min_u64`] for a checked version.
1327 ///
1328 /// # Panics
1329 /// May panic if the `range` is out of bounds or if the range is empty.
1330 /// May instead return 0.
1331 ///
1332 /// [`range_min_u64`]: WaveletMatrix::range_min_u64
1333 #[must_use]
1334 pub fn range_min_u64_unchecked(&self, range: Range<usize>) -> u64 {
1335 self.quantile_u64_unchecked(range, 0)
1336 }
1337
1338 /// Get the smallest element in the encoded sequence in the specified `range`.
1339 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1340 /// The smallest element is returned as a `u64` numeral.
1341 ///
1342 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1343 /// per element exceeds 64.
1344 ///
1345 /// # Example
1346 /// ```
1347 /// use vers_vecs::{BitVec, WaveletMatrix};
1348 ///
1349 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1350 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1351 ///
1352 /// assert_eq!(wavelet_matrix.range_min_u64(0..3), Some(1));
1353 /// assert_eq!(wavelet_matrix.range_min_u64(1..4), Some(1));
1354 /// assert_eq!(wavelet_matrix.range_min_u64(1..3), Some(4));
1355 /// ```
1356 #[must_use]
1357 pub fn range_min_u64(&self, range: Range<usize>) -> Option<u64> {
1358 self.quantile_u64(range, 0)
1359 }
1360
1361 /// Get the largest element in the encoded sequence in the specified `range`.
1362 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1363 /// The largest element is returned as a `BitVec`,
1364 /// where the least significant bit is the first element.
1365 ///
1366 /// This method does not perform bounds checking.
1367 /// Use [`range_max`] for a checked version.
1368 ///
1369 /// # Panics
1370 /// May panic if the `range` is out of bounds or if the range is empty.
1371 /// May instead return an empty bit vector.
1372 ///
1373 /// [`range_max`]: WaveletMatrix::range_max
1374 #[must_use]
1375 pub fn range_max_unchecked(&self, range: Range<usize>) -> BitVec {
1376 let k = range.end - range.start - 1;
1377 self.quantile_unchecked(range, k)
1378 }
1379
1380 /// Get the largest element in the encoded sequence in the specified `range`.
1381 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1382 /// The largest element is returned as a `BitVec`,
1383 /// where the least significant bit is the first element.
1384 ///
1385 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1386 ///
1387 /// # Example
1388 /// ```
1389 /// use vers_vecs::{BitVec, WaveletMatrix};
1390 ///
1391 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1392 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1393 ///
1394 /// assert_eq!(wavelet_matrix.range_max(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1395 /// assert_eq!(wavelet_matrix.range_max(3..6), Some(BitVec::pack_sequence_u8(&[7], 3)));
1396 /// ```
1397 #[must_use]
1398 pub fn range_max(&self, range: Range<usize>) -> Option<BitVec> {
1399 if range.is_empty() {
1400 None
1401 } else {
1402 let k = range.end - range.start - 1;
1403 self.quantile(range, k)
1404 }
1405 }
1406
1407 /// Get the largest element in the encoded sequence in the specified `range`.
1408 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1409 /// The largest element is returned as a `u64` numeral.
1410 /// If the number of bits per element exceeds 64, the value is truncated.
1411 ///
1412 /// This method does not perform bounds checking.
1413 /// Use [`range_max_u64`] for a checked version.
1414 ///
1415 /// # Panics
1416 /// May panic if the `range` is out of bounds or if the range is empty.
1417 /// May instead return 0.
1418 ///
1419 /// [`range_max_u64`]: WaveletMatrix::range_max_u64
1420 #[must_use]
1421 pub fn range_max_u64_unchecked(&self, range: Range<usize>) -> u64 {
1422 let k = range.end - range.start - 1;
1423 self.quantile_u64_unchecked(range, k)
1424 }
1425
1426 /// Get the largest element in the encoded sequence in the specified `range`.
1427 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1428 /// The largest element is returned as a `u64` numeral.
1429 ///
1430 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1431 /// per element exceeds 64.
1432 ///
1433 /// # Example
1434 /// ```
1435 /// use vers_vecs::{BitVec, WaveletMatrix};
1436 ///
1437 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1438 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1439 ///
1440 /// assert_eq!(wavelet_matrix.range_max_u64(0..3), Some(4));
1441 /// assert_eq!(wavelet_matrix.range_max_u64(3..6), Some(7));
1442 /// ```
1443 #[must_use]
1444 pub fn range_max_u64(&self, range: Range<usize>) -> Option<u64> {
1445 if range.is_empty() {
1446 None
1447 } else {
1448 let k = range.end - range.start - 1;
1449 self.quantile_u64(range, k)
1450 }
1451 }
1452
1453 /// Get the median element in the encoded sequence in the specified `range`.
1454 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1455 /// The median element is returned as a `BitVec`,
1456 /// where the least significant bit is the first element.
1457 ///
1458 /// If the range does not contain an odd number of elements, the position is rounded down.
1459 ///
1460 /// This method does not perform bounds checking.
1461 /// Use [`range_median`] for a checked version.
1462 ///
1463 /// # Panics
1464 /// May panic if the `range` is out of bounds or if the range is empty.
1465 /// May instead return an empty bit vector.
1466 ///
1467 /// [`range_median`]: WaveletMatrix::range_median
1468 #[must_use]
1469 pub fn range_median_unchecked(&self, range: Range<usize>) -> BitVec {
1470 let k = (range.end - 1 - range.start) / 2;
1471 self.quantile_unchecked(range, k)
1472 }
1473
1474 /// Get the median element in the encoded sequence in the specified `range`.
1475 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1476 /// The median element is returned as a `BitVec`,
1477 /// where the least significant bit is the first element.
1478 ///
1479 /// If the range does not contain an odd number of elements, the position is rounded down.
1480 ///
1481 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1482 ///
1483 /// # Example
1484 /// ```
1485 /// use vers_vecs::{BitVec, WaveletMatrix};
1486 ///
1487 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1488 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1489 ///
1490 /// assert_eq!(wavelet_matrix.range_median(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1491 /// assert_eq!(wavelet_matrix.range_median(1..4), Some(BitVec::pack_sequence_u8(&[4], 3)));
1492 /// assert_eq!(wavelet_matrix.range_median(0..6), Some(BitVec::pack_sequence_u8(&[2], 3)));
1493 /// ```
1494 #[must_use]
1495 pub fn range_median(&self, range: Range<usize>) -> Option<BitVec> {
1496 if range.is_empty() {
1497 None
1498 } else {
1499 let k = (range.end - 1 - range.start) / 2;
1500 self.quantile(range, k)
1501 }
1502 }
1503
1504 /// Get the median element in the encoded sequence in the specified `range`.
1505 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1506 /// The median element is returned as a `u64` numeral.
1507 /// If the number of bits per element exceeds 64, the value is truncated.
1508 ///
1509 /// If the range does not contain an odd number of elements, the position is rounded down.
1510 ///
1511 /// This method does not perform bounds checking.
1512 /// Use [`range_median_u64`] for a checked version.
1513 ///
1514 /// # Panics
1515 /// May panic if the `range` is out of bounds or if the range is empty.
1516 /// May instead return 0.
1517 ///
1518 /// [`range_median_u64`]: WaveletMatrix::range_median_u64
1519 #[must_use]
1520 pub fn range_median_u64_unchecked(&self, range: Range<usize>) -> u64 {
1521 let k = (range.end - 1 - range.start) / 2;
1522 self.quantile_u64_unchecked(range, k)
1523 }
1524
1525 /// Get the median element in the encoded sequence in the specified `range`.
1526 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1527 /// The median element is returned as a `u64` numeral.
1528 ///
1529 /// If the range does not contain an odd number of elements, the position is rounded down.
1530 ///
1531 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1532 /// per element exceeds 64.
1533 ///
1534 /// # Example
1535 /// ```
1536 /// use vers_vecs::{BitVec, WaveletMatrix};
1537 ///
1538 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1539 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1540 ///
1541 /// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
1542 /// assert_eq!(wavelet_matrix.range_median_u64(1..4), Some(4));
1543 /// assert_eq!(wavelet_matrix.range_median_u64(0..6), Some(2));
1544 /// ```
1545 #[must_use]
1546 pub fn range_median_u64(&self, range: Range<usize>) -> Option<u64> {
1547 if range.is_empty() || self.bits_per_element() > 64 || range.end > self.len() {
1548 None
1549 } else {
1550 let k = (range.end - 1 - range.start) / 2;
1551 self.quantile_u64(range, k)
1552 }
1553 }
1554
1555 /// Get the predecessor of the given `symbol` in the given `range`.
1556 /// This is a private generic helper function to implement the public `predecessor` functions.
1557 ///
1558 /// The read and write access to the query and result values are abstracted by the `Reader` and `Writer` closures.
1559 #[inline(always)] // even though the function is pretty large, inlining probably gets rid of the closure calls in favor of static calls
1560 fn predecessor_generic_unchecked<
1561 T: Clone,
1562 Reader: Fn(usize, &T) -> u64,
1563 Writer: Fn(u64, usize, &mut T),
1564 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1565 >(
1566 &self,
1567 mut range: Range<usize>,
1568 symbol: &T,
1569 mut result_value: T,
1570 bit_reader: Reader,
1571 result_writer: Writer,
1572 quantile_search: Quantile,
1573 ) -> Option<T> {
1574 // the bit-prefix at the last node where we could go to an interval with smaller elements,
1575 // i.e. where we need to go if the current prefix has no elements smaller than the query
1576 let mut last_smaller_prefix = result_value.clone();
1577 // the level of the last node where we could go to an interval with smaller elements
1578 let mut last_one_level: Option<usize> = None;
1579 // the range of the last node where we could go to an interval with smaller elements
1580 let mut next_smaller_range: Option<Range<usize>> = None;
1581
1582 for (level, data) in self.data.iter().enumerate() {
1583 let query_bit = bit_reader(level, symbol);
1584
1585 // read the amount of elements with the current result-prefix plus one 0 bit.
1586 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1587 // plus one 1 from there
1588 let zeros_start = data.rank0(range.start);
1589 let zeros_end = data.rank0(range.end);
1590 let elements_zero = zeros_end - zeros_start;
1591
1592 if query_bit == 0 {
1593 if elements_zero == 0 {
1594 // if our query bit is zero in this level and suddenly our new interval is empty,
1595 // all elements that were in the previous interval are bigger than the query element,
1596 // because they all have a 1 in the current level.
1597 // this means the predecessor is the largest element in the last smaller interval,
1598 // i.e. the interval that has a 0 bit at the last level where our prefix had a 1 bit.
1599
1600 return next_smaller_range.map(|r| {
1601 let idx = r.end - r.start - 1;
1602 result_writer(0, last_one_level.unwrap(), &mut last_smaller_prefix);
1603 quantile_search(
1604 self,
1605 r,
1606 idx,
1607 last_one_level.unwrap() + 1,
1608 last_smaller_prefix,
1609 )
1610 });
1611 }
1612
1613 // update the prefix
1614 result_writer(0, level, &mut result_value);
1615
1616 // update the range to the interval of the new prefix
1617 range.start = zeros_start;
1618 range.end = zeros_end;
1619 } else {
1620 if elements_zero == range.end - range.start {
1621 // if our query element is 1 in this level and suddenly our new interval is empty,
1622 // all elements that were in the previous interval are smaller than the query element,
1623 // because they all have a 0 in the current level.
1624 // this means the predecessor is the largest element in the last level's interval.
1625
1626 let idx = range.end - range.start - 1;
1627 return Some(quantile_search(self, range, idx, level, result_value));
1628 }
1629
1630 // if the other interval is not empty, we update the last smaller interval to the last interval where we can switch to a 0 prefix
1631 if !(zeros_start..zeros_end).is_empty() {
1632 last_one_level = Some(level);
1633 next_smaller_range = Some(zeros_start..zeros_end);
1634 last_smaller_prefix = result_value.clone();
1635 }
1636
1637 // update the prefix
1638 result_writer(1, level, &mut result_value);
1639
1640 // update the range to the interval of the new prefix
1641 range.start = data.rank0 + (range.start - zeros_start);
1642 range.end = data.rank0 + (range.end - zeros_end);
1643 }
1644 }
1645
1646 Some(result_value)
1647 }
1648
1649 /// Get the predecessor of the given `symbol` in the given `range`.
1650 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1651 /// where the least significant bit is the first element, and `k` is the number of bits per element
1652 /// in the wavelet matrix.
1653 /// The `symbol` does not have to be in the encoded sequence.
1654 /// The predecessor is the largest element in the `range` that is smaller than or equal
1655 /// to the `symbol`.
1656 ///
1657 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1658 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1659 /// or if the `symbol` is smaller than all elements in the range.
1660 ///
1661 /// # Example
1662 /// ```
1663 /// use vers_vecs::{BitVec, WaveletMatrix};
1664 ///
1665 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1666 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1667 ///
1668 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1669 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1670 /// assert_eq!(wavelet_matrix.predecessor(0..6, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[7], 3)));
1671 /// ```
1672 ///
1673 /// [`BitVec`]: BitVec
1674 #[must_use]
1675 pub fn predecessor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1676 if symbol.len() != self.bits_per_element()
1677 || range.is_empty()
1678 || self.is_empty()
1679 || range.end > self.len()
1680 {
1681 return None;
1682 }
1683
1684 self.predecessor_generic_unchecked(
1685 range,
1686 symbol,
1687 BitVec::from_zeros(self.bits_per_element()),
1688 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1689 |bit, level, result| {
1690 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1691 },
1692 Self::partial_quantile_search_unchecked,
1693 )
1694 }
1695
1696 /// Get the predecessor of the given `symbol` in the given `range`.
1697 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1698 /// where k is less than or equal to 64.
1699 /// The `symbol` does not have to be in the encoded sequence.
1700 /// The predecessor is the largest element in the `range` that is smaller than or equal
1701 /// to the `symbol`.
1702 ///
1703 /// Returns `None` if the number of bits in the matrix is greater than 64,
1704 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1705 /// or if the `symbol` is smaller than all elements in the range.
1706 ///
1707 /// # Example
1708 /// ```
1709 /// use vers_vecs::{BitVec, WaveletMatrix};
1710 ///
1711 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1712 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1713 ///
1714 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 7), Some(4));
1715 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 4), Some(4));
1716 /// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 7), Some(7));
1717 /// ```
1718 #[must_use]
1719 pub fn predecessor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1720 if self.bits_per_element() > 64
1721 || range.is_empty()
1722 || self.is_empty()
1723 || range.end > self.len()
1724 {
1725 return None;
1726 }
1727
1728 self.predecessor_generic_unchecked(
1729 range,
1730 &symbol,
1731 0,
1732 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1733 |bit, _level, result| {
1734 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1735 // we have to do that, because the quantile_search_u64 does the same.
1736 *result <<= 1;
1737 *result |= bit;
1738 },
1739 Self::partial_quantile_search_u64_unchecked,
1740 )
1741 }
1742
1743 #[inline(always)]
1744 fn successor_generic_unchecked<
1745 T: Clone,
1746 Reader: Fn(usize, &T) -> u64,
1747 Writer: Fn(u64, usize, &mut T),
1748 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1749 >(
1750 &self,
1751 mut range: Range<usize>,
1752 symbol: &T,
1753 mut result_value: T,
1754 bit_reader: Reader,
1755 result_writer: Writer,
1756 quantile_search: Quantile,
1757 ) -> Option<T> {
1758 // the bit-prefix at the last node where we could go to an interval with larger elements,
1759 // i.e. where we need to go if the current prefix has no elements larger than the query
1760 let mut last_larger_prefix = result_value.clone();
1761 // the level of the last node where we could go to an interval with larger elements
1762 let mut last_zero_level: Option<usize> = None;
1763 // the range of the last node where we could go to an interval with larger elements
1764 let mut next_larger_range: Option<Range<usize>> = None;
1765
1766 for (level, data) in self.data.iter().enumerate() {
1767 let query_bit = bit_reader(level, symbol);
1768
1769 // read the amount of elements with the current result-prefix plus one 0 bit.
1770 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1771 // plus one 1 from there
1772 let zeros_start = data.rank0(range.start);
1773 let zeros_end = data.rank0(range.end);
1774 let elements_zero = zeros_end - zeros_start;
1775
1776 if query_bit == 0 {
1777 if elements_zero == 0 {
1778 // if our query element is 0 in this level and suddenly our new interval is empty,
1779 // all elements that were in the previous interval are larger than the query element,
1780 // because they all have a 1 in the current level.
1781 // this means the successor is the smallest element in the last level's interval.
1782
1783 return Some(quantile_search(self, range, 0, level, result_value));
1784 }
1785
1786 // if the other interval is not empty, we update the last interval where we can switch to a prefix with a 1
1787 if !(data.rank0 + (range.start - zeros_start)..data.rank0 + (range.end - zeros_end))
1788 .is_empty()
1789 {
1790 last_zero_level = Some(level);
1791 next_larger_range = Some(
1792 data.rank0 + (range.start - zeros_start)
1793 ..data.rank0 + (range.end - zeros_end),
1794 );
1795 last_larger_prefix = result_value.clone();
1796 }
1797
1798 // update the prefix
1799 result_writer(0, level, &mut result_value);
1800
1801 // update the range to the interval of the new prefix
1802 range.start = zeros_start;
1803 range.end = zeros_end;
1804 } else {
1805 if elements_zero == range.end - range.start {
1806 // if our query bit is 1 in this level and suddenly our new interval is empty,
1807 // all elements that were in the previous interval are smaller than the query element,
1808 // because they all have a 0 in the current level.
1809 // this means the successor is the smallest element in the last interval with a larger prefix,
1810 // i.e. the interval that has a 1 bit at the last level where our prefix had a 0 bit.
1811
1812 return next_larger_range.map(|r| {
1813 result_writer(1, last_zero_level.unwrap(), &mut last_larger_prefix);
1814 quantile_search(
1815 self,
1816 r,
1817 0,
1818 last_zero_level.unwrap() + 1,
1819 last_larger_prefix,
1820 )
1821 });
1822 }
1823
1824 // update the prefix
1825 result_writer(1, level, &mut result_value);
1826
1827 // update the range to the interval of the new prefix
1828 range.start = data.rank0 + (range.start - zeros_start);
1829 range.end = data.rank0 + (range.end - zeros_end);
1830 }
1831 }
1832
1833 Some(result_value)
1834 }
1835
1836 /// Get the successor of the given `symbol` in the given range.
1837 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1838 /// where the least significant bit is the first element, and `k` is the number of bits per element
1839 /// in the wavelet matrix.
1840 /// The `symbol` does not have to be in the encoded sequence.
1841 /// The successor is the smallest element in the range that is greater than or equal
1842 /// to the `symbol`.
1843 ///
1844 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1845 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1846 /// or if the `symbol` is greater than all elements in the range.
1847 ///
1848 /// # Example
1849 /// ```
1850 /// use vers_vecs::{BitVec, WaveletMatrix};
1851 ///
1852 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1853 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1854 ///
1855 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1856 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[5], 3)), None);
1857 /// assert_eq!(wavelet_matrix.successor(0..6, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[2], 3)));
1858 /// ```
1859 ///
1860 /// [`BitVec`]: BitVec
1861 #[must_use]
1862 pub fn successor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1863 if symbol.len() != self.bits_per_element()
1864 || range.is_empty()
1865 || self.is_empty()
1866 || range.end > self.len()
1867 {
1868 return None;
1869 }
1870
1871 self.successor_generic_unchecked(
1872 range,
1873 symbol,
1874 BitVec::from_zeros(self.bits_per_element()),
1875 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1876 |bit, level, result| {
1877 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1878 },
1879 Self::partial_quantile_search_unchecked,
1880 )
1881 }
1882
1883 /// Get the successor of the given `symbol` in the given range.
1884 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1885 /// where k is less than or equal to 64.
1886 /// The `symbol` does not have to be in the encoded sequence.
1887 /// The successor is the smallest element in the range that is greater than or equal
1888 /// to the `symbol`.
1889 ///
1890 /// Returns `None` if the number of bits in the matrix is greater than 64,
1891 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1892 /// or if the `symbol` is greater than all elements in the range.
1893 ///
1894 /// # Example
1895 /// ```
1896 /// use vers_vecs::{BitVec, WaveletMatrix};
1897 ///
1898 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1899 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1900 ///
1901 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 2), Some(4));
1902 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 5), None);
1903 /// assert_eq!(wavelet_matrix.successor_u64(0..6, 2), Some(2));
1904 /// ```
1905 #[must_use]
1906 pub fn successor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1907 if self.bits_per_element() > 64
1908 || range.is_empty()
1909 || self.is_empty()
1910 || range.end > self.len()
1911 {
1912 return None;
1913 }
1914
1915 self.successor_generic_unchecked(
1916 range,
1917 &symbol,
1918 0,
1919 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1920 |bit, _level, result| {
1921 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1922 // we have to do that, because the quantile_search_u64 does the same.
1923 *result <<= 1;
1924 *result |= bit;
1925 },
1926 Self::partial_quantile_search_u64_unchecked,
1927 )
1928 }
1929
1930 /// Get an iterator over the elements of the encoded sequence.
1931 /// The iterator yields `u64` elements.
1932 /// If the number of bits per element exceeds 64, `None` is returned.
1933 ///
1934 /// # Example
1935 /// ```
1936 /// use vers_vecs::{BitVec, WaveletMatrix};
1937 ///
1938 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1939 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1940 ///
1941 /// let mut iter = wavelet_matrix.iter_u64().unwrap();
1942 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 4, 4, 1, 2, 7]);
1943 /// ```
1944 #[must_use]
1945 pub fn iter_u64(&self) -> Option<WaveletNumRefIter> {
1946 if self.bits_per_element() > 64 {
1947 None
1948 } else {
1949 Some(WaveletNumRefIter::new(self))
1950 }
1951 }
1952
1953 /// Turn the encoded sequence into an iterator.
1954 /// The iterator yields `u64` elements.
1955 /// If the number of bits per element exceeds 64, `None` is returned.
1956 #[must_use]
1957 pub fn into_iter_u64(self) -> Option<WaveletNumIter> {
1958 if self.bits_per_element() > 64 {
1959 None
1960 } else {
1961 Some(WaveletNumIter::new(self))
1962 }
1963 }
1964
1965 /// Get an iterator over the sorted elements of the encoded sequence.
1966 /// The iterator yields `BitVec` elements.
1967 ///
1968 /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements.
1969 #[must_use]
1970 pub fn iter_sorted(&self) -> WaveletSortedRefIter {
1971 WaveletSortedRefIter::new(self)
1972 }
1973
1974 /// Turn the encoded sequence into an iterator over the sorted sequence.
1975 /// The iterator yields `BitVec` elements.
1976 #[must_use]
1977 pub fn into_iter_sorted(self) -> WaveletSortedIter {
1978 WaveletSortedIter::new(self)
1979 }
1980
1981 /// Get an iterator over the sorted elements of the encoded sequence.
1982 /// The iterator yields `u64` elements.
1983 /// If the number of bits per element exceeds 64, `None` is returned.
1984 ///
1985 /// # Example
1986 /// ```
1987 /// use vers_vecs::{BitVec, WaveletMatrix};
1988 ///
1989 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1990 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1991 ///
1992 /// let mut iter = wavelet_matrix.iter_sorted_u64().unwrap();
1993 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 1, 2, 4, 4, 7]);
1994 /// ```
1995 #[must_use]
1996 pub fn iter_sorted_u64(&self) -> Option<WaveletSortedNumRefIter> {
1997 if self.bits_per_element() > 64 {
1998 None
1999 } else {
2000 Some(WaveletSortedNumRefIter::new(self))
2001 }
2002 }
2003
2004 /// Turn the encoded sequence into an iterator over the sorted sequence.
2005 /// The iterator yields `u64` elements.
2006 /// If the number of bits per element exceeds 64, `None` is returned.
2007 #[must_use]
2008 pub fn into_iter_sorted_u64(self) -> Option<WaveletSortedNumIter> {
2009 if self.bits_per_element() > 64 {
2010 None
2011 } else {
2012 Some(WaveletSortedNumIter::new(self))
2013 }
2014 }
2015
2016 /// Get the number of bits per element in the alphabet of the encoded sequence.
2017 #[must_use]
2018 #[inline(always)]
2019 pub fn bits_per_element(&self) -> usize {
2020 self.data.len()
2021 }
2022
2023 /// Get the number of bits per element in the alphabet of the encoded sequence.
2024 #[must_use]
2025 #[deprecated(since = "1.5.1", note = "please use `bits_per_element` instead")]
2026 #[allow(clippy::cast_possible_truncation)]
2027 pub fn bit_len(&self) -> u16 {
2028 self.bits_per_element() as u16
2029 }
2030
2031 /// Get the number of elements stored in the encoded sequence.
2032 #[must_use]
2033 pub fn len(&self) -> usize {
2034 if self.data.is_empty() {
2035 0
2036 } else {
2037 self.data[0].len()
2038 }
2039 }
2040
2041 /// Check if the wavelet matrix is empty.
2042 #[must_use]
2043 pub fn is_empty(&self) -> bool {
2044 self.len() == 0
2045 }
2046
2047 /// Get the number of bytes allocated on the heap for the wavelet matrix.
2048 /// This does not include memory that is allocated but unused due to allocation policies of
2049 /// internal data structures.
2050 #[must_use]
2051 pub fn heap_size(&self) -> usize {
2052 self.data.iter().map(RsVec::heap_size).sum::<usize>()
2053 }
2054}
2055
2056impl_vector_iterator!(
2057 WaveletMatrix,
2058 WaveletIter,
2059 WaveletRefIter,
2060 get_value_unchecked,
2061 get_value,
2062 BitVec
2063);
2064
2065impl_vector_iterator!(
2066 WaveletMatrix,
2067 WaveletNumIter,
2068 WaveletNumRefIter,
2069 get_u64_unchecked,
2070 get_u64,
2071 u64,
2072 special
2073);
2074
2075impl_vector_iterator!(
2076 WaveletMatrix,
2077 WaveletSortedIter,
2078 WaveletSortedRefIter,
2079 get_sorted_unchecked,
2080 get_sorted,
2081 BitVec,
2082 special
2083);
2084
2085impl_vector_iterator!(
2086 WaveletMatrix,
2087 WaveletSortedNumIter,
2088 WaveletSortedNumRefIter,
2089 get_sorted_u64_unchecked,
2090 get_sorted_u64,
2091 u64,
2092 special
2093);
2094
2095#[cfg(test)]
2096mod tests;