vers_vecs/wavelet/mod.rs
1//! This module contains the implementation of a [wavelet matrix].
2//! The wavelet matrix is a data structure that encodes a sequence of `n` `k`-bit words
3//! into a matrix of bit vectors, allowing for fast rank and select queries on the encoded sequence.
4//!
5//! This implementation further supports quantile queries, and range-predecessor and -successor queries.
6//! All operations are `O(k)` time complexity.
7//!
8//! See the struct documentation for more information.
9//!
10//! [wavelet matrix]: WaveletMatrix
11
12use crate::util::impl_vector_iterator;
13use crate::{BitVec, RsVec};
14use std::mem;
15use std::ops::Range;
16
17/// A wavelet matrix implementation implemented as described in
18/// [Navarro and Claude, 2021](http://dx.doi.org/10.1007/978-3-642-34109-0_18).
19/// The implementation is designed to allow for extremely large alphabet sizes, without
20/// sacrificing performance for small alphabets.
21///
22/// There are two constructor algorithms available:
23/// - [`from_bit_vec`] and [`from_slice`] construct the wavelet matrix by repeatedly sorting the elements.
24/// These constructors have linear space overhead and run in `O(kn * log n)` time complexity.
25/// - [`from_bit_vec_pc`] and [`from_slice_pc`] construct the wavelet matrix by counting the
26/// prefixes of the elements. These constructors have a space complexity of `O(2^k)` and run
27/// in `O(kn)`, which makes this constructor preferable for large sequences over small alphabets.
28///
29/// They encode a sequence of `n` `k`-bit words into a wavelet matrix which supports constant-time
30/// rank and select queries on elements of its `k`-bit alphabet.
31/// All query functions are mirrored for both `BitVec` and `u64` query elements, so
32/// if `k <= 64`, no heap allocation is needed for the query element.
33///
34/// Other than rank and select queries, the matrix also supports quantile queries (range select i), and
35/// range-predecessor and -successor queries, all of which are loosely based on
36/// [Külekci and Thankachan](https://doi.org/10.1016/j.jda.2017.01.002) with better time complexity.
37///
38/// All operations implemented on the matrix are `O(k)` time complexity.
39/// The space complexity of the wavelet matrix is `O(n * k)` with a small linear overhead
40/// (see [`RsVec`]).
41///
42/// # Examples
43/// ```
44/// use vers_vecs::{BitVec, WaveletMatrix};
45///
46/// // pack elements from a 3-bit alphabet into a bit vector and construct a wavelet matrix from them
47/// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
48/// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
49///
50/// // query the wavelet matrix
51/// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
52/// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
53///
54// // rank and select queries
55/// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
56/// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
57/// assert_eq!(wavelet_matrix.select_u64(0, 7), Some(5));
58///
59/// // statistics
60/// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
61/// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 3), Some(2));
62/// ```
63///
64/// [`RsVec`]: RsVec
65/// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
66/// [`from_slice`]: WaveletMatrix::from_slice
67/// [`from_bit_vec_pc`]: WaveletMatrix::from_bit_vec_pc
68/// [`from_slice_pc`]: WaveletMatrix::from_slice_pc
69#[derive(Clone, Debug)]
70#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
71pub struct WaveletMatrix {
72 data: Box<[RsVec]>,
73}
74
75impl WaveletMatrix {
76 /// Generic constructor that constructs the wavelet matrix by repeatedly sorting the elements.
77 /// The runtime complexity is `O(kn * log n)`.
78 ///
79 /// # Parameters
80 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 1 << 16.
81 /// - `num_elements`: The number of elements in the sequence.
82 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
83 #[inline(always)] // should get rid of closures in favor of static calls
84 fn permutation_sorting<LOOKUP: Fn(usize, usize) -> u64>(
85 bits_per_element: u16,
86 num_elements: usize,
87 bit_lookup: LOOKUP,
88 ) -> Self {
89 let element_len = bits_per_element as usize;
90
91 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
92
93 // insert the first bit of each word into the first bit vector
94 // for each following level, insert the next bit of each word into the next bit vector
95 // sorted stably by the previous bit vector
96 let mut permutation = (0..num_elements).collect::<Vec<_>>();
97 let mut next_permutation = vec![0; num_elements];
98
99 for (level, data) in data.iter_mut().enumerate() {
100 let mut total_zeros = 0;
101 for (i, p) in permutation.iter().enumerate() {
102 if bit_lookup(*p, element_len - level - 1) == 0 {
103 total_zeros += 1;
104 } else {
105 data.set(i, 1).unwrap();
106 }
107 }
108
109 // scan through the generated bit array and move the elements to the correct position
110 // for the next permutation
111 if level < element_len - 1 {
112 let mut zero_boundary = 0;
113 let mut one_boundary = total_zeros;
114 for (i, p) in permutation.iter().enumerate() {
115 if data.get_unchecked(i) == 0 {
116 next_permutation[zero_boundary] = *p;
117 zero_boundary += 1;
118 } else {
119 next_permutation[one_boundary] = *p;
120 one_boundary += 1;
121 }
122 }
123
124 mem::swap(&mut permutation, &mut next_permutation);
125 }
126 }
127
128 Self {
129 data: data.into_iter().map(BitVec::into).collect(),
130 }
131 }
132
133 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
134 /// The constructor runs in `O(kn * log n)` time complexity.
135 ///
136 /// # Parameters
137 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
138 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
139 /// significant bit.
140 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
141 ///
142 /// # Panics
143 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element.
144 #[must_use]
145 pub fn from_bit_vec(bit_vec: &BitVec, bits_per_element: u16) -> Self {
146 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
147 let num_elements = bit_vec.len() / bits_per_element as usize;
148 Self::permutation_sorting(bits_per_element, num_elements, |element, bit| {
149 bit_vec.get_unchecked(element * bits_per_element as usize + bit)
150 })
151 }
152
153 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
154 /// The constructor runs in `O(kn * log n)` time complexity.
155 ///
156 /// # Parameters
157 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
158 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
159 ///
160 /// # Panics
161 /// Panics if the number of bits per element exceeds 64.
162 #[must_use]
163 pub fn from_slice(sequence: &[u64], bits_per_element: u16) -> Self {
164 assert!(
165 bits_per_element <= 64,
166 "The number of bits per element cannot exceed 64."
167 );
168 Self::permutation_sorting(bits_per_element, sequence.len(), |element, bit| {
169 (sequence[element] >> bit) & 1
170 })
171 }
172
173 /// Generic constructor that constructs the wavelet matrix by counting the prefixes of the elements.
174 /// The runtime complexity is `O(kn)`.
175 /// This constructor is only recommended for small alphabets.
176 ///
177 /// # Parameters
178 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 64.
179 /// - `num_elements`: The number of elements in the sequence.
180 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
181 /// - `element_lookup`: A closure that returns the `element`-th word.
182 #[inline(always)] // should get rid of closures in favor of static calls
183 fn prefix_counting<LOOKUP: Fn(usize, usize) -> u64, ELEMENT: Fn(usize) -> u64>(
184 bits_per_element: u16,
185 num_elements: usize,
186 bit_lookup: LOOKUP,
187 element_lookup: ELEMENT,
188 ) -> Self {
189 let element_len = bits_per_element as usize;
190 let mut histogram = vec![0usize; 1 << bits_per_element];
191 let mut borders = vec![0usize; 1 << bits_per_element];
192 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
193
194 for i in 0..num_elements {
195 histogram[element_lookup(i) as usize] += 1;
196 data[0].set_unchecked(i, bit_lookup(i, element_len - 1));
197 }
198
199 for level in (1..element_len).rev() {
200 // combine histograms of prefixes
201 for h in 0..1 << level {
202 histogram[h] = histogram[2 * h] + histogram[2 * h + 1];
203 }
204
205 // compute borders of current level, using bit reverse patterns because of the weird
206 // node ordering in wavelet matrices
207 borders[0] = 0;
208 for h in 1usize..1 << level {
209 let h_minus_1 = (h - 1).reverse_bits() >> (64 - level);
210 borders[h.reverse_bits() >> (64 - level)] =
211 borders[h_minus_1] + histogram[h_minus_1];
212 }
213
214 for i in 0..num_elements {
215 let bit = bit_lookup(i, element_len - level - 1);
216 data[level].set_unchecked(
217 borders[element_lookup(i) as usize >> (element_len - level)],
218 bit,
219 );
220 borders[element_lookup(i) as usize >> (element_len - level)] += 1;
221 }
222 }
223
224 Self {
225 data: data.into_iter().map(BitVec::into).collect(),
226 }
227 }
228
229 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
230 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
231 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
232 /// so it is only recommended for small alphabets.
233 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
234 ///
235 /// # Parameters
236 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
237 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
238 /// significant bit.
239 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
240 ///
241 /// # Panics
242 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element,
243 /// or if the number of bits per element exceeds 64.
244 ///
245 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
246 /// [`from_slice`]: WaveletMatrix::from_slice
247 #[must_use]
248 pub fn from_bit_vec_pc(bit_vec: &BitVec, bits_per_element: u16) -> Self {
249 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
250 assert!(
251 bits_per_element <= 64,
252 "The number of bits per element cannot exceed 64."
253 );
254 let num_elements = bit_vec.len() / bits_per_element as usize;
255 Self::prefix_counting(
256 bits_per_element,
257 num_elements,
258 |element, bit| bit_vec.get_unchecked(element * bits_per_element as usize + bit),
259 |element| {
260 bit_vec.get_bits_unchecked(
261 element * bits_per_element as usize,
262 bits_per_element as usize,
263 )
264 },
265 )
266 }
267
268 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
269 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
270 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
271 /// so it is only recommended for small alphabets.
272 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
273 ///
274 /// # Parameters
275 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
276 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
277 ///
278 /// # Panics
279 /// Panics if the number of bits per element exceeds 64.
280 ///
281 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
282 /// [`from_slice`]: WaveletMatrix::from_slice
283 #[must_use]
284 pub fn from_slice_pc(sequence: &[u64], bits_per_element: u16) -> Self {
285 assert!(
286 bits_per_element <= 64,
287 "The number of bits per element cannot exceed 64."
288 );
289 Self::prefix_counting(
290 bits_per_element,
291 sequence.len(),
292 |element, bit| (sequence[element] >> bit) & 1,
293 |element| sequence[element],
294 )
295 }
296
297 /// Generic function to read a value from the wavelet matrix and consume it with a closure.
298 /// The function is used by the `get_value` and `get_u64` functions, deduplicating code.
299 #[inline(always)]
300 fn reconstruct_value_unchecked<F: FnMut(u64)>(&self, mut i: usize, mut target_func: F) {
301 for level in 0..self.bits_per_element() {
302 let bit = self.data[level].get_unchecked(i);
303 target_func(bit);
304 if bit == 0 {
305 i = self.data[level].rank0(i);
306 } else {
307 i = self.data[level].rank0 + self.data[level].rank1(i);
308 }
309 }
310 }
311
312 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
313 /// The `k`-bit word is returned as a `BitVec`.
314 /// The first element of the bit vector is the least significant bit.
315 ///
316 /// Returns `None` if the index is out of bounds.
317 ///
318 /// # Example
319 /// ```
320 /// use vers_vecs::{BitVec, WaveletMatrix};
321 ///
322 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
323 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
324 ///
325 /// assert_eq!(wavelet_matrix.get_value(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
326 /// assert_eq!(wavelet_matrix.get_value(1), Some(BitVec::pack_sequence_u8(&[4], 3)));
327 /// assert_eq!(wavelet_matrix.get_value(100), None);
328 /// ```
329 #[must_use]
330 pub fn get_value(&self, i: usize) -> Option<BitVec> {
331 if self.data.is_empty() || i >= self.data[0].len() {
332 None
333 } else {
334 Some(self.get_value_unchecked(i))
335 }
336 }
337
338 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
339 /// The `k`-bit word is returned as a `BitVec`.
340 /// The first element of the bit vector is the least significant bit.
341 /// This function does not perform bounds checking.
342 /// Use [`get_value`] for a checked version.
343 ///
344 /// # Panics
345 /// May panic if the index is out of bounds. May instead return an empty bit vector.
346 ///
347 /// [`get_value`]: WaveletMatrix::get_value
348 #[must_use]
349 pub fn get_value_unchecked(&self, i: usize) -> BitVec {
350 let mut value = BitVec::from_zeros(self.bits_per_element());
351 let mut level = self.bits_per_element() - 1;
352 self.reconstruct_value_unchecked(i, |bit| {
353 value.set_unchecked(level, bit);
354 level = level.saturating_sub(1);
355 });
356 value
357 }
358
359 /// Get the `i`-th element of the encoded sequence as a `u64`.
360 /// The `u64` is constructed from the `k`-bit word stored in the wavelet matrix.
361 ///
362 /// Returns `None` if the index is out of bounds, or if the number of bits per element exceeds 64.
363 ///
364 /// # Example
365 /// ```
366 /// use vers_vecs::{BitVec, WaveletMatrix};
367 ///
368 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
369 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
370 ///
371 /// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
372 /// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
373 /// assert_eq!(wavelet_matrix.get_u64(100), None);
374 /// ```
375 #[must_use]
376 pub fn get_u64(&self, i: usize) -> Option<u64> {
377 if self.bits_per_element() > 64 || self.data.is_empty() || i >= self.data[0].len() {
378 None
379 } else {
380 Some(self.get_u64_unchecked(i))
381 }
382 }
383
384 /// Get the `i`-th element of the encoded sequence as a `u64` numeral.
385 /// The element is encoded in the lowest `k` bits of the numeral.
386 /// If the number of bits per element exceeds 64, the value is truncated.
387 /// This function does not perform bounds checking.
388 /// Use [`get_u64`] for a checked version.
389 ///
390 /// # Panic
391 /// May panic if the value of `i` is out of bounds. May instead return 0.
392 ///
393 /// [`get_u64`]: WaveletMatrix::get_u64
394 #[must_use]
395 pub fn get_u64_unchecked(&self, i: usize) -> u64 {
396 let mut value = 0;
397 self.reconstruct_value_unchecked(i, |bit| {
398 value <<= 1;
399 value |= bit;
400 });
401 value
402 }
403
404 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
405 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
406 /// where the least significant bit is the first element, and `k` is the number of bits per element
407 /// in the wavelet matrix.
408 ///
409 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
410 /// `k`-bit word.
411 /// Use [`rank_range`] for a checked version.
412 ///
413 /// # Panics
414 /// May panic if the `range` is out of bounds,
415 /// or if the number of bits in `symbol` is lower than `k`.
416 /// May instead return 0.
417 ///
418 /// [`BitVec`]: BitVec
419 /// [`rank_range`]: WaveletMatrix::rank_range
420 #[must_use]
421 pub fn rank_range_unchecked(&self, mut range: Range<usize>, symbol: &BitVec) -> usize {
422 for (level, data) in self.data.iter().enumerate() {
423 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
424 range.start = data.rank0(range.start);
425 range.end = data.rank0(range.end);
426 } else {
427 range.start = data.rank0 + data.rank1(range.start);
428 range.end = data.rank0 + data.rank1(range.end);
429 }
430 }
431
432 range.end - range.start
433 }
434
435 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
436 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
437 /// where the least significant bit is the first element, and `k` is the number of bits per element
438 /// in the wavelet matrix.
439 ///
440 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
441 /// but since it is exclusive, it may be equal to the length),
442 /// or if the number of bits in `symbol` is not equal to `k`.
443 ///
444 /// # Example
445 /// ```
446 /// use vers_vecs::{BitVec, WaveletMatrix};
447 ///
448 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
449 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
450 ///
451 /// assert_eq!(wavelet_matrix.rank_range(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
452 /// assert_eq!(wavelet_matrix.rank_range(2..4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
453 /// ```
454 ///
455 /// [`BitVec`]: BitVec
456 #[must_use]
457 pub fn rank_range(&self, range: Range<usize>, symbol: &BitVec) -> Option<usize> {
458 if range.start >= self.len()
459 || range.end > self.len()
460 || symbol.len() != self.bits_per_element()
461 {
462 None
463 } else {
464 Some(self.rank_range_unchecked(range, symbol))
465 }
466 }
467
468 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
469 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
470 /// where k is less than or equal to 64.
471 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
472 ///
473 /// This method does not perform bounds checking, nor does it check if the elements of the
474 /// wavelet matrix can be represented in a u64 numeral.
475 /// Use [`rank_range_u64`] for a checked version.
476 ///
477 /// # Panics
478 /// May panic if the `range` is out of bounds.
479 /// May instead return 0.
480 /// If the number of bits in wavelet matrix elements exceed `64`, the behavior is
481 /// platform-dependent.
482 ///
483 /// [`rank_range_u64`]: WaveletMatrix::rank_range_u64
484 #[must_use]
485 pub fn rank_range_u64_unchecked(&self, mut range: Range<usize>, symbol: u64) -> usize {
486 for (level, data) in self.data.iter().enumerate() {
487 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
488 range.start = data.rank0(range.start);
489 range.end = data.rank0(range.end);
490 } else {
491 range.start = data.rank0 + data.rank1(range.start);
492 range.end = data.rank0 + data.rank1(range.end);
493 }
494 }
495
496 range.end - range.start
497 }
498
499 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
500 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
501 /// where k is less than or equal to 64.
502 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
503 ///
504 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
505 /// but since it is exclusive, it may be equal to the length),
506 /// or if the number of bits in the wavelet matrix elements exceed `64`.
507 ///
508 /// # Example
509 /// ```
510 /// use vers_vecs::{BitVec, WaveletMatrix};
511 ///
512 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
513 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
514 ///
515 /// assert_eq!(wavelet_matrix.rank_range_u64(0..3, 4), Some(2));
516 /// assert_eq!(wavelet_matrix.rank_range_u64(2..4, 4), Some(1));
517 /// ```
518 #[must_use]
519 pub fn rank_range_u64(&self, range: Range<usize>, symbol: u64) -> Option<usize> {
520 if range.start >= self.len() || range.end > self.len() || self.bits_per_element() > 64 {
521 None
522 } else {
523 Some(self.rank_range_u64_unchecked(range, symbol))
524 }
525 }
526
527 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
528 /// `offset`-th and `i`-th element (exclusive).
529 /// This is equivalent to ```rank_range_unchecked(offset..i, symbol)```.
530 /// The interval is half-open, meaning ```rank_offset_unchecked(0, 0, symbol)``` returns 0.
531 ///
532 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
533 /// where the least significant bit is the first element, and `k` is the number of bits per element
534 /// in the wavelet matrix.
535 ///
536 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
537 /// `k`-bit word.
538 /// Use [`rank_offset`] for a checked version.
539 ///
540 /// # Panics
541 /// May panic if `offset` is out of bounds,
542 /// or if `offset + i` is larger than the length of the encoded sequence,
543 /// or if `offset` is greater than `i`,
544 /// or if the number of bits in `symbol` is lower than `k`.
545 /// May instead return 0.
546 ///
547 /// [`BitVec`]: BitVec
548 /// [`rank_offset`]: WaveletMatrix::rank_offset
549 #[must_use]
550 pub fn rank_offset_unchecked(&self, offset: usize, i: usize, symbol: &BitVec) -> usize {
551 self.rank_range_unchecked(offset..i, symbol)
552 }
553
554 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
555 /// `offset`-th and `i`-th element (exclusive).
556 /// This is equivalent to ``rank_range(offset..i, symbol)``.
557 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0,
558 /// because the interval is empty.
559 ///
560 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
561 /// where the least significant bit is the first element, and `k` is the number of bits per element
562 /// in the wavelet matrix.
563 ///
564 /// Returns `None` if `offset` is out of bounds,
565 /// or if `i` is larger than the length of the encoded sequence,
566 /// or if `offset` is greater than `i`,
567 /// or if the number of bits in `symbol` is not equal to `k`.
568 /// `i` may equal the length of the encoded sequence,
569 /// which will return the number of occurrences of the symbol up to the end of the sequence.
570 ///
571 /// # Example
572 /// ```
573 /// use vers_vecs::{BitVec, WaveletMatrix};
574 ///
575 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
576 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
577 ///
578 /// assert_eq!(wavelet_matrix.rank_offset(0, 3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
579 /// assert_eq!(wavelet_matrix.rank_offset(2, 4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
580 /// ```
581 ///
582 /// [`BitVec`]: BitVec
583 #[must_use]
584 pub fn rank_offset(&self, offset: usize, i: usize, symbol: &BitVec) -> Option<usize> {
585 if offset > i
586 || offset >= self.len()
587 || i > self.len()
588 || symbol.len() != self.bits_per_element()
589 {
590 None
591 } else {
592 Some(self.rank_offset_unchecked(offset, i, symbol))
593 }
594 }
595
596 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
597 /// `offset`-th and `i`-th element (exclusive).
598 /// This is equivalent to ``rank_range(offset..i, symbol)``.
599 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
600 ///
601 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
602 /// where k is less than or equal to 64.
603 ///
604 /// This method does not perform bounds checking, nor does it check if the elements of the
605 /// wavelet matrix can be represented in a u64 numeral.
606 /// Use [`rank_offset_u64`] for a checked version.
607 ///
608 /// # Panics
609 /// May panic if `offset` is out of bounds,
610 /// or if `i` is larger than the length of the encoded sequence,
611 /// or if `offset` is greater than `i`,
612 /// or if the number of bits in wavelet matrix elements exceed `64`.
613 /// May instead return 0.
614 ///
615 /// [`rank_offset_u64`]: WaveletMatrix::rank_offset_u64
616 #[must_use]
617 pub fn rank_offset_u64_unchecked(&self, offset: usize, i: usize, symbol: u64) -> usize {
618 self.rank_range_u64_unchecked(offset..i, symbol)
619 }
620
621 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
622 /// `offset`-th and `i`-th element (exclusive).
623 /// This is equivalent to ``rank_range(offset..i, symbol)``.
624 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
625 ///
626 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
627 /// where k is less than or equal to 64.
628 ///
629 /// Returns `None` if `offset` is out of bounds,
630 /// or if `i` is larger than the length of the encoded sequence,
631 /// or if `offset` is greater than `i`,
632 /// or if the number of bits in the wavelet matrix elements exceed `64`.
633 /// `i` may equal the length of the encoded sequence,
634 /// which will return the number of occurrences of the symbol up to the end of the sequence.
635 ///
636 /// # Example
637 /// ```
638 /// use vers_vecs::{BitVec, WaveletMatrix};
639 ///
640 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
641 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
642 ///
643 /// assert_eq!(wavelet_matrix.rank_offset_u64(0, 3, 4), Some(2));
644 /// assert_eq!(wavelet_matrix.rank_offset_u64(2, 4, 4), Some(1));
645 /// ```
646 #[must_use]
647 pub fn rank_offset_u64(&self, offset: usize, i: usize, symbol: u64) -> Option<usize> {
648 if offset > i || offset >= self.len() || i > self.len() || self.bits_per_element() > 64 {
649 None
650 } else {
651 Some(self.rank_offset_u64_unchecked(offset, i, symbol))
652 }
653 }
654
655 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
656 /// element (exclusive).
657 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
658 /// where the least significant bit is the first element, and `k` is the number of bits per element
659 /// in the wavelet matrix.
660 ///
661 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
662 /// `k`-bit word.
663 /// Use [`rank`] for a checked version.
664 ///
665 /// # Panics
666 /// May panic if `i` is out of bounds, or if the number of bits in `symbol` is lower than `k`.
667 /// May instead return 0.
668 /// If the number of bits in `symbol` exceeds `k`, the remaining bits are ignored.
669 ///
670 /// [`BitVec`]: BitVec
671 /// [`rank`]: WaveletMatrix::rank
672 #[must_use]
673 pub fn rank_unchecked(&self, i: usize, symbol: &BitVec) -> usize {
674 self.rank_range_unchecked(0..i, symbol)
675 }
676
677 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
678 /// element (exclusive).
679 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
680 /// where the least significant bit is the first element, and `k` is the number of bits per element
681 /// in the wavelet matrix.
682 ///
683 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
684 /// since it is exclusive, it may be equal to the length),
685 /// or if the number of bits in `symbol` is not equal to `k`.
686 ///
687 /// # Example
688 /// ```
689 /// use vers_vecs::{BitVec, WaveletMatrix};
690 ///
691 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
692 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
693 ///
694 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
695 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[1], 3)), Some(1));
696 /// ```
697 ///
698 /// [`BitVec`]: BitVec
699 #[must_use]
700 pub fn rank(&self, i: usize, symbol: &BitVec) -> Option<usize> {
701 if i > self.len() || symbol.len() != self.bits_per_element() {
702 None
703 } else {
704 Some(self.rank_range_unchecked(0..i, symbol))
705 }
706 }
707
708 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
709 /// element (exclusive).
710 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
711 /// where k is less than or equal to 64.
712 ///
713 /// This method does not perform bounds checking, nor does it check if the elements of the
714 /// wavelet matrix can be represented in a u64 numeral.
715 /// Use [`rank_u64`] for a checked version.
716 ///
717 /// # Panics
718 /// May panic if `i` is out of bounds,
719 /// or if the number of bits in wavelet matrix elements exceed `64`.
720 /// May instead return 0.
721 ///
722 /// [`rank_u64`]: WaveletMatrix::rank_u64
723 #[must_use]
724 pub fn rank_u64_unchecked(&self, i: usize, symbol: u64) -> usize {
725 self.rank_range_u64_unchecked(0..i, symbol)
726 }
727
728 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
729 /// element (exclusive).
730 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
731 /// where k is less than or equal to 64.
732 ///
733 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
734 /// since it is exclusive, it may be equal to the length),
735 /// or if the number of bits in the wavelet matrix elements exceed `64`.
736 ///
737 /// # Example
738 /// ```
739 /// use vers_vecs::{BitVec, WaveletMatrix};
740 ///
741 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
742 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
743 ///
744 /// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
745 /// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
746 /// ```
747 #[must_use]
748 pub fn rank_u64(&self, i: usize, symbol: u64) -> Option<usize> {
749 if i > self.len() || self.bits_per_element() > 64 {
750 None
751 } else {
752 Some(self.rank_range_u64_unchecked(0..i, symbol))
753 }
754 }
755
756 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
757 /// starting from the `offset`-th element.
758 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
759 /// where the least significant bit is the first element, and `k` is the number of bits per element
760 /// in the wavelet matrix.
761 ///
762 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
763 /// `k`-bit word.
764 /// Use [`select_offset`] for a checked version.
765 ///
766 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
767 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
768 ///
769 /// # Panics
770 /// May panic if the `offset` is out of bounds,
771 /// or if the number of bits in `symbol` is lower than `k`.
772 /// May instead return the length of the encoded sequence.
773 ///
774 /// [`BitVec`]: BitVec
775 /// [`select_offset`]: WaveletMatrix::select_offset
776 #[must_use]
777 pub fn select_offset_unchecked(&self, offset: usize, rank: usize, symbol: &BitVec) -> usize {
778 let mut range_start = offset;
779
780 for (level, data) in self.data.iter().enumerate() {
781 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
782 range_start = data.rank0(range_start);
783 } else {
784 range_start = data.rank0 + data.rank1(range_start);
785 }
786 }
787
788 let mut range_end = range_start + rank;
789
790 for (level, data) in self.data.iter().enumerate().rev() {
791 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
792 range_end = data.select0(range_end);
793 } else {
794 range_end = data.select1(range_end - data.rank0);
795 }
796 }
797
798 range_end
799 }
800
801 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
802 /// starting from the `offset`-th element.
803 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
804 /// where the least significant bit is the first element, and `k` is the number of bits per element
805 /// in the wavelet matrix.
806 ///
807 /// Returns `None` if `offset` is out of bounds, or if the number of bits in `symbol` is not equal to `k`,
808 /// or if the `rank`-th occurrence of the `symbol` does not exist.
809 ///
810 /// # Example
811 /// ```
812 /// use vers_vecs::{BitVec, WaveletMatrix};
813 ///
814 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
815 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
816 ///
817 /// assert_eq!(wavelet_matrix.select_offset(0, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
818 /// assert_eq!(wavelet_matrix.select_offset(0, 1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
819 /// assert_eq!(wavelet_matrix.select_offset(2, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
820 /// assert_eq!(wavelet_matrix.select_offset(2, 1, &BitVec::pack_sequence_u8(&[4], 3)), None);
821 /// ```
822 ///
823 /// [`BitVec`]: BitVec
824 #[must_use]
825 pub fn select_offset(&self, offset: usize, rank: usize, symbol: &BitVec) -> Option<usize> {
826 if offset >= self.len() || symbol.len() != self.bits_per_element() {
827 None
828 } else {
829 let idx = self.select_offset_unchecked(offset, rank, symbol);
830 if idx < self.len() {
831 Some(idx)
832 } else {
833 None
834 }
835 }
836 }
837
838 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
839 /// starting from the `offset`-th element.
840 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
841 /// where k is less than or equal to 64.
842 ///
843 /// This method does not perform bounds checking, nor does it check if the elements of the
844 /// wavelet matrix can be represented in a u64 numeral.
845 /// Use [`select_offset_u64`] for a checked version.
846 ///
847 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
848 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
849 ///
850 /// # Panics
851 /// May panic if the `offset` is out of bounds,
852 /// or if the number of bits in wavelet matrix elements exceed `64`.
853 /// May instead return the length of the encoded sequence.
854 ///
855 /// [`select_offset_u64`]: WaveletMatrix::select_offset_u64
856 #[must_use]
857 pub fn select_offset_u64_unchecked(&self, offset: usize, rank: usize, symbol: u64) -> usize {
858 let mut range_start = offset;
859
860 for (level, data) in self.data.iter().enumerate() {
861 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
862 range_start = data.rank0(range_start);
863 } else {
864 range_start = data.rank0 + data.rank1(range_start);
865 }
866 }
867
868 let mut range_end = range_start + rank;
869
870 for (level, data) in self.data.iter().enumerate().rev() {
871 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
872 range_end = data.select0(range_end);
873 } else {
874 range_end = data.select1(range_end - data.rank0);
875 }
876 }
877
878 range_end
879 }
880
881 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
882 /// starting from the `offset`-th element.
883 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
884 /// where k is less than or equal to 64.
885 ///
886 /// Returns `None` if `offset` is out of bounds, or if the number of bits in the wavelet matrix
887 /// elements exceed `64`, or if the `rank`-th occurrence of the `symbol` does not exist.
888 ///
889 /// # Example
890 /// ```
891 /// use vers_vecs::{BitVec, WaveletMatrix};
892 ///
893 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
894 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
895 ///
896 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 0, 4), Some(1));
897 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 1, 4), Some(2));
898 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 0, 4), Some(2));
899 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 1, 4), None);
900 /// ```
901 #[must_use]
902 pub fn select_offset_u64(&self, offset: usize, rank: usize, symbol: u64) -> Option<usize> {
903 if offset >= self.len() || self.bits_per_element() > 64 {
904 None
905 } else {
906 let idx = self.select_offset_u64_unchecked(offset, rank, symbol);
907 if idx < self.len() {
908 Some(idx)
909 } else {
910 None
911 }
912 }
913 }
914
915 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
916 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
917 /// where the least significant bit is the first element, and `k` is the number of bits per element
918 /// in the wavelet matrix.
919 ///
920 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
921 /// `k`-bit word.
922 /// Use [`select`] for a checked version.
923 ///
924 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
925 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
926 ///
927 /// # Panics
928 /// May panic if the number of bits in `symbol` is not equal to `k`.
929 /// May instead return the length of the encoded sequence.
930 ///
931 /// [`BitVec`]: BitVec
932 /// [`select`]: WaveletMatrix::select
933 #[must_use]
934 pub fn select_unchecked(&self, rank: usize, symbol: &BitVec) -> usize {
935 self.select_offset_unchecked(0, rank, symbol)
936 }
937
938 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
939 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
940 /// where the least significant bit is the first element, and `k` is the number of bits per element
941 /// in the wavelet matrix.
942 ///
943 /// Returns `None` if the number of bits in `symbol` is not equal to `k`,
944 /// or if the `rank`-th occurrence of the `symbol` does not exist.
945 ///
946 /// # Example
947 /// ```
948 /// use vers_vecs::{BitVec, WaveletMatrix};
949 ///
950 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
951 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
952 ///
953 /// assert_eq!(wavelet_matrix.select(0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
954 /// assert_eq!(wavelet_matrix.select(1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
955 /// ```
956 ///
957 /// [`BitVec`]: BitVec
958 #[must_use]
959 pub fn select(&self, rank: usize, symbol: &BitVec) -> Option<usize> {
960 if symbol.len() == self.bits_per_element() {
961 let idx = self.select_unchecked(rank, symbol);
962 if idx < self.len() {
963 Some(idx)
964 } else {
965 None
966 }
967 } else {
968 None
969 }
970 }
971
972 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
973 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
974 /// where k is less than or equal to 64.
975 ///
976 /// This method does not perform bounds checking, nor does it check if the elements of the
977 /// wavelet matrix can be represented in a u64 numeral.
978 /// Use [`select_u64`] for a checked version.
979 ///
980 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
981 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
982 ///
983 /// # Panics
984 /// May panic if the number of bits in wavelet matrix elements exceed `64`.
985 /// May instead return the length of the encoded sequence.
986 ///
987 /// [`select_u64`]: WaveletMatrix::select_u64
988 #[must_use]
989 pub fn select_u64_unchecked(&self, rank: usize, symbol: u64) -> usize {
990 self.select_offset_u64_unchecked(0, rank, symbol)
991 }
992
993 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
994 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
995 /// where k is less than or equal to 64.
996 ///
997 /// Returns `None` if the number of bits in the wavelet matrix elements exceed `64`,
998 /// or if the `rank`-th occurrence of the `symbol` does not exist.
999 ///
1000 /// # Example
1001 /// ```
1002 /// use vers_vecs::{BitVec, WaveletMatrix};
1003 ///
1004 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1005 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1006 ///
1007 /// assert_eq!(wavelet_matrix.select_u64(0, 4), Some(1));
1008 /// assert_eq!(wavelet_matrix.select_u64(1, 4), Some(2));
1009 /// ```
1010 #[must_use]
1011 pub fn select_u64(&self, rank: usize, symbol: u64) -> Option<usize> {
1012 if self.bits_per_element() > 64 {
1013 None
1014 } else {
1015 let idx = self.select_u64_unchecked(rank, symbol);
1016 if idx < self.len() {
1017 Some(idx)
1018 } else {
1019 None
1020 }
1021 }
1022 }
1023
1024 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1025 /// where `k = 0` returns the smallest element.
1026 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1027 /// The `k`-th smallest element is returned as a `BitVec`,
1028 /// where the least significant bit is the first element.
1029 ///
1030 /// This method does not perform bounds checking.
1031 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1032 /// Use [`quantile`] for a checked version.
1033 ///
1034 /// # Panics
1035 /// May panic if the `range` is out of bounds. May instead return an empty bit vector.
1036 ///
1037 /// [`quantile`]: WaveletMatrix::quantile
1038 #[must_use]
1039 pub fn quantile_unchecked(&self, range: Range<usize>, k: usize) -> BitVec {
1040 let result = BitVec::from_zeros(self.bits_per_element());
1041
1042 self.partial_quantile_search_unchecked(range, k, 0, result)
1043 }
1044
1045 /// Internal function to reuse the quantile code for the predecessor and successor search.
1046 /// This function performs the quantile search starting at the given level with the given prefix.
1047 ///
1048 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1049 /// and that the prefix is a valid prefix for the given level.
1050 #[inline(always)]
1051 fn partial_quantile_search_unchecked(
1052 &self,
1053 mut range: Range<usize>,
1054 mut k: usize,
1055 start_level: usize,
1056 mut prefix: BitVec,
1057 ) -> BitVec {
1058 debug_assert!(prefix.len() == self.bits_per_element());
1059 debug_assert!(!range.is_empty());
1060 debug_assert!(range.end <= self.len());
1061
1062 for (level, data) in self.data.iter().enumerate().skip(start_level) {
1063 let zeros_start = data.rank0(range.start);
1064 let zeros_end = data.rank0(range.end);
1065 let zeros = zeros_end - zeros_start;
1066
1067 // if k < zeros, the element is among the zeros
1068 if k < zeros {
1069 range.start = zeros_start;
1070 range.end = zeros_end;
1071 } else {
1072 // the element is among the ones, so we set the bit to 1, and move the range
1073 // into the 1-partition of the next level
1074 prefix.set_unchecked((self.bits_per_element() - 1) - level, 1);
1075 k -= zeros;
1076 range.start = data.rank0 + (range.start - zeros_start); // range.start - zeros_start is the rank1 of range.start
1077 range.end = data.rank0 + (range.end - zeros_end); // same here
1078 }
1079 }
1080
1081 prefix
1082 }
1083
1084 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1085 /// where `k = 0` returns the smallest element.
1086 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1087 /// The `k`-th smallest element is returned as a [`BitVec`],
1088 /// where the least significant bit is the first element.
1089 ///
1090 /// Returns `None` if the `range` is out of bounds, or if `k` is greater than the size of the range.
1091 ///
1092 /// # Example
1093 /// ```
1094 /// use vers_vecs::{BitVec, WaveletMatrix};
1095 ///
1096 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1097 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1098 ///
1099 /// assert_eq!(wavelet_matrix.quantile(0..3, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1100 /// assert_eq!(wavelet_matrix.quantile(0..3, 1), Some(BitVec::pack_sequence_u8(&[4], 3)));
1101 /// assert_eq!(wavelet_matrix.quantile(1..4, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1102 /// ```
1103 #[must_use]
1104 pub fn quantile(&self, range: Range<usize>, k: usize) -> Option<BitVec> {
1105 if range.start >= self.len() || range.end > self.len() || k >= range.end - range.start {
1106 None
1107 } else {
1108 Some(self.quantile_unchecked(range, k))
1109 }
1110 }
1111
1112 /// Get the `i`-th smallest element in the entire wavelet matrix.
1113 /// The `i`-th smallest element is returned as a `BitVec`,
1114 /// where the least significant bit is the first element.
1115 ///
1116 /// This method does not perform bounds checking.
1117 /// Use [`get_sorted`] for a checked version.
1118 ///
1119 /// # Panics
1120 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1121 ///
1122 /// [`get_sorted`]: Self::get_sorted
1123 #[must_use]
1124 pub fn get_sorted_unchecked(&self, i: usize) -> BitVec {
1125 self.quantile_unchecked(0..self.len(), i)
1126 }
1127
1128 /// Get the `i`-th smallest element in the wavelet matrix.
1129 /// The `i`-th smallest element is returned as a `BitVec`,
1130 /// where the least significant bit is the first element.
1131 /// This method call is equivalent to `self.quantile(0..self.len(), i)`.
1132 ///
1133 /// Returns `None` if the `i` is out of bounds.
1134 ///
1135 /// # Example
1136 /// ```
1137 /// use vers_vecs::{BitVec, WaveletMatrix};
1138 ///
1139 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1140 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1141 ///
1142 /// assert_eq!(wavelet_matrix.get_sorted(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1143 /// assert_eq!(wavelet_matrix.get_sorted(1), Some(BitVec::pack_sequence_u8(&[1], 3)));
1144 /// assert_eq!(wavelet_matrix.get_sorted(2), Some(BitVec::pack_sequence_u8(&[2], 3)));
1145 /// ```
1146 #[must_use]
1147 pub fn get_sorted(&self, i: usize) -> Option<BitVec> {
1148 if i >= self.len() {
1149 None
1150 } else {
1151 Some(self.get_sorted_unchecked(i))
1152 }
1153 }
1154
1155 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1156 /// where `k = 0` returns the smallest element.
1157 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1158 /// The `k`-th smallest element is returned as a `u64` numeral.
1159 /// If the number of bits per element exceeds 64, the value is truncated.
1160 ///
1161 /// This method does not perform bounds checking.
1162 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1163 /// Use [`quantile_u64`] for a checked version.
1164 ///
1165 /// # Panics
1166 /// May panic if the `range` is out of bounds.
1167 /// May instead return 0.
1168 ///
1169 /// [`quantile_u64`]: WaveletMatrix::quantile_u64
1170 #[must_use]
1171 pub fn quantile_u64_unchecked(&self, range: Range<usize>, k: usize) -> u64 {
1172 self.partial_quantile_search_u64_unchecked(range, k, 0, 0)
1173 }
1174
1175 /// Internal function to reuse the quantile code for the predecessor and successor search.
1176 /// This function performs the quantile search starting at the given level with the given prefix.
1177 ///
1178 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1179 /// and that the prefix is a valid prefix for the given level (i.e. the prefix is not shifted
1180 /// to another level).
1181 #[inline(always)]
1182 fn partial_quantile_search_u64_unchecked(
1183 &self,
1184 mut range: Range<usize>,
1185 mut k: usize,
1186 start_level: usize,
1187 mut prefix: u64,
1188 ) -> u64 {
1189 debug_assert!(self.bits_per_element() <= 64);
1190 debug_assert!(!range.is_empty());
1191 debug_assert!(range.end <= self.len());
1192
1193 for data in self.data.iter().skip(start_level) {
1194 prefix <<= 1;
1195 let zeros_start = data.rank0(range.start);
1196 let zeros_end = data.rank0(range.end);
1197 let zeros = zeros_end - zeros_start;
1198
1199 if k < zeros {
1200 range.start = zeros_start;
1201 range.end = zeros_end;
1202 } else {
1203 prefix |= 1;
1204 k -= zeros;
1205 range.start = data.rank0 + (range.start - zeros_start);
1206 range.end = data.rank0 + (range.end - zeros_end);
1207 }
1208 }
1209
1210 prefix
1211 }
1212
1213 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1214 /// where `k = 0` returns the smallest element.
1215 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1216 /// The `k`-th smallest element is returned as a `u64` numeral.
1217 ///
1218 /// Returns `None` if the `range` is out of bounds, or if the number of bits per element exceeds 64,
1219 /// or if `k` is greater than the size of the range.
1220 ///
1221 /// # Example
1222 /// ```
1223 /// use vers_vecs::{BitVec, WaveletMatrix};
1224 ///
1225 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1226 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1227 ///
1228 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 0), Some(1));
1229 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 1), Some(4));
1230 /// assert_eq!(wavelet_matrix.quantile_u64(1..4, 0), Some(1));
1231 /// ```
1232 #[must_use]
1233 pub fn quantile_u64(&self, range: Range<usize>, k: usize) -> Option<u64> {
1234 if range.start >= self.len()
1235 || range.end > self.len()
1236 || self.bits_per_element() > 64
1237 || k >= range.end - range.start
1238 {
1239 None
1240 } else {
1241 Some(self.quantile_u64_unchecked(range, k))
1242 }
1243 }
1244
1245 /// Get the `i`-th smallest element in the wavelet matrix.
1246 /// The `i`-th smallest element is returned as a u64 numeral.
1247 ///
1248 /// If the number of bits per element exceeds 64, the value is truncated.
1249 ///
1250 /// This method does not perform bounds checking.
1251 /// Use [`get_sorted_u64`] for a checked version.
1252 ///
1253 /// # Panics
1254 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1255 ///
1256 /// [`get_sorted_u64`]: WaveletMatrix::get_sorted_u64
1257 #[must_use]
1258 pub fn get_sorted_u64_unchecked(&self, i: usize) -> u64 {
1259 self.quantile_u64_unchecked(0..self.len(), i)
1260 }
1261
1262 /// Get the `i`-th smallest element in the entire wavelet matrix.
1263 /// The `i`-th smallest element is returned as a u64 numeral.
1264 ///
1265 /// Returns `None` if the `i` is out of bounds, or if the number of bits per element exceeds 64.
1266 ///
1267 /// # Example
1268 /// ```
1269 /// use vers_vecs::{BitVec, WaveletMatrix};
1270 ///
1271 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1272 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1273 ///
1274 /// assert_eq!(wavelet_matrix.get_sorted_u64(0), Some(1));
1275 /// assert_eq!(wavelet_matrix.get_sorted_u64(1), Some(1));
1276 /// assert_eq!(wavelet_matrix.get_sorted_u64(2), Some(2));
1277 /// ```
1278 #[must_use]
1279 pub fn get_sorted_u64(&self, i: usize) -> Option<u64> {
1280 if i >= self.len() || self.bits_per_element() > 64 {
1281 None
1282 } else {
1283 Some(self.get_sorted_u64_unchecked(i))
1284 }
1285 }
1286
1287 /// Get the smallest element in the encoded sequence in the specified `range`.
1288 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1289 /// The smallest element is returned as a `BitVec`,
1290 ///
1291 /// This method does not perform bounds checking.
1292 /// Use [`range_min`] for a checked version.
1293 ///
1294 /// # Panics
1295 /// May panic if the `range` is out of bounds or if the range is empty.
1296 /// May instead return an empty bit vector.
1297 ///
1298 /// [`range_min`]: WaveletMatrix::range_min
1299 #[must_use]
1300 pub fn range_min_unchecked(&self, range: Range<usize>) -> BitVec {
1301 self.quantile_unchecked(range, 0)
1302 }
1303
1304 /// Get the smallest element in the encoded sequence in the specified `range`.
1305 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1306 /// The smallest element is returned as a `BitVec`,
1307 ///
1308 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1309 ///
1310 /// # Example
1311 /// ```
1312 /// use vers_vecs::{BitVec, WaveletMatrix};
1313 ///
1314 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1315 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1316 ///
1317 /// assert_eq!(wavelet_matrix.range_min(0..3), Some(BitVec::pack_sequence_u8(&[1], 3)));
1318 /// assert_eq!(wavelet_matrix.range_min(1..4), Some(BitVec::pack_sequence_u8(&[1], 3)));
1319 /// assert_eq!(wavelet_matrix.range_min(1..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1320 /// ```
1321 #[must_use]
1322 pub fn range_min(&self, range: Range<usize>) -> Option<BitVec> {
1323 self.quantile(range, 0)
1324 }
1325
1326 /// Get the smallest element in the encoded sequence in the specified `range`.
1327 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1328 /// The smallest element is returned as a `u64` numeral.
1329 /// If the number of bits per element exceeds 64, the value is truncated.
1330 ///
1331 /// This method does not perform bounds checking.
1332 /// Use [`range_min_u64`] for a checked version.
1333 ///
1334 /// # Panics
1335 /// May panic if the `range` is out of bounds or if the range is empty.
1336 /// May instead return 0.
1337 ///
1338 /// [`range_min_u64`]: WaveletMatrix::range_min_u64
1339 #[must_use]
1340 pub fn range_min_u64_unchecked(&self, range: Range<usize>) -> u64 {
1341 self.quantile_u64_unchecked(range, 0)
1342 }
1343
1344 /// Get the smallest element in the encoded sequence in the specified `range`.
1345 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1346 /// The smallest element is returned as a `u64` numeral.
1347 ///
1348 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1349 /// per element exceeds 64.
1350 ///
1351 /// # Example
1352 /// ```
1353 /// use vers_vecs::{BitVec, WaveletMatrix};
1354 ///
1355 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1356 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1357 ///
1358 /// assert_eq!(wavelet_matrix.range_min_u64(0..3), Some(1));
1359 /// assert_eq!(wavelet_matrix.range_min_u64(1..4), Some(1));
1360 /// assert_eq!(wavelet_matrix.range_min_u64(1..3), Some(4));
1361 /// ```
1362 #[must_use]
1363 pub fn range_min_u64(&self, range: Range<usize>) -> Option<u64> {
1364 self.quantile_u64(range, 0)
1365 }
1366
1367 /// Get the largest element in the encoded sequence in the specified `range`.
1368 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1369 /// The largest element is returned as a `BitVec`,
1370 /// where the least significant bit is the first element.
1371 ///
1372 /// This method does not perform bounds checking.
1373 /// Use [`range_max`] for a checked version.
1374 ///
1375 /// # Panics
1376 /// May panic if the `range` is out of bounds or if the range is empty.
1377 /// May instead return an empty bit vector.
1378 ///
1379 /// [`range_max`]: WaveletMatrix::range_max
1380 #[must_use]
1381 pub fn range_max_unchecked(&self, range: Range<usize>) -> BitVec {
1382 let k = range.end - range.start - 1;
1383 self.quantile_unchecked(range, k)
1384 }
1385
1386 /// Get the largest element in the encoded sequence in the specified `range`.
1387 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1388 /// The largest element is returned as a `BitVec`,
1389 /// where the least significant bit is the first element.
1390 ///
1391 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1392 ///
1393 /// # Example
1394 /// ```
1395 /// use vers_vecs::{BitVec, WaveletMatrix};
1396 ///
1397 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1398 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1399 ///
1400 /// assert_eq!(wavelet_matrix.range_max(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1401 /// assert_eq!(wavelet_matrix.range_max(3..6), Some(BitVec::pack_sequence_u8(&[7], 3)));
1402 /// ```
1403 #[must_use]
1404 pub fn range_max(&self, range: Range<usize>) -> Option<BitVec> {
1405 if range.is_empty() {
1406 None
1407 } else {
1408 let k = range.end - range.start - 1;
1409 self.quantile(range, k)
1410 }
1411 }
1412
1413 /// Get the largest element in the encoded sequence in the specified `range`.
1414 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1415 /// The largest element is returned as a `u64` numeral.
1416 /// If the number of bits per element exceeds 64, the value is truncated.
1417 ///
1418 /// This method does not perform bounds checking.
1419 /// Use [`range_max_u64`] for a checked version.
1420 ///
1421 /// # Panics
1422 /// May panic if the `range` is out of bounds or if the range is empty.
1423 /// May instead return 0.
1424 ///
1425 /// [`range_max_u64`]: WaveletMatrix::range_max_u64
1426 #[must_use]
1427 pub fn range_max_u64_unchecked(&self, range: Range<usize>) -> u64 {
1428 let k = range.end - range.start - 1;
1429 self.quantile_u64_unchecked(range, k)
1430 }
1431
1432 /// Get the largest element in the encoded sequence in the specified `range`.
1433 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1434 /// The largest element is returned as a `u64` numeral.
1435 ///
1436 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1437 /// per element exceeds 64.
1438 ///
1439 /// # Example
1440 /// ```
1441 /// use vers_vecs::{BitVec, WaveletMatrix};
1442 ///
1443 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1444 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1445 ///
1446 /// assert_eq!(wavelet_matrix.range_max_u64(0..3), Some(4));
1447 /// assert_eq!(wavelet_matrix.range_max_u64(3..6), Some(7));
1448 /// ```
1449 #[must_use]
1450 pub fn range_max_u64(&self, range: Range<usize>) -> Option<u64> {
1451 if range.is_empty() {
1452 None
1453 } else {
1454 let k = range.end - range.start - 1;
1455 self.quantile_u64(range, k)
1456 }
1457 }
1458
1459 /// Get the median element in the encoded sequence in the specified `range`.
1460 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1461 /// The median element is returned as a `BitVec`,
1462 /// where the least significant bit is the first element.
1463 ///
1464 /// If the range does not contain an odd number of elements, the position is rounded down.
1465 ///
1466 /// This method does not perform bounds checking.
1467 /// Use [`range_median`] for a checked version.
1468 ///
1469 /// # Panics
1470 /// May panic if the `range` is out of bounds or if the range is empty.
1471 /// May instead return an empty bit vector.
1472 ///
1473 /// [`range_median`]: WaveletMatrix::range_median
1474 #[must_use]
1475 pub fn range_median_unchecked(&self, range: Range<usize>) -> BitVec {
1476 let k = (range.end - 1 - range.start) / 2;
1477 self.quantile_unchecked(range, k)
1478 }
1479
1480 /// Get the median element in the encoded sequence in the specified `range`.
1481 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1482 /// The median element is returned as a `BitVec`,
1483 /// where the least significant bit is the first element.
1484 ///
1485 /// If the range does not contain an odd number of elements, the position is rounded down.
1486 ///
1487 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1488 ///
1489 /// # Example
1490 /// ```
1491 /// use vers_vecs::{BitVec, WaveletMatrix};
1492 ///
1493 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1494 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1495 ///
1496 /// assert_eq!(wavelet_matrix.range_median(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1497 /// assert_eq!(wavelet_matrix.range_median(1..4), Some(BitVec::pack_sequence_u8(&[4], 3)));
1498 /// assert_eq!(wavelet_matrix.range_median(0..6), Some(BitVec::pack_sequence_u8(&[2], 3)));
1499 /// ```
1500 #[must_use]
1501 pub fn range_median(&self, range: Range<usize>) -> Option<BitVec> {
1502 if range.is_empty() {
1503 None
1504 } else {
1505 let k = (range.end - 1 - range.start) / 2;
1506 self.quantile(range, k)
1507 }
1508 }
1509
1510 /// Get the median element in the encoded sequence in the specified `range`.
1511 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1512 /// The median element is returned as a `u64` numeral.
1513 /// If the number of bits per element exceeds 64, the value is truncated.
1514 ///
1515 /// If the range does not contain an odd number of elements, the position is rounded down.
1516 ///
1517 /// This method does not perform bounds checking.
1518 /// Use [`range_median_u64`] for a checked version.
1519 ///
1520 /// # Panics
1521 /// May panic if the `range` is out of bounds or if the range is empty.
1522 /// May instead return 0.
1523 ///
1524 /// [`range_median_u64`]: WaveletMatrix::range_median_u64
1525 #[must_use]
1526 pub fn range_median_u64_unchecked(&self, range: Range<usize>) -> u64 {
1527 let k = (range.end - 1 - range.start) / 2;
1528 self.quantile_u64_unchecked(range, k)
1529 }
1530
1531 /// Get the median element in the encoded sequence in the specified `range`.
1532 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1533 /// The median element is returned as a `u64` numeral.
1534 ///
1535 /// If the range does not contain an odd number of elements, the position is rounded down.
1536 ///
1537 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1538 /// per element exceeds 64.
1539 ///
1540 /// # Example
1541 /// ```
1542 /// use vers_vecs::{BitVec, WaveletMatrix};
1543 ///
1544 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1545 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1546 ///
1547 /// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
1548 /// assert_eq!(wavelet_matrix.range_median_u64(1..4), Some(4));
1549 /// assert_eq!(wavelet_matrix.range_median_u64(0..6), Some(2));
1550 /// ```
1551 #[must_use]
1552 pub fn range_median_u64(&self, range: Range<usize>) -> Option<u64> {
1553 if range.is_empty() || self.bits_per_element() > 64 || range.end > self.len() {
1554 None
1555 } else {
1556 let k = (range.end - 1 - range.start) / 2;
1557 self.quantile_u64(range, k)
1558 }
1559 }
1560
1561 /// Get the predecessor of the given `symbol` in the given `range`.
1562 /// This is a private generic helper function to implement the public `predecessor` functions.
1563 ///
1564 /// The read and write access to the query and result values are abstracted by the `Reader` and `Writer` closures.
1565 #[inline(always)] // even though the function is pretty large, inlining probably gets rid of the closure calls in favor of static calls
1566 fn predecessor_generic_unchecked<
1567 T: Clone,
1568 Reader: Fn(usize, &T) -> u64,
1569 Writer: Fn(u64, usize, &mut T),
1570 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1571 >(
1572 &self,
1573 mut range: Range<usize>,
1574 symbol: &T,
1575 mut result_value: T,
1576 bit_reader: Reader,
1577 result_writer: Writer,
1578 quantile_search: Quantile,
1579 ) -> Option<T> {
1580 // the bit-prefix at the last node where we could go to an interval with smaller elements,
1581 // i.e. where we need to go if the current prefix has no elements smaller than the query
1582 let mut last_smaller_prefix = result_value.clone();
1583 // the level of the last node where we could go to an interval with smaller elements
1584 let mut last_one_level: Option<usize> = None;
1585 // the range of the last node where we could go to an interval with smaller elements
1586 let mut next_smaller_range: Option<Range<usize>> = None;
1587
1588 for (level, data) in self.data.iter().enumerate() {
1589 let query_bit = bit_reader(level, symbol);
1590
1591 // read the amount of elements with the current result-prefix plus one 0 bit.
1592 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1593 // plus one 1 from there
1594 let zeros_start = data.rank0(range.start);
1595 let zeros_end = data.rank0(range.end);
1596 let elements_zero = zeros_end - zeros_start;
1597
1598 if query_bit == 0 {
1599 if elements_zero == 0 {
1600 // if our query bit is zero in this level and suddenly our new interval is empty,
1601 // all elements that were in the previous interval are bigger than the query element,
1602 // because they all have a 1 in the current level.
1603 // this means the predecessor is the largest element in the last smaller interval,
1604 // i.e. the interval that has a 0 bit at the last level where our prefix had a 1 bit.
1605
1606 return next_smaller_range.map(|r| {
1607 let idx = r.end - r.start - 1;
1608 result_writer(0, last_one_level.unwrap(), &mut last_smaller_prefix);
1609 quantile_search(
1610 self,
1611 r,
1612 idx,
1613 last_one_level.unwrap() + 1,
1614 last_smaller_prefix,
1615 )
1616 });
1617 }
1618
1619 // update the prefix
1620 result_writer(0, level, &mut result_value);
1621
1622 // update the range to the interval of the new prefix
1623 range.start = zeros_start;
1624 range.end = zeros_end;
1625 } else {
1626 if elements_zero == range.end - range.start {
1627 // if our query element is 1 in this level and suddenly our new interval is empty,
1628 // all elements that were in the previous interval are smaller than the query element,
1629 // because they all have a 0 in the current level.
1630 // this means the predecessor is the largest element in the last level's interval.
1631
1632 let idx = range.end - range.start - 1;
1633 return Some(quantile_search(self, range, idx, level, result_value));
1634 }
1635
1636 // if the other interval is not empty, we update the last smaller interval to the last interval where we can switch to a 0 prefix
1637 if !(zeros_start..zeros_end).is_empty() {
1638 last_one_level = Some(level);
1639 next_smaller_range = Some(zeros_start..zeros_end);
1640 last_smaller_prefix = result_value.clone();
1641 }
1642
1643 // update the prefix
1644 result_writer(1, level, &mut result_value);
1645
1646 // update the range to the interval of the new prefix
1647 range.start = data.rank0 + (range.start - zeros_start);
1648 range.end = data.rank0 + (range.end - zeros_end);
1649 }
1650 }
1651
1652 Some(result_value)
1653 }
1654
1655 /// Get the predecessor of the given `symbol` in the given `range`.
1656 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1657 /// where the least significant bit is the first element, and `k` is the number of bits per element
1658 /// in the wavelet matrix.
1659 /// The `symbol` does not have to be in the encoded sequence.
1660 /// The predecessor is the largest element in the `range` that is smaller than or equal
1661 /// to the `symbol`.
1662 ///
1663 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1664 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1665 /// or if the `symbol` is smaller than all elements in the range.
1666 ///
1667 /// # Example
1668 /// ```
1669 /// use vers_vecs::{BitVec, WaveletMatrix};
1670 ///
1671 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1672 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1673 ///
1674 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1675 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1676 /// assert_eq!(wavelet_matrix.predecessor(0..6, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[7], 3)));
1677 /// ```
1678 ///
1679 /// [`BitVec`]: BitVec
1680 #[must_use]
1681 pub fn predecessor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1682 if symbol.len() != self.bits_per_element()
1683 || range.is_empty()
1684 || self.is_empty()
1685 || range.end > self.len()
1686 {
1687 return None;
1688 }
1689
1690 self.predecessor_generic_unchecked(
1691 range,
1692 symbol,
1693 BitVec::from_zeros(self.bits_per_element()),
1694 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1695 |bit, level, result| {
1696 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1697 },
1698 Self::partial_quantile_search_unchecked,
1699 )
1700 }
1701
1702 /// Get the predecessor of the given `symbol` in the given `range`.
1703 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1704 /// where k is less than or equal to 64.
1705 /// The `symbol` does not have to be in the encoded sequence.
1706 /// The predecessor is the largest element in the `range` that is smaller than or equal
1707 /// to the `symbol`.
1708 ///
1709 /// Returns `None` if the number of bits in the matrix is greater than 64,
1710 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1711 /// or if the `symbol` is smaller than all elements in the range.
1712 ///
1713 /// # Example
1714 /// ```
1715 /// use vers_vecs::{BitVec, WaveletMatrix};
1716 ///
1717 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1718 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1719 ///
1720 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 7), Some(4));
1721 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 4), Some(4));
1722 /// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 7), Some(7));
1723 /// ```
1724 #[must_use]
1725 pub fn predecessor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1726 if self.bits_per_element() > 64
1727 || range.is_empty()
1728 || self.is_empty()
1729 || range.end > self.len()
1730 {
1731 return None;
1732 }
1733
1734 self.predecessor_generic_unchecked(
1735 range,
1736 &symbol,
1737 0,
1738 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1739 |bit, _level, result| {
1740 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1741 // we have to do that, because the quantile_search_u64 does the same.
1742 *result <<= 1;
1743 *result |= bit;
1744 },
1745 Self::partial_quantile_search_u64_unchecked,
1746 )
1747 }
1748
1749 #[inline(always)]
1750 fn successor_generic_unchecked<
1751 T: Clone,
1752 Reader: Fn(usize, &T) -> u64,
1753 Writer: Fn(u64, usize, &mut T),
1754 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1755 >(
1756 &self,
1757 mut range: Range<usize>,
1758 symbol: &T,
1759 mut result_value: T,
1760 bit_reader: Reader,
1761 result_writer: Writer,
1762 quantile_search: Quantile,
1763 ) -> Option<T> {
1764 // the bit-prefix at the last node where we could go to an interval with larger elements,
1765 // i.e. where we need to go if the current prefix has no elements larger than the query
1766 let mut last_larger_prefix = result_value.clone();
1767 // the level of the last node where we could go to an interval with larger elements
1768 let mut last_zero_level: Option<usize> = None;
1769 // the range of the last node where we could go to an interval with larger elements
1770 let mut next_larger_range: Option<Range<usize>> = None;
1771
1772 for (level, data) in self.data.iter().enumerate() {
1773 let query_bit = bit_reader(level, symbol);
1774
1775 // read the amount of elements with the current result-prefix plus one 0 bit.
1776 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1777 // plus one 1 from there
1778 let zeros_start = data.rank0(range.start);
1779 let zeros_end = data.rank0(range.end);
1780 let elements_zero = zeros_end - zeros_start;
1781
1782 if query_bit == 0 {
1783 if elements_zero == 0 {
1784 // if our query element is 0 in this level and suddenly our new interval is empty,
1785 // all elements that were in the previous interval are larger than the query element,
1786 // because they all have a 1 in the current level.
1787 // this means the successor is the smallest element in the last level's interval.
1788
1789 return Some(quantile_search(self, range, 0, level, result_value));
1790 }
1791
1792 // if the other interval is not empty, we update the last interval where we can switch to a prefix with a 1
1793 if !(data.rank0 + (range.start - zeros_start)..data.rank0 + (range.end - zeros_end))
1794 .is_empty()
1795 {
1796 last_zero_level = Some(level);
1797 next_larger_range = Some(
1798 data.rank0 + (range.start - zeros_start)
1799 ..data.rank0 + (range.end - zeros_end),
1800 );
1801 last_larger_prefix = result_value.clone();
1802 }
1803
1804 // update the prefix
1805 result_writer(0, level, &mut result_value);
1806
1807 // update the range to the interval of the new prefix
1808 range.start = zeros_start;
1809 range.end = zeros_end;
1810 } else {
1811 if elements_zero == range.end - range.start {
1812 // if our query bit is 1 in this level and suddenly our new interval is empty,
1813 // all elements that were in the previous interval are smaller than the query element,
1814 // because they all have a 0 in the current level.
1815 // this means the successor is the smallest element in the last interval with a larger prefix,
1816 // i.e. the interval that has a 1 bit at the last level where our prefix had a 0 bit.
1817
1818 return next_larger_range.map(|r| {
1819 result_writer(1, last_zero_level.unwrap(), &mut last_larger_prefix);
1820 quantile_search(
1821 self,
1822 r,
1823 0,
1824 last_zero_level.unwrap() + 1,
1825 last_larger_prefix,
1826 )
1827 });
1828 }
1829
1830 // update the prefix
1831 result_writer(1, level, &mut result_value);
1832
1833 // update the range to the interval of the new prefix
1834 range.start = data.rank0 + (range.start - zeros_start);
1835 range.end = data.rank0 + (range.end - zeros_end);
1836 }
1837 }
1838
1839 Some(result_value)
1840 }
1841
1842 /// Get the successor of the given `symbol` in the given range.
1843 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1844 /// where the least significant bit is the first element, and `k` is the number of bits per element
1845 /// in the wavelet matrix.
1846 /// The `symbol` does not have to be in the encoded sequence.
1847 /// The successor is the smallest element in the range that is greater than or equal
1848 /// to the `symbol`.
1849 ///
1850 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1851 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1852 /// or if the `symbol` is greater than all elements in the range.
1853 ///
1854 /// # Example
1855 /// ```
1856 /// use vers_vecs::{BitVec, WaveletMatrix};
1857 ///
1858 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1859 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1860 ///
1861 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1862 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[5], 3)), None);
1863 /// assert_eq!(wavelet_matrix.successor(0..6, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[2], 3)));
1864 /// ```
1865 ///
1866 /// [`BitVec`]: BitVec
1867 #[must_use]
1868 pub fn successor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1869 if symbol.len() != self.bits_per_element()
1870 || range.is_empty()
1871 || self.is_empty()
1872 || range.end > self.len()
1873 {
1874 return None;
1875 }
1876
1877 self.successor_generic_unchecked(
1878 range,
1879 symbol,
1880 BitVec::from_zeros(self.bits_per_element()),
1881 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1882 |bit, level, result| {
1883 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1884 },
1885 Self::partial_quantile_search_unchecked,
1886 )
1887 }
1888
1889 /// Get the successor of the given `symbol` in the given range.
1890 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1891 /// where k is less than or equal to 64.
1892 /// The `symbol` does not have to be in the encoded sequence.
1893 /// The successor is the smallest element in the range that is greater than or equal
1894 /// to the `symbol`.
1895 ///
1896 /// Returns `None` if the number of bits in the matrix is greater than 64,
1897 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1898 /// or if the `symbol` is greater than all elements in the range.
1899 ///
1900 /// # Example
1901 /// ```
1902 /// use vers_vecs::{BitVec, WaveletMatrix};
1903 ///
1904 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1905 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1906 ///
1907 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 2), Some(4));
1908 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 5), None);
1909 /// assert_eq!(wavelet_matrix.successor_u64(0..6, 2), Some(2));
1910 /// ```
1911 #[must_use]
1912 pub fn successor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1913 if self.bits_per_element() > 64
1914 || range.is_empty()
1915 || self.is_empty()
1916 || range.end > self.len()
1917 {
1918 return None;
1919 }
1920
1921 self.successor_generic_unchecked(
1922 range,
1923 &symbol,
1924 0,
1925 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1926 |bit, _level, result| {
1927 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1928 // we have to do that, because the quantile_search_u64 does the same.
1929 *result <<= 1;
1930 *result |= bit;
1931 },
1932 Self::partial_quantile_search_u64_unchecked,
1933 )
1934 }
1935
1936 /// Get an iterator over the elements of the encoded sequence.
1937 /// The iterator yields `u64` elements.
1938 /// If the number of bits per element exceeds 64, `None` is returned.
1939 ///
1940 /// # Example
1941 /// ```
1942 /// use vers_vecs::{BitVec, WaveletMatrix};
1943 ///
1944 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1945 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1946 ///
1947 /// let mut iter = wavelet_matrix.iter_u64().unwrap();
1948 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 4, 4, 1, 2, 7]);
1949 /// ```
1950 #[must_use]
1951 pub fn iter_u64(&self) -> Option<WaveletNumRefIter<'_>> {
1952 if self.bits_per_element() > 64 {
1953 None
1954 } else {
1955 Some(WaveletNumRefIter::new(self))
1956 }
1957 }
1958
1959 /// Turn the encoded sequence into an iterator.
1960 /// The iterator yields `u64` elements.
1961 /// If the number of bits per element exceeds 64, `None` is returned.
1962 #[must_use]
1963 pub fn into_iter_u64(self) -> Option<WaveletNumIter> {
1964 if self.bits_per_element() > 64 {
1965 None
1966 } else {
1967 Some(WaveletNumIter::new(self))
1968 }
1969 }
1970
1971 /// Get an iterator over the sorted elements of the encoded sequence.
1972 /// The iterator yields `BitVec` elements.
1973 ///
1974 /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements.
1975 ///
1976 /// [`iter_sorted_u64`]: Self::iter_sorted_u64
1977 #[must_use]
1978 pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> {
1979 WaveletSortedRefIter::new(self)
1980 }
1981
1982 /// Turn the encoded sequence into an iterator over the sorted sequence.
1983 /// The iterator yields `BitVec` elements.
1984 #[must_use]
1985 pub fn into_iter_sorted(self) -> WaveletSortedIter {
1986 WaveletSortedIter::new(self)
1987 }
1988
1989 /// Get an iterator over the sorted elements of the encoded sequence.
1990 /// The iterator yields `u64` elements.
1991 /// If the number of bits per element exceeds 64, `None` is returned.
1992 ///
1993 /// # Example
1994 /// ```
1995 /// use vers_vecs::{BitVec, WaveletMatrix};
1996 ///
1997 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1998 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1999 ///
2000 /// let mut iter = wavelet_matrix.iter_sorted_u64().unwrap();
2001 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 1, 2, 4, 4, 7]);
2002 /// ```
2003 #[must_use]
2004 pub fn iter_sorted_u64(&self) -> Option<WaveletSortedNumRefIter<'_>> {
2005 if self.bits_per_element() > 64 {
2006 None
2007 } else {
2008 Some(WaveletSortedNumRefIter::new(self))
2009 }
2010 }
2011
2012 /// Turn the encoded sequence into an iterator over the sorted sequence.
2013 /// The iterator yields `u64` elements.
2014 /// If the number of bits per element exceeds 64, `None` is returned.
2015 #[must_use]
2016 pub fn into_iter_sorted_u64(self) -> Option<WaveletSortedNumIter> {
2017 if self.bits_per_element() > 64 {
2018 None
2019 } else {
2020 Some(WaveletSortedNumIter::new(self))
2021 }
2022 }
2023
2024 /// Get the number of bits per element in the alphabet of the encoded sequence.
2025 #[must_use]
2026 #[inline(always)]
2027 pub fn bits_per_element(&self) -> usize {
2028 self.data.len()
2029 }
2030
2031 /// Get the number of bits per element in the alphabet of the encoded sequence.
2032 #[must_use]
2033 #[deprecated(since = "1.5.1", note = "please use `bits_per_element` instead")]
2034 #[allow(clippy::cast_possible_truncation)]
2035 pub fn bit_len(&self) -> u16 {
2036 self.bits_per_element() as u16
2037 }
2038
2039 /// Get the number of elements stored in the encoded sequence.
2040 #[must_use]
2041 pub fn len(&self) -> usize {
2042 if self.data.is_empty() {
2043 0
2044 } else {
2045 self.data[0].len()
2046 }
2047 }
2048
2049 /// Check if the wavelet matrix is empty.
2050 #[must_use]
2051 pub fn is_empty(&self) -> bool {
2052 self.len() == 0
2053 }
2054
2055 /// Get the number of bytes allocated on the heap for the wavelet matrix.
2056 /// This does not include memory that is allocated but unused due to allocation policies of
2057 /// internal data structures.
2058 #[must_use]
2059 pub fn heap_size(&self) -> usize {
2060 self.data.iter().map(RsVec::heap_size).sum::<usize>()
2061 }
2062}
2063
2064impl_vector_iterator!(
2065 WaveletMatrix,
2066 WaveletIter,
2067 WaveletRefIter,
2068 get_value_unchecked,
2069 get_value,
2070 BitVec
2071);
2072
2073impl_vector_iterator!(
2074 WaveletMatrix,
2075 WaveletNumIter,
2076 WaveletNumRefIter,
2077 get_u64_unchecked,
2078 get_u64,
2079 u64,
2080 special
2081);
2082
2083impl_vector_iterator!(
2084 WaveletMatrix,
2085 WaveletSortedIter,
2086 WaveletSortedRefIter,
2087 get_sorted_unchecked,
2088 get_sorted,
2089 BitVec,
2090 special
2091);
2092
2093impl_vector_iterator!(
2094 WaveletMatrix,
2095 WaveletSortedNumIter,
2096 WaveletSortedNumRefIter,
2097 get_sorted_u64_unchecked,
2098 get_sorted_u64,
2099 u64,
2100 special
2101);
2102
2103#[cfg(test)]
2104mod tests;