vers_vecs/wavelet/mod.rs
1//! This module contains the implementation of a [wavelet matrix].
2//! The wavelet matrix is a data structure that encodes a sequence of `n` `k`-bit words
3//! into a matrix of bit vectors, allowing for fast rank and select queries on the encoded sequence.
4//!
5//! This implementation further supports quantile queries, and range-predecessor and -successor queries.
6//! All operations are `O(k)` time complexity.
7//!
8//! See the struct documentation for more information.
9//!
10//! [wavelet matrix]: WaveletMatrix
11
12use crate::util::impl_vector_iterator;
13use crate::{BitVec, RsVec};
14use std::mem;
15use std::ops::Range;
16
17/// A wavelet matrix implementation implemented as described in
18/// [Navarro and Claude, 2021](http://dx.doi.org/10.1007/978-3-642-34109-0_18).
19/// The implementation is designed to allow for extremely large alphabet sizes, without
20/// sacrificing performance for small alphabets.
21///
22/// There are two constructor algorithms available:
23/// - [`from_bit_vec`] and [`from_slice`] construct the wavelet matrix by repeatedly sorting the elements.
24/// These constructors have linear space overhead and run in `O(kn * log n)` time complexity.
25/// - [`from_bit_vec_pc`] and [`from_slice_pc`] construct the wavelet matrix by counting the
26/// prefixes of the elements. These constructors have a space complexity of `O(2^k)` and run
27/// in `O(kn)`, which makes this constructor preferable for large sequences over small alphabets.
28///
29/// They encode a sequence of `n` `k`-bit words into a wavelet matrix which supports constant-time
30/// rank and select queries on elements of its `k`-bit alphabet.
31/// All query functions are mirrored for both `BitVec` and `u64` query elements, so
32/// if `k <= 64`, no heap allocation is needed for the query element.
33///
34/// Other than rank and select queries, the matrix also supports quantile queries (range select i), and
35/// range-predecessor and -successor queries, all of which are loosely based on
36/// [Külekci and Thankachan](https://doi.org/10.1016/j.jda.2017.01.002) with better time complexity.
37///
38/// All operations implemented on the matrix are `O(k)` time complexity.
39/// The space complexity of the wavelet matrix is `O(n * k)` with a small linear overhead
40/// (see [`RsVec`]).
41///
42/// # Examples
43/// ```
44/// use vers_vecs::{BitVec, WaveletMatrix};
45///
46/// // pack elements from a 3-bit alphabet into a bit vector and construct a wavelet matrix from them
47/// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
48/// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
49///
50/// // query the wavelet matrix
51/// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
52/// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
53///
54// // rank and select queries
55/// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
56/// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
57/// assert_eq!(wavelet_matrix.select_u64(0, 7), Some(5));
58///
59/// // statistics
60/// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
61/// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 3), Some(2));
62/// ```
63///
64/// [`RsVec`]: RsVec
65/// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
66/// [`from_slice`]: WaveletMatrix::from_slice
67/// [`from_bit_vec_pc`]: WaveletMatrix::from_bit_vec_pc
68/// [`from_slice_pc`]: WaveletMatrix::from_slice_pc
69#[derive(Clone, Debug)]
70#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
71#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemSize, mem_dbg::MemDbg))]
72pub struct WaveletMatrix {
73 data: Box<[RsVec]>,
74}
75
76impl WaveletMatrix {
77 /// Generic constructor that constructs the wavelet matrix by repeatedly sorting the elements.
78 /// The runtime complexity is `O(kn * log n)`.
79 ///
80 /// # Parameters
81 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 1 << 16.
82 /// - `num_elements`: The number of elements in the sequence.
83 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
84 #[inline(always)] // should get rid of closures in favor of static calls
85 fn permutation_sorting<LOOKUP: Fn(usize, usize) -> u64>(
86 bits_per_element: u16,
87 num_elements: usize,
88 bit_lookup: LOOKUP,
89 ) -> Self {
90 let element_len = bits_per_element as usize;
91
92 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
93
94 // insert the first bit of each word into the first bit vector
95 // for each following level, insert the next bit of each word into the next bit vector
96 // sorted stably by the previous bit vector
97 let mut permutation = (0..num_elements).collect::<Vec<_>>();
98 let mut next_permutation = vec![0; num_elements];
99
100 for (level, data) in data.iter_mut().enumerate() {
101 let mut total_zeros = 0;
102 for (i, p) in permutation.iter().enumerate() {
103 if bit_lookup(*p, element_len - level - 1) == 0 {
104 total_zeros += 1;
105 } else {
106 data.set(i, 1).unwrap();
107 }
108 }
109
110 // scan through the generated bit array and move the elements to the correct position
111 // for the next permutation
112 if level < element_len - 1 {
113 let mut zero_boundary = 0;
114 let mut one_boundary = total_zeros;
115 for (i, p) in permutation.iter().enumerate() {
116 if data.get_unchecked(i) == 0 {
117 next_permutation[zero_boundary] = *p;
118 zero_boundary += 1;
119 } else {
120 next_permutation[one_boundary] = *p;
121 one_boundary += 1;
122 }
123 }
124
125 mem::swap(&mut permutation, &mut next_permutation);
126 }
127 }
128
129 Self {
130 data: data.into_iter().map(BitVec::into).collect(),
131 }
132 }
133
134 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
135 /// The constructor runs in `O(kn * log n)` time complexity.
136 ///
137 /// # Parameters
138 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
139 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
140 /// significant bit.
141 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
142 ///
143 /// # Panics
144 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element.
145 #[must_use]
146 pub fn from_bit_vec(bit_vec: &BitVec, bits_per_element: u16) -> Self {
147 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
148 let num_elements = bit_vec.len() / bits_per_element as usize;
149 Self::permutation_sorting(bits_per_element, num_elements, |element, bit| {
150 bit_vec.get_unchecked(element * bits_per_element as usize + bit)
151 })
152 }
153
154 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words.
155 /// The constructor runs in `O(kn * log n)` time complexity.
156 ///
157 /// # Parameters
158 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
159 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
160 ///
161 /// # Panics
162 /// Panics if the number of bits per element exceeds 64.
163 #[must_use]
164 pub fn from_slice(sequence: &[u64], bits_per_element: u16) -> Self {
165 assert!(
166 bits_per_element <= 64,
167 "The number of bits per element cannot exceed 64."
168 );
169 Self::permutation_sorting(bits_per_element, sequence.len(), |element, bit| {
170 (sequence[element] >> bit) & 1
171 })
172 }
173
174 /// Generic constructor that constructs the wavelet matrix by counting the prefixes of the elements.
175 /// The runtime complexity is `O(kn)`.
176 /// This constructor is only recommended for small alphabets.
177 ///
178 /// # Parameters
179 /// - `bits_per_element`: The number of bits in each word. Cannot exceed 64.
180 /// - `num_elements`: The number of elements in the sequence.
181 /// - `bit_lookup`: A closure that returns the `bit`-th bit of the `element`-th word.
182 /// - `element_lookup`: A closure that returns the `element`-th word.
183 #[inline(always)] // should get rid of closures in favor of static calls
184 fn prefix_counting<LOOKUP: Fn(usize, usize) -> u64, ELEMENT: Fn(usize) -> u64>(
185 bits_per_element: u16,
186 num_elements: usize,
187 bit_lookup: LOOKUP,
188 element_lookup: ELEMENT,
189 ) -> Self {
190 let element_len = bits_per_element as usize;
191 let mut histogram = vec![0usize; 1 << bits_per_element];
192 let mut borders = vec![0usize; 1 << bits_per_element];
193 let mut data = vec![BitVec::from_zeros(num_elements); element_len];
194
195 for i in 0..num_elements {
196 histogram[element_lookup(i) as usize] += 1;
197 data[0].set_unchecked(i, bit_lookup(i, element_len - 1));
198 }
199
200 for level in (1..element_len).rev() {
201 // combine histograms of prefixes
202 for h in 0..1 << level {
203 histogram[h] = histogram[2 * h] + histogram[2 * h + 1];
204 }
205
206 // compute borders of current level, using bit reverse patterns because of the weird
207 // node ordering in wavelet matrices
208 borders[0] = 0;
209 for h in 1usize..1 << level {
210 let h_minus_1 = (h - 1).reverse_bits() >> (64 - level);
211 borders[h.reverse_bits() >> (64 - level)] =
212 borders[h_minus_1] + histogram[h_minus_1];
213 }
214
215 for i in 0..num_elements {
216 let bit = bit_lookup(i, element_len - level - 1);
217 data[level].set_unchecked(
218 borders[element_lookup(i) as usize >> (element_len - level)],
219 bit,
220 );
221 borders[element_lookup(i) as usize >> (element_len - level)] += 1;
222 }
223 }
224
225 Self {
226 data: data.into_iter().map(BitVec::into).collect(),
227 }
228 }
229
230 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
231 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
232 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
233 /// so it is only recommended for small alphabets.
234 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
235 ///
236 /// # Parameters
237 /// - `bit_vec`: A packed sequence of `n` `k`-bit words. The `i`-th word begins in the
238 /// `bits_per_element * i`-th bit of the bit vector. Words are stored from least to most
239 /// significant bit.
240 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 1 << 16.
241 ///
242 /// # Panics
243 /// Panics if the number of bits in the bit vector is not a multiple of the number of bits per element,
244 /// or if the number of bits per element exceeds 64.
245 ///
246 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
247 /// [`from_slice`]: WaveletMatrix::from_slice
248 #[must_use]
249 pub fn from_bit_vec_pc(bit_vec: &BitVec, bits_per_element: u16) -> Self {
250 assert_eq!(bit_vec.len() % bits_per_element as usize, 0, "The number of bits in the bit vector must be a multiple of the number of bits per element.");
251 assert!(
252 bits_per_element <= 64,
253 "The number of bits per element cannot exceed 64."
254 );
255 let num_elements = bit_vec.len() / bits_per_element as usize;
256 Self::prefix_counting(
257 bits_per_element,
258 num_elements,
259 |element, bit| bit_vec.get_unchecked(element * bits_per_element as usize + bit),
260 |element| {
261 bit_vec.get_bits_unchecked(
262 element * bits_per_element as usize,
263 bits_per_element as usize,
264 )
265 },
266 )
267 }
268
269 /// Create a new wavelet matrix from a sequence of `n` `k`-bit words using the prefix counting
270 /// algorithm [Dinklage et al.](https://doi.org/10.1145/3457197)
271 /// The constructor runs in `O(kn)` time complexity but requires `O(2^k)` space during construction,
272 /// so it is only recommended for small alphabets.
273 /// Use the [`from_bit_vec`] or [`from_slice`] constructors for larger alphabets.
274 ///
275 /// # Parameters
276 /// - `sequence`: A slice of `n` u64 values, each encoding a `k`-bit word.
277 /// - `bits_per_element`: The number `k` of bits in each word. Cannot exceed 64.
278 ///
279 /// # Panics
280 /// Panics if the number of bits per element exceeds 64.
281 ///
282 /// [`from_bit_vec`]: WaveletMatrix::from_bit_vec
283 /// [`from_slice`]: WaveletMatrix::from_slice
284 #[must_use]
285 pub fn from_slice_pc(sequence: &[u64], bits_per_element: u16) -> Self {
286 assert!(
287 bits_per_element <= 64,
288 "The number of bits per element cannot exceed 64."
289 );
290 Self::prefix_counting(
291 bits_per_element,
292 sequence.len(),
293 |element, bit| (sequence[element] >> bit) & 1,
294 |element| sequence[element],
295 )
296 }
297
298 /// Generic function to read a value from the wavelet matrix and consume it with a closure.
299 /// The function is used by the `get_value` and `get_u64` functions, deduplicating code.
300 #[inline(always)]
301 fn reconstruct_value_unchecked<F: FnMut(u64)>(&self, mut i: usize, mut target_func: F) {
302 for level in 0..self.bits_per_element() {
303 let bit = self.data[level].get_unchecked(i);
304 target_func(bit);
305 if bit == 0 {
306 i = self.data[level].rank0(i);
307 } else {
308 i = self.data[level].rank0 + self.data[level].rank1(i);
309 }
310 }
311 }
312
313 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
314 /// The `k`-bit word is returned as a `BitVec`.
315 /// The first element of the bit vector is the least significant bit.
316 ///
317 /// Returns `None` if the index is out of bounds.
318 ///
319 /// # Example
320 /// ```
321 /// use vers_vecs::{BitVec, WaveletMatrix};
322 ///
323 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
324 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
325 ///
326 /// assert_eq!(wavelet_matrix.get_value(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
327 /// assert_eq!(wavelet_matrix.get_value(1), Some(BitVec::pack_sequence_u8(&[4], 3)));
328 /// assert_eq!(wavelet_matrix.get_value(100), None);
329 /// ```
330 #[must_use]
331 pub fn get_value(&self, i: usize) -> Option<BitVec> {
332 if self.data.is_empty() || i >= self.data[0].len() {
333 None
334 } else {
335 Some(self.get_value_unchecked(i))
336 }
337 }
338
339 /// Get the `i`-th element of the encoded sequence in a `k`-bit word.
340 /// The `k`-bit word is returned as a `BitVec`.
341 /// The first element of the bit vector is the least significant bit.
342 /// This function does not perform bounds checking.
343 /// Use [`get_value`] for a checked version.
344 ///
345 /// # Panics
346 /// May panic if the index is out of bounds. May instead return an empty bit vector.
347 ///
348 /// [`get_value`]: WaveletMatrix::get_value
349 #[must_use]
350 pub fn get_value_unchecked(&self, i: usize) -> BitVec {
351 let mut value = BitVec::from_zeros(self.bits_per_element());
352 let mut level = self.bits_per_element() - 1;
353 self.reconstruct_value_unchecked(i, |bit| {
354 value.set_unchecked(level, bit);
355 level = level.saturating_sub(1);
356 });
357 value
358 }
359
360 /// Get the `i`-th element of the encoded sequence as a `u64`.
361 /// The `u64` is constructed from the `k`-bit word stored in the wavelet matrix.
362 ///
363 /// Returns `None` if the index is out of bounds, or if the number of bits per element exceeds 64.
364 ///
365 /// # Example
366 /// ```
367 /// use vers_vecs::{BitVec, WaveletMatrix};
368 ///
369 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
370 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
371 ///
372 /// assert_eq!(wavelet_matrix.get_u64(0), Some(1));
373 /// assert_eq!(wavelet_matrix.get_u64(1), Some(4));
374 /// assert_eq!(wavelet_matrix.get_u64(100), None);
375 /// ```
376 #[must_use]
377 pub fn get_u64(&self, i: usize) -> Option<u64> {
378 if self.bits_per_element() > 64 || self.data.is_empty() || i >= self.data[0].len() {
379 None
380 } else {
381 Some(self.get_u64_unchecked(i))
382 }
383 }
384
385 /// Get the `i`-th element of the encoded sequence as a `u64` numeral.
386 /// The element is encoded in the lowest `k` bits of the numeral.
387 /// If the number of bits per element exceeds 64, the value is truncated.
388 /// This function does not perform bounds checking.
389 /// Use [`get_u64`] for a checked version.
390 ///
391 /// # Panic
392 /// May panic if the value of `i` is out of bounds. May instead return 0.
393 ///
394 /// [`get_u64`]: WaveletMatrix::get_u64
395 #[must_use]
396 pub fn get_u64_unchecked(&self, i: usize) -> u64 {
397 let mut value = 0;
398 self.reconstruct_value_unchecked(i, |bit| {
399 value <<= 1;
400 value |= bit;
401 });
402 value
403 }
404
405 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
406 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
407 /// where the least significant bit is the first element, and `k` is the number of bits per element
408 /// in the wavelet matrix.
409 ///
410 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
411 /// `k`-bit word.
412 /// Use [`rank_range`] for a checked version.
413 ///
414 /// # Panics
415 /// May panic if the `range` is out of bounds,
416 /// or if the number of bits in `symbol` is lower than `k`.
417 /// May instead return 0.
418 ///
419 /// [`BitVec`]: BitVec
420 /// [`rank_range`]: WaveletMatrix::rank_range
421 #[must_use]
422 pub fn rank_range_unchecked(&self, mut range: Range<usize>, symbol: &BitVec) -> usize {
423 for (level, data) in self.data.iter().enumerate() {
424 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
425 range.start = data.rank0(range.start);
426 range.end = data.rank0(range.end);
427 } else {
428 range.start = data.rank0 + data.rank1(range.start);
429 range.end = data.rank0 + data.rank1(range.end);
430 }
431 }
432
433 range.end - range.start
434 }
435
436 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
437 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
438 /// where the least significant bit is the first element, and `k` is the number of bits per element
439 /// in the wavelet matrix.
440 ///
441 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
442 /// but since it is exclusive, it may be equal to the length),
443 /// or if the number of bits in `symbol` is not equal to `k`.
444 ///
445 /// # Example
446 /// ```
447 /// use vers_vecs::{BitVec, WaveletMatrix};
448 ///
449 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
450 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
451 ///
452 /// assert_eq!(wavelet_matrix.rank_range(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
453 /// assert_eq!(wavelet_matrix.rank_range(2..4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
454 /// ```
455 ///
456 /// [`BitVec`]: BitVec
457 #[must_use]
458 pub fn rank_range(&self, range: Range<usize>, symbol: &BitVec) -> Option<usize> {
459 if range.start >= self.len()
460 || range.end > self.len()
461 || symbol.len() != self.bits_per_element()
462 {
463 None
464 } else {
465 Some(self.rank_range_unchecked(range, symbol))
466 }
467 }
468
469 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
470 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
471 /// where k is less than or equal to 64.
472 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
473 ///
474 /// This method does not perform bounds checking, nor does it check if the elements of the
475 /// wavelet matrix can be represented in a u64 numeral.
476 /// Use [`rank_range_u64`] for a checked version.
477 ///
478 /// # Panics
479 /// May panic if the `range` is out of bounds.
480 /// May instead return 0.
481 /// If the number of bits in wavelet matrix elements exceed `64`, the behavior is
482 /// platform-dependent.
483 ///
484 /// [`rank_range_u64`]: WaveletMatrix::rank_range_u64
485 #[must_use]
486 pub fn rank_range_u64_unchecked(&self, mut range: Range<usize>, symbol: u64) -> usize {
487 for (level, data) in self.data.iter().enumerate() {
488 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
489 range.start = data.rank0(range.start);
490 range.end = data.rank0(range.end);
491 } else {
492 range.start = data.rank0 + data.rank1(range.start);
493 range.end = data.rank0 + data.rank1(range.end);
494 }
495 }
496
497 range.end - range.start
498 }
499
500 /// Get the number of occurrences of the given `symbol` in the encoded sequence in the `range`.
501 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
502 /// where k is less than or equal to 64.
503 /// The interval is half-open, meaning `rank_range_u64(0..0, symbol)` returns 0.
504 ///
505 /// Returns `None` if the `range` is out of bounds (greater than the length of the encoded sequence,
506 /// but since it is exclusive, it may be equal to the length),
507 /// or if the number of bits in the wavelet matrix elements exceed `64`.
508 ///
509 /// # Example
510 /// ```
511 /// use vers_vecs::{BitVec, WaveletMatrix};
512 ///
513 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
514 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
515 ///
516 /// assert_eq!(wavelet_matrix.rank_range_u64(0..3, 4), Some(2));
517 /// assert_eq!(wavelet_matrix.rank_range_u64(2..4, 4), Some(1));
518 /// ```
519 #[must_use]
520 pub fn rank_range_u64(&self, range: Range<usize>, symbol: u64) -> Option<usize> {
521 if range.start >= self.len() || range.end > self.len() || self.bits_per_element() > 64 {
522 None
523 } else {
524 Some(self.rank_range_u64_unchecked(range, symbol))
525 }
526 }
527
528 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
529 /// `offset`-th and `i`-th element (exclusive).
530 /// This is equivalent to ```rank_range_unchecked(offset..i, symbol)```.
531 /// The interval is half-open, meaning ```rank_offset_unchecked(0, 0, symbol)``` returns 0.
532 ///
533 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
534 /// where the least significant bit is the first element, and `k` is the number of bits per element
535 /// in the wavelet matrix.
536 ///
537 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
538 /// `k`-bit word.
539 /// Use [`rank_offset`] for a checked version.
540 ///
541 /// # Panics
542 /// May panic if `offset` is out of bounds,
543 /// or if `offset + i` is larger than the length of the encoded sequence,
544 /// or if `offset` is greater than `i`,
545 /// or if the number of bits in `symbol` is lower than `k`.
546 /// May instead return 0.
547 ///
548 /// [`BitVec`]: BitVec
549 /// [`rank_offset`]: WaveletMatrix::rank_offset
550 #[must_use]
551 pub fn rank_offset_unchecked(&self, offset: usize, i: usize, symbol: &BitVec) -> usize {
552 self.rank_range_unchecked(offset..i, symbol)
553 }
554
555 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
556 /// `offset`-th and `i`-th element (exclusive).
557 /// This is equivalent to ``rank_range(offset..i, symbol)``.
558 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0,
559 /// because the interval is empty.
560 ///
561 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
562 /// where the least significant bit is the first element, and `k` is the number of bits per element
563 /// in the wavelet matrix.
564 ///
565 /// Returns `None` if `offset` is out of bounds,
566 /// or if `i` is larger than the length of the encoded sequence,
567 /// or if `offset` is greater than `i`,
568 /// or if the number of bits in `symbol` is not equal to `k`.
569 /// `i` may equal the length of the encoded sequence,
570 /// which will return the number of occurrences of the symbol up to the end of the sequence.
571 ///
572 /// # Example
573 /// ```
574 /// use vers_vecs::{BitVec, WaveletMatrix};
575 ///
576 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
577 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
578 ///
579 /// assert_eq!(wavelet_matrix.rank_offset(0, 3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
580 /// assert_eq!(wavelet_matrix.rank_offset(2, 4, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
581 /// ```
582 ///
583 /// [`BitVec`]: BitVec
584 #[must_use]
585 pub fn rank_offset(&self, offset: usize, i: usize, symbol: &BitVec) -> Option<usize> {
586 if offset > i
587 || offset >= self.len()
588 || i > self.len()
589 || symbol.len() != self.bits_per_element()
590 {
591 None
592 } else {
593 Some(self.rank_offset_unchecked(offset, i, symbol))
594 }
595 }
596
597 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
598 /// `offset`-th and `i`-th element (exclusive).
599 /// This is equivalent to ``rank_range(offset..i, symbol)``.
600 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
601 ///
602 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
603 /// where k is less than or equal to 64.
604 ///
605 /// This method does not perform bounds checking, nor does it check if the elements of the
606 /// wavelet matrix can be represented in a u64 numeral.
607 /// Use [`rank_offset_u64`] for a checked version.
608 ///
609 /// # Panics
610 /// May panic if `offset` is out of bounds,
611 /// or if `i` is larger than the length of the encoded sequence,
612 /// or if `offset` is greater than `i`,
613 /// or if the number of bits in wavelet matrix elements exceed `64`.
614 /// May instead return 0.
615 ///
616 /// [`rank_offset_u64`]: WaveletMatrix::rank_offset_u64
617 #[must_use]
618 pub fn rank_offset_u64_unchecked(&self, offset: usize, i: usize, symbol: u64) -> usize {
619 self.rank_range_u64_unchecked(offset..i, symbol)
620 }
621
622 /// Get the number of occurrences of the given `symbol` in the encoded sequence between the
623 /// `offset`-th and `i`-th element (exclusive).
624 /// This is equivalent to ``rank_range(offset..i, symbol)``.
625 /// The interval is half-open, meaning ``rank_offset(0, 0, symbol)`` returns 0.
626 ///
627 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
628 /// where k is less than or equal to 64.
629 ///
630 /// Returns `None` if `offset` is out of bounds,
631 /// or if `i` is larger than the length of the encoded sequence,
632 /// or if `offset` is greater than `i`,
633 /// or if the number of bits in the wavelet matrix elements exceed `64`.
634 /// `i` may equal the length of the encoded sequence,
635 /// which will return the number of occurrences of the symbol up to the end of the sequence.
636 ///
637 /// # Example
638 /// ```
639 /// use vers_vecs::{BitVec, WaveletMatrix};
640 ///
641 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
642 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
643 ///
644 /// assert_eq!(wavelet_matrix.rank_offset_u64(0, 3, 4), Some(2));
645 /// assert_eq!(wavelet_matrix.rank_offset_u64(2, 4, 4), Some(1));
646 /// ```
647 #[must_use]
648 pub fn rank_offset_u64(&self, offset: usize, i: usize, symbol: u64) -> Option<usize> {
649 if offset > i || offset >= self.len() || i > self.len() || self.bits_per_element() > 64 {
650 None
651 } else {
652 Some(self.rank_offset_u64_unchecked(offset, i, symbol))
653 }
654 }
655
656 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
657 /// element (exclusive).
658 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
659 /// where the least significant bit is the first element, and `k` is the number of bits per element
660 /// in the wavelet matrix.
661 ///
662 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
663 /// `k`-bit word.
664 /// Use [`rank`] for a checked version.
665 ///
666 /// # Panics
667 /// May panic if `i` is out of bounds, or if the number of bits in `symbol` is lower than `k`.
668 /// May instead return 0.
669 /// If the number of bits in `symbol` exceeds `k`, the remaining bits are ignored.
670 ///
671 /// [`BitVec`]: BitVec
672 /// [`rank`]: WaveletMatrix::rank
673 #[must_use]
674 pub fn rank_unchecked(&self, i: usize, symbol: &BitVec) -> usize {
675 self.rank_range_unchecked(0..i, symbol)
676 }
677
678 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
679 /// element (exclusive).
680 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
681 /// where the least significant bit is the first element, and `k` is the number of bits per element
682 /// in the wavelet matrix.
683 ///
684 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
685 /// since it is exclusive, it may be equal to the length),
686 /// or if the number of bits in `symbol` is not equal to `k`.
687 ///
688 /// # Example
689 /// ```
690 /// use vers_vecs::{BitVec, WaveletMatrix};
691 ///
692 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
693 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
694 ///
695 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
696 /// assert_eq!(wavelet_matrix.rank(3, &BitVec::pack_sequence_u8(&[1], 3)), Some(1));
697 /// ```
698 ///
699 /// [`BitVec`]: BitVec
700 #[must_use]
701 pub fn rank(&self, i: usize, symbol: &BitVec) -> Option<usize> {
702 if i > self.len() || symbol.len() != self.bits_per_element() {
703 None
704 } else {
705 Some(self.rank_range_unchecked(0..i, symbol))
706 }
707 }
708
709 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
710 /// element (exclusive).
711 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
712 /// where k is less than or equal to 64.
713 ///
714 /// This method does not perform bounds checking, nor does it check if the elements of the
715 /// wavelet matrix can be represented in a u64 numeral.
716 /// Use [`rank_u64`] for a checked version.
717 ///
718 /// # Panics
719 /// May panic if `i` is out of bounds,
720 /// or if the number of bits in wavelet matrix elements exceed `64`.
721 /// May instead return 0.
722 ///
723 /// [`rank_u64`]: WaveletMatrix::rank_u64
724 #[must_use]
725 pub fn rank_u64_unchecked(&self, i: usize, symbol: u64) -> usize {
726 self.rank_range_u64_unchecked(0..i, symbol)
727 }
728
729 /// Get the number of occurrences of the given `symbol` in the encoded sequence up to the `i`-th
730 /// element (exclusive).
731 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
732 /// where k is less than or equal to 64.
733 ///
734 /// Returns `None` if `i` is out of bounds (greater than the length of the encoded sequence, but
735 /// since it is exclusive, it may be equal to the length),
736 /// or if the number of bits in the wavelet matrix elements exceed `64`.
737 ///
738 /// # Example
739 /// ```
740 /// use vers_vecs::{BitVec, WaveletMatrix};
741 ///
742 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
743 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
744 ///
745 /// assert_eq!(wavelet_matrix.rank_u64(3, 4), Some(2));
746 /// assert_eq!(wavelet_matrix.rank_u64(3, 1), Some(1));
747 /// ```
748 #[must_use]
749 pub fn rank_u64(&self, i: usize, symbol: u64) -> Option<usize> {
750 if i > self.len() || self.bits_per_element() > 64 {
751 None
752 } else {
753 Some(self.rank_range_u64_unchecked(0..i, symbol))
754 }
755 }
756
757 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
758 /// starting from the `offset`-th element.
759 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
760 /// where the least significant bit is the first element, and `k` is the number of bits per element
761 /// in the wavelet matrix.
762 ///
763 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
764 /// `k`-bit word.
765 /// Use [`select_offset`] for a checked version.
766 ///
767 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
768 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
769 ///
770 /// # Panics
771 /// May panic if the `offset` is out of bounds,
772 /// or if the number of bits in `symbol` is lower than `k`.
773 /// May instead return the length of the encoded sequence.
774 ///
775 /// [`BitVec`]: BitVec
776 /// [`select_offset`]: WaveletMatrix::select_offset
777 #[must_use]
778 pub fn select_offset_unchecked(&self, offset: usize, rank: usize, symbol: &BitVec) -> usize {
779 let mut range_start = offset;
780
781 for (level, data) in self.data.iter().enumerate() {
782 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
783 range_start = data.rank0(range_start);
784 } else {
785 range_start = data.rank0 + data.rank1(range_start);
786 }
787 }
788
789 let mut range_end = range_start + rank;
790
791 for (level, data) in self.data.iter().enumerate().rev() {
792 if symbol.get_unchecked((self.bits_per_element() - 1) - level) == 0 {
793 range_end = data.select0(range_end);
794 } else {
795 range_end = data.select1(range_end - data.rank0);
796 }
797 }
798
799 range_end
800 }
801
802 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
803 /// starting from the `offset`-th element.
804 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
805 /// where the least significant bit is the first element, and `k` is the number of bits per element
806 /// in the wavelet matrix.
807 ///
808 /// Returns `None` if `offset` is out of bounds, or if the number of bits in `symbol` is not equal to `k`,
809 /// or if the `rank`-th occurrence of the `symbol` does not exist.
810 ///
811 /// # Example
812 /// ```
813 /// use vers_vecs::{BitVec, WaveletMatrix};
814 ///
815 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
816 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
817 ///
818 /// assert_eq!(wavelet_matrix.select_offset(0, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
819 /// assert_eq!(wavelet_matrix.select_offset(0, 1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
820 /// assert_eq!(wavelet_matrix.select_offset(2, 0, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
821 /// assert_eq!(wavelet_matrix.select_offset(2, 1, &BitVec::pack_sequence_u8(&[4], 3)), None);
822 /// ```
823 ///
824 /// [`BitVec`]: BitVec
825 #[must_use]
826 pub fn select_offset(&self, offset: usize, rank: usize, symbol: &BitVec) -> Option<usize> {
827 if offset >= self.len() || symbol.len() != self.bits_per_element() {
828 None
829 } else {
830 let idx = self.select_offset_unchecked(offset, rank, symbol);
831 if idx < self.len() {
832 Some(idx)
833 } else {
834 None
835 }
836 }
837 }
838
839 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
840 /// starting from the `offset`-th element.
841 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
842 /// where k is less than or equal to 64.
843 ///
844 /// This method does not perform bounds checking, nor does it check if the elements of the
845 /// wavelet matrix can be represented in a u64 numeral.
846 /// Use [`select_offset_u64`] for a checked version.
847 ///
848 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
849 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
850 ///
851 /// # Panics
852 /// May panic if the `offset` is out of bounds,
853 /// or if the number of bits in wavelet matrix elements exceed `64`.
854 /// May instead return the length of the encoded sequence.
855 ///
856 /// [`select_offset_u64`]: WaveletMatrix::select_offset_u64
857 #[must_use]
858 pub fn select_offset_u64_unchecked(&self, offset: usize, rank: usize, symbol: u64) -> usize {
859 let mut range_start = offset;
860
861 for (level, data) in self.data.iter().enumerate() {
862 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
863 range_start = data.rank0(range_start);
864 } else {
865 range_start = data.rank0 + data.rank1(range_start);
866 }
867 }
868
869 let mut range_end = range_start + rank;
870
871 for (level, data) in self.data.iter().enumerate().rev() {
872 if (symbol >> ((self.bits_per_element() - 1) - level)) & 1 == 0 {
873 range_end = data.select0(range_end);
874 } else {
875 range_end = data.select1(range_end - data.rank0);
876 }
877 }
878
879 range_end
880 }
881
882 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence,
883 /// starting from the `offset`-th element.
884 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
885 /// where k is less than or equal to 64.
886 ///
887 /// Returns `None` if `offset` is out of bounds, or if the number of bits in the wavelet matrix
888 /// elements exceed `64`, or if the `rank`-th occurrence of the `symbol` does not exist.
889 ///
890 /// # Example
891 /// ```
892 /// use vers_vecs::{BitVec, WaveletMatrix};
893 ///
894 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
895 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
896 ///
897 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 0, 4), Some(1));
898 /// assert_eq!(wavelet_matrix.select_offset_u64(0, 1, 4), Some(2));
899 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 0, 4), Some(2));
900 /// assert_eq!(wavelet_matrix.select_offset_u64(2, 1, 4), None);
901 /// ```
902 #[must_use]
903 pub fn select_offset_u64(&self, offset: usize, rank: usize, symbol: u64) -> Option<usize> {
904 if offset >= self.len() || self.bits_per_element() > 64 {
905 None
906 } else {
907 let idx = self.select_offset_u64_unchecked(offset, rank, symbol);
908 if idx < self.len() {
909 Some(idx)
910 } else {
911 None
912 }
913 }
914 }
915
916 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
917 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
918 /// where the least significant bit is the first element, and `k` is the number of bits per element
919 /// in the wavelet matrix.
920 ///
921 /// This method does not perform bounds checking, nor does it check if the `symbol` is a valid
922 /// `k`-bit word.
923 /// Use [`select`] for a checked version.
924 ///
925 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
926 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
927 ///
928 /// # Panics
929 /// May panic if the number of bits in `symbol` is not equal to `k`.
930 /// May instead return the length of the encoded sequence.
931 ///
932 /// [`BitVec`]: BitVec
933 /// [`select`]: WaveletMatrix::select
934 #[must_use]
935 pub fn select_unchecked(&self, rank: usize, symbol: &BitVec) -> usize {
936 self.select_offset_unchecked(0, rank, symbol)
937 }
938
939 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
940 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
941 /// where the least significant bit is the first element, and `k` is the number of bits per element
942 /// in the wavelet matrix.
943 ///
944 /// Returns `None` if the number of bits in `symbol` is not equal to `k`,
945 /// or if the `rank`-th occurrence of the `symbol` does not exist.
946 ///
947 /// # Example
948 /// ```
949 /// use vers_vecs::{BitVec, WaveletMatrix};
950 ///
951 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
952 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
953 ///
954 /// assert_eq!(wavelet_matrix.select(0, &BitVec::pack_sequence_u8(&[4], 3)), Some(1));
955 /// assert_eq!(wavelet_matrix.select(1, &BitVec::pack_sequence_u8(&[4], 3)), Some(2));
956 /// ```
957 ///
958 /// [`BitVec`]: BitVec
959 #[must_use]
960 pub fn select(&self, rank: usize, symbol: &BitVec) -> Option<usize> {
961 if symbol.len() == self.bits_per_element() {
962 let idx = self.select_unchecked(rank, symbol);
963 if idx < self.len() {
964 Some(idx)
965 } else {
966 None
967 }
968 } else {
969 None
970 }
971 }
972
973 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
974 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
975 /// where k is less than or equal to 64.
976 ///
977 /// This method does not perform bounds checking, nor does it check if the elements of the
978 /// wavelet matrix can be represented in a u64 numeral.
979 /// Use [`select_u64`] for a checked version.
980 ///
981 /// Returns the index of the `rank`-th occurrence of the `symbol` in the encoded sequence,
982 /// or the length of the encoded sequence if the `rank`-th occurrence does not exist.
983 ///
984 /// # Panics
985 /// May panic if the number of bits in wavelet matrix elements exceed `64`.
986 /// May instead return the length of the encoded sequence.
987 ///
988 /// [`select_u64`]: WaveletMatrix::select_u64
989 #[must_use]
990 pub fn select_u64_unchecked(&self, rank: usize, symbol: u64) -> usize {
991 self.select_offset_u64_unchecked(0, rank, symbol)
992 }
993
994 /// Get the index of the `rank`-th occurrence of the given `symbol` in the encoded sequence.
995 /// The `symbol` is a `k`-bit word encoded in a u64 numeral,
996 /// where k is less than or equal to 64.
997 ///
998 /// Returns `None` if the number of bits in the wavelet matrix elements exceed `64`,
999 /// or if the `rank`-th occurrence of the `symbol` does not exist.
1000 ///
1001 /// # Example
1002 /// ```
1003 /// use vers_vecs::{BitVec, WaveletMatrix};
1004 ///
1005 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1006 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1007 ///
1008 /// assert_eq!(wavelet_matrix.select_u64(0, 4), Some(1));
1009 /// assert_eq!(wavelet_matrix.select_u64(1, 4), Some(2));
1010 /// ```
1011 #[must_use]
1012 pub fn select_u64(&self, rank: usize, symbol: u64) -> Option<usize> {
1013 if self.bits_per_element() > 64 {
1014 None
1015 } else {
1016 let idx = self.select_u64_unchecked(rank, symbol);
1017 if idx < self.len() {
1018 Some(idx)
1019 } else {
1020 None
1021 }
1022 }
1023 }
1024
1025 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1026 /// where `k = 0` returns the smallest element.
1027 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1028 /// The `k`-th smallest element is returned as a `BitVec`,
1029 /// where the least significant bit is the first element.
1030 ///
1031 /// This method does not perform bounds checking.
1032 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1033 /// Use [`quantile`] for a checked version.
1034 ///
1035 /// # Panics
1036 /// May panic if the `range` is out of bounds. May instead return an empty bit vector.
1037 ///
1038 /// [`quantile`]: WaveletMatrix::quantile
1039 #[must_use]
1040 pub fn quantile_unchecked(&self, range: Range<usize>, k: usize) -> BitVec {
1041 let result = BitVec::from_zeros(self.bits_per_element());
1042
1043 self.partial_quantile_search_unchecked(range, k, 0, result)
1044 }
1045
1046 /// Internal function to reuse the quantile code for the predecessor and successor search.
1047 /// This function performs the quantile search starting at the given level with the given prefix.
1048 ///
1049 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1050 /// and that the prefix is a valid prefix for the given level.
1051 #[inline(always)]
1052 fn partial_quantile_search_unchecked(
1053 &self,
1054 mut range: Range<usize>,
1055 mut k: usize,
1056 start_level: usize,
1057 mut prefix: BitVec,
1058 ) -> BitVec {
1059 debug_assert!(prefix.len() == self.bits_per_element());
1060 debug_assert!(!range.is_empty());
1061 debug_assert!(range.end <= self.len());
1062
1063 for (level, data) in self.data.iter().enumerate().skip(start_level) {
1064 let zeros_start = data.rank0(range.start);
1065 let zeros_end = data.rank0(range.end);
1066 let zeros = zeros_end - zeros_start;
1067
1068 // if k < zeros, the element is among the zeros
1069 if k < zeros {
1070 range.start = zeros_start;
1071 range.end = zeros_end;
1072 } else {
1073 // the element is among the ones, so we set the bit to 1, and move the range
1074 // into the 1-partition of the next level
1075 prefix.set_unchecked((self.bits_per_element() - 1) - level, 1);
1076 k -= zeros;
1077 range.start = data.rank0 + (range.start - zeros_start); // range.start - zeros_start is the rank1 of range.start
1078 range.end = data.rank0 + (range.end - zeros_end); // same here
1079 }
1080 }
1081
1082 prefix
1083 }
1084
1085 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1086 /// where `k = 0` returns the smallest element.
1087 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1088 /// The `k`-th smallest element is returned as a [`BitVec`],
1089 /// where the least significant bit is the first element.
1090 ///
1091 /// Returns `None` if the `range` is out of bounds, or if `k` is greater than the size of the range.
1092 ///
1093 /// # Example
1094 /// ```
1095 /// use vers_vecs::{BitVec, WaveletMatrix};
1096 ///
1097 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1098 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1099 ///
1100 /// assert_eq!(wavelet_matrix.quantile(0..3, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1101 /// assert_eq!(wavelet_matrix.quantile(0..3, 1), Some(BitVec::pack_sequence_u8(&[4], 3)));
1102 /// assert_eq!(wavelet_matrix.quantile(1..4, 0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1103 /// ```
1104 #[must_use]
1105 pub fn quantile(&self, range: Range<usize>, k: usize) -> Option<BitVec> {
1106 if range.start >= self.len() || range.end > self.len() || k >= range.end - range.start {
1107 None
1108 } else {
1109 Some(self.quantile_unchecked(range, k))
1110 }
1111 }
1112
1113 /// Get the `i`-th smallest element in the entire wavelet matrix.
1114 /// The `i`-th smallest element is returned as a `BitVec`,
1115 /// where the least significant bit is the first element.
1116 ///
1117 /// This method does not perform bounds checking.
1118 /// Use [`get_sorted`] for a checked version.
1119 ///
1120 /// # Panics
1121 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1122 ///
1123 /// [`get_sorted`]: Self::get_sorted
1124 #[must_use]
1125 pub fn get_sorted_unchecked(&self, i: usize) -> BitVec {
1126 self.quantile_unchecked(0..self.len(), i)
1127 }
1128
1129 /// Get the `i`-th smallest element in the wavelet matrix.
1130 /// The `i`-th smallest element is returned as a `BitVec`,
1131 /// where the least significant bit is the first element.
1132 /// This method call is equivalent to `self.quantile(0..self.len(), i)`.
1133 ///
1134 /// Returns `None` if the `i` is out of bounds.
1135 ///
1136 /// # Example
1137 /// ```
1138 /// use vers_vecs::{BitVec, WaveletMatrix};
1139 ///
1140 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1141 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1142 ///
1143 /// assert_eq!(wavelet_matrix.get_sorted(0), Some(BitVec::pack_sequence_u8(&[1], 3)));
1144 /// assert_eq!(wavelet_matrix.get_sorted(1), Some(BitVec::pack_sequence_u8(&[1], 3)));
1145 /// assert_eq!(wavelet_matrix.get_sorted(2), Some(BitVec::pack_sequence_u8(&[2], 3)));
1146 /// ```
1147 #[must_use]
1148 pub fn get_sorted(&self, i: usize) -> Option<BitVec> {
1149 if i >= self.len() {
1150 None
1151 } else {
1152 Some(self.get_sorted_unchecked(i))
1153 }
1154 }
1155
1156 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1157 /// where `k = 0` returns the smallest element.
1158 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1159 /// The `k`-th smallest element is returned as a `u64` numeral.
1160 /// If the number of bits per element exceeds 64, the value is truncated.
1161 ///
1162 /// This method does not perform bounds checking.
1163 /// It returns a nonsensical result if the `k` is greater than the size of the range.
1164 /// Use [`quantile_u64`] for a checked version.
1165 ///
1166 /// # Panics
1167 /// May panic if the `range` is out of bounds.
1168 /// May instead return 0.
1169 ///
1170 /// [`quantile_u64`]: WaveletMatrix::quantile_u64
1171 #[must_use]
1172 pub fn quantile_u64_unchecked(&self, range: Range<usize>, k: usize) -> u64 {
1173 self.partial_quantile_search_u64_unchecked(range, k, 0, 0)
1174 }
1175
1176 /// Internal function to reuse the quantile code for the predecessor and successor search.
1177 /// This function performs the quantile search starting at the given level with the given prefix.
1178 ///
1179 /// The function does not perform any checks, so the caller must ensure that the range is valid,
1180 /// and that the prefix is a valid prefix for the given level (i.e. the prefix is not shifted
1181 /// to another level).
1182 #[inline(always)]
1183 fn partial_quantile_search_u64_unchecked(
1184 &self,
1185 mut range: Range<usize>,
1186 mut k: usize,
1187 start_level: usize,
1188 mut prefix: u64,
1189 ) -> u64 {
1190 debug_assert!(self.bits_per_element() <= 64);
1191 debug_assert!(!range.is_empty());
1192 debug_assert!(range.end <= self.len());
1193
1194 for data in self.data.iter().skip(start_level) {
1195 prefix <<= 1;
1196 let zeros_start = data.rank0(range.start);
1197 let zeros_end = data.rank0(range.end);
1198 let zeros = zeros_end - zeros_start;
1199
1200 if k < zeros {
1201 range.start = zeros_start;
1202 range.end = zeros_end;
1203 } else {
1204 prefix |= 1;
1205 k -= zeros;
1206 range.start = data.rank0 + (range.start - zeros_start);
1207 range.end = data.rank0 + (range.end - zeros_end);
1208 }
1209 }
1210
1211 prefix
1212 }
1213
1214 /// Get the `k`-th smallest element in the encoded sequence in the specified `range`,
1215 /// where `k = 0` returns the smallest element.
1216 /// The `range` is a half-open interval, meaning that the `end` index is exclusive.
1217 /// The `k`-th smallest element is returned as a `u64` numeral.
1218 ///
1219 /// Returns `None` if the `range` is out of bounds, or if the number of bits per element exceeds 64,
1220 /// or if `k` is greater than the size of the range.
1221 ///
1222 /// # Example
1223 /// ```
1224 /// use vers_vecs::{BitVec, WaveletMatrix};
1225 ///
1226 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1227 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1228 ///
1229 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 0), Some(1));
1230 /// assert_eq!(wavelet_matrix.quantile_u64(0..3, 1), Some(4));
1231 /// assert_eq!(wavelet_matrix.quantile_u64(1..4, 0), Some(1));
1232 /// ```
1233 #[must_use]
1234 pub fn quantile_u64(&self, range: Range<usize>, k: usize) -> Option<u64> {
1235 if range.start >= self.len()
1236 || range.end > self.len()
1237 || self.bits_per_element() > 64
1238 || k >= range.end - range.start
1239 {
1240 None
1241 } else {
1242 Some(self.quantile_u64_unchecked(range, k))
1243 }
1244 }
1245
1246 /// Get the `i`-th smallest element in the wavelet matrix.
1247 /// The `i`-th smallest element is returned as a u64 numeral.
1248 ///
1249 /// If the number of bits per element exceeds 64, the value is truncated.
1250 ///
1251 /// This method does not perform bounds checking.
1252 /// Use [`get_sorted_u64`] for a checked version.
1253 ///
1254 /// # Panics
1255 /// May panic if the `i` is out of bounds, or returns an empty bit vector.
1256 ///
1257 /// [`get_sorted_u64`]: WaveletMatrix::get_sorted_u64
1258 #[must_use]
1259 pub fn get_sorted_u64_unchecked(&self, i: usize) -> u64 {
1260 self.quantile_u64_unchecked(0..self.len(), i)
1261 }
1262
1263 /// Get the `i`-th smallest element in the entire wavelet matrix.
1264 /// The `i`-th smallest element is returned as a u64 numeral.
1265 ///
1266 /// Returns `None` if the `i` is out of bounds, or if the number of bits per element exceeds 64.
1267 ///
1268 /// # Example
1269 /// ```
1270 /// use vers_vecs::{BitVec, WaveletMatrix};
1271 ///
1272 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1273 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1274 ///
1275 /// assert_eq!(wavelet_matrix.get_sorted_u64(0), Some(1));
1276 /// assert_eq!(wavelet_matrix.get_sorted_u64(1), Some(1));
1277 /// assert_eq!(wavelet_matrix.get_sorted_u64(2), Some(2));
1278 /// ```
1279 #[must_use]
1280 pub fn get_sorted_u64(&self, i: usize) -> Option<u64> {
1281 if i >= self.len() || self.bits_per_element() > 64 {
1282 None
1283 } else {
1284 Some(self.get_sorted_u64_unchecked(i))
1285 }
1286 }
1287
1288 /// Get the smallest element in the encoded sequence in the specified `range`.
1289 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1290 /// The smallest element is returned as a `BitVec`,
1291 ///
1292 /// This method does not perform bounds checking.
1293 /// Use [`range_min`] for a checked version.
1294 ///
1295 /// # Panics
1296 /// May panic if the `range` is out of bounds or if the range is empty.
1297 /// May instead return an empty bit vector.
1298 ///
1299 /// [`range_min`]: WaveletMatrix::range_min
1300 #[must_use]
1301 pub fn range_min_unchecked(&self, range: Range<usize>) -> BitVec {
1302 self.quantile_unchecked(range, 0)
1303 }
1304
1305 /// Get the smallest element in the encoded sequence in the specified `range`.
1306 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1307 /// The smallest element is returned as a `BitVec`,
1308 ///
1309 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1310 ///
1311 /// # Example
1312 /// ```
1313 /// use vers_vecs::{BitVec, WaveletMatrix};
1314 ///
1315 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1316 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1317 ///
1318 /// assert_eq!(wavelet_matrix.range_min(0..3), Some(BitVec::pack_sequence_u8(&[1], 3)));
1319 /// assert_eq!(wavelet_matrix.range_min(1..4), Some(BitVec::pack_sequence_u8(&[1], 3)));
1320 /// assert_eq!(wavelet_matrix.range_min(1..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1321 /// ```
1322 #[must_use]
1323 pub fn range_min(&self, range: Range<usize>) -> Option<BitVec> {
1324 self.quantile(range, 0)
1325 }
1326
1327 /// Get the smallest element in the encoded sequence in the specified `range`.
1328 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1329 /// The smallest element is returned as a `u64` numeral.
1330 /// If the number of bits per element exceeds 64, the value is truncated.
1331 ///
1332 /// This method does not perform bounds checking.
1333 /// Use [`range_min_u64`] for a checked version.
1334 ///
1335 /// # Panics
1336 /// May panic if the `range` is out of bounds or if the range is empty.
1337 /// May instead return 0.
1338 ///
1339 /// [`range_min_u64`]: WaveletMatrix::range_min_u64
1340 #[must_use]
1341 pub fn range_min_u64_unchecked(&self, range: Range<usize>) -> u64 {
1342 self.quantile_u64_unchecked(range, 0)
1343 }
1344
1345 /// Get the smallest element in the encoded sequence in the specified `range`.
1346 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1347 /// The smallest element is returned as a `u64` numeral.
1348 ///
1349 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1350 /// per element exceeds 64.
1351 ///
1352 /// # Example
1353 /// ```
1354 /// use vers_vecs::{BitVec, WaveletMatrix};
1355 ///
1356 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1357 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1358 ///
1359 /// assert_eq!(wavelet_matrix.range_min_u64(0..3), Some(1));
1360 /// assert_eq!(wavelet_matrix.range_min_u64(1..4), Some(1));
1361 /// assert_eq!(wavelet_matrix.range_min_u64(1..3), Some(4));
1362 /// ```
1363 #[must_use]
1364 pub fn range_min_u64(&self, range: Range<usize>) -> Option<u64> {
1365 self.quantile_u64(range, 0)
1366 }
1367
1368 /// Get the largest element in the encoded sequence in the specified `range`.
1369 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1370 /// The largest element is returned as a `BitVec`,
1371 /// where the least significant bit is the first element.
1372 ///
1373 /// This method does not perform bounds checking.
1374 /// Use [`range_max`] for a checked version.
1375 ///
1376 /// # Panics
1377 /// May panic if the `range` is out of bounds or if the range is empty.
1378 /// May instead return an empty bit vector.
1379 ///
1380 /// [`range_max`]: WaveletMatrix::range_max
1381 #[must_use]
1382 pub fn range_max_unchecked(&self, range: Range<usize>) -> BitVec {
1383 let k = range.end - range.start - 1;
1384 self.quantile_unchecked(range, k)
1385 }
1386
1387 /// Get the largest element in the encoded sequence in the specified `range`.
1388 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1389 /// The largest element is returned as a `BitVec`,
1390 /// where the least significant bit is the first element.
1391 ///
1392 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1393 ///
1394 /// # Example
1395 /// ```
1396 /// use vers_vecs::{BitVec, WaveletMatrix};
1397 ///
1398 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1399 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1400 ///
1401 /// assert_eq!(wavelet_matrix.range_max(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1402 /// assert_eq!(wavelet_matrix.range_max(3..6), Some(BitVec::pack_sequence_u8(&[7], 3)));
1403 /// ```
1404 #[must_use]
1405 pub fn range_max(&self, range: Range<usize>) -> Option<BitVec> {
1406 if range.is_empty() {
1407 None
1408 } else {
1409 let k = range.end - range.start - 1;
1410 self.quantile(range, k)
1411 }
1412 }
1413
1414 /// Get the largest element in the encoded sequence in the specified `range`.
1415 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1416 /// The largest element is returned as a `u64` numeral.
1417 /// If the number of bits per element exceeds 64, the value is truncated.
1418 ///
1419 /// This method does not perform bounds checking.
1420 /// Use [`range_max_u64`] for a checked version.
1421 ///
1422 /// # Panics
1423 /// May panic if the `range` is out of bounds or if the range is empty.
1424 /// May instead return 0.
1425 ///
1426 /// [`range_max_u64`]: WaveletMatrix::range_max_u64
1427 #[must_use]
1428 pub fn range_max_u64_unchecked(&self, range: Range<usize>) -> u64 {
1429 let k = range.end - range.start - 1;
1430 self.quantile_u64_unchecked(range, k)
1431 }
1432
1433 /// Get the largest element in the encoded sequence in the specified `range`.
1434 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1435 /// The largest element is returned as a `u64` numeral.
1436 ///
1437 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1438 /// per element exceeds 64.
1439 ///
1440 /// # Example
1441 /// ```
1442 /// use vers_vecs::{BitVec, WaveletMatrix};
1443 ///
1444 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1445 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1446 ///
1447 /// assert_eq!(wavelet_matrix.range_max_u64(0..3), Some(4));
1448 /// assert_eq!(wavelet_matrix.range_max_u64(3..6), Some(7));
1449 /// ```
1450 #[must_use]
1451 pub fn range_max_u64(&self, range: Range<usize>) -> Option<u64> {
1452 if range.is_empty() {
1453 None
1454 } else {
1455 let k = range.end - range.start - 1;
1456 self.quantile_u64(range, k)
1457 }
1458 }
1459
1460 /// Get the median element in the encoded sequence in the specified `range`.
1461 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1462 /// The median element is returned as a `BitVec`,
1463 /// where the least significant bit is the first element.
1464 ///
1465 /// If the range does not contain an odd number of elements, the position is rounded down.
1466 ///
1467 /// This method does not perform bounds checking.
1468 /// Use [`range_median`] for a checked version.
1469 ///
1470 /// # Panics
1471 /// May panic if the `range` is out of bounds or if the range is empty.
1472 /// May instead return an empty bit vector.
1473 ///
1474 /// [`range_median`]: WaveletMatrix::range_median
1475 #[must_use]
1476 pub fn range_median_unchecked(&self, range: Range<usize>) -> BitVec {
1477 let k = (range.end - 1 - range.start) / 2;
1478 self.quantile_unchecked(range, k)
1479 }
1480
1481 /// Get the median element in the encoded sequence in the specified `range`.
1482 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1483 /// The median element is returned as a `BitVec`,
1484 /// where the least significant bit is the first element.
1485 ///
1486 /// If the range does not contain an odd number of elements, the position is rounded down.
1487 ///
1488 /// Returns `None` if the `range` is out of bounds or if the range is empty.
1489 ///
1490 /// # Example
1491 /// ```
1492 /// use vers_vecs::{BitVec, WaveletMatrix};
1493 ///
1494 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1495 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1496 ///
1497 /// assert_eq!(wavelet_matrix.range_median(0..3), Some(BitVec::pack_sequence_u8(&[4], 3)));
1498 /// assert_eq!(wavelet_matrix.range_median(1..4), Some(BitVec::pack_sequence_u8(&[4], 3)));
1499 /// assert_eq!(wavelet_matrix.range_median(0..6), Some(BitVec::pack_sequence_u8(&[2], 3)));
1500 /// ```
1501 #[must_use]
1502 pub fn range_median(&self, range: Range<usize>) -> Option<BitVec> {
1503 if range.is_empty() {
1504 None
1505 } else {
1506 let k = (range.end - 1 - range.start) / 2;
1507 self.quantile(range, k)
1508 }
1509 }
1510
1511 /// Get the median element in the encoded sequence in the specified `range`.
1512 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1513 /// The median element is returned as a `u64` numeral.
1514 /// If the number of bits per element exceeds 64, the value is truncated.
1515 ///
1516 /// If the range does not contain an odd number of elements, the position is rounded down.
1517 ///
1518 /// This method does not perform bounds checking.
1519 /// Use [`range_median_u64`] for a checked version.
1520 ///
1521 /// # Panics
1522 /// May panic if the `range` is out of bounds or if the range is empty.
1523 /// May instead return 0.
1524 ///
1525 /// [`range_median_u64`]: WaveletMatrix::range_median_u64
1526 #[must_use]
1527 pub fn range_median_u64_unchecked(&self, range: Range<usize>) -> u64 {
1528 let k = (range.end - 1 - range.start) / 2;
1529 self.quantile_u64_unchecked(range, k)
1530 }
1531
1532 /// Get the median element in the encoded sequence in the specified `range`.
1533 /// The range is a half-open interval, meaning that the `end` index is exclusive.
1534 /// The median element is returned as a `u64` numeral.
1535 ///
1536 /// If the range does not contain an odd number of elements, the position is rounded down.
1537 ///
1538 /// Returns `None` if the `range` is out of bounds, if the range is empty, or if the number of bits
1539 /// per element exceeds 64.
1540 ///
1541 /// # Example
1542 /// ```
1543 /// use vers_vecs::{BitVec, WaveletMatrix};
1544 ///
1545 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1546 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1547 ///
1548 /// assert_eq!(wavelet_matrix.range_median_u64(0..3), Some(4));
1549 /// assert_eq!(wavelet_matrix.range_median_u64(1..4), Some(4));
1550 /// assert_eq!(wavelet_matrix.range_median_u64(0..6), Some(2));
1551 /// ```
1552 #[must_use]
1553 pub fn range_median_u64(&self, range: Range<usize>) -> Option<u64> {
1554 if range.is_empty() || self.bits_per_element() > 64 || range.end > self.len() {
1555 None
1556 } else {
1557 let k = (range.end - 1 - range.start) / 2;
1558 self.quantile_u64(range, k)
1559 }
1560 }
1561
1562 /// Get the predecessor of the given `symbol` in the given `range`.
1563 /// This is a private generic helper function to implement the public `predecessor` functions.
1564 ///
1565 /// The read and write access to the query and result values are abstracted by the `Reader` and `Writer` closures.
1566 #[inline(always)] // even though the function is pretty large, inlining probably gets rid of the closure calls in favor of static calls
1567 fn predecessor_generic_unchecked<
1568 T: Clone,
1569 Reader: Fn(usize, &T) -> u64,
1570 Writer: Fn(u64, usize, &mut T),
1571 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1572 >(
1573 &self,
1574 mut range: Range<usize>,
1575 symbol: &T,
1576 mut result_value: T,
1577 bit_reader: Reader,
1578 result_writer: Writer,
1579 quantile_search: Quantile,
1580 ) -> Option<T> {
1581 // the bit-prefix at the last node where we could go to an interval with smaller elements,
1582 // i.e. where we need to go if the current prefix has no elements smaller than the query
1583 let mut last_smaller_prefix = result_value.clone();
1584 // the level of the last node where we could go to an interval with smaller elements
1585 let mut last_one_level: Option<usize> = None;
1586 // the range of the last node where we could go to an interval with smaller elements
1587 let mut next_smaller_range: Option<Range<usize>> = None;
1588
1589 for (level, data) in self.data.iter().enumerate() {
1590 let query_bit = bit_reader(level, symbol);
1591
1592 // read the amount of elements with the current result-prefix plus one 0 bit.
1593 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1594 // plus one 1 from there
1595 let zeros_start = data.rank0(range.start);
1596 let zeros_end = data.rank0(range.end);
1597 let elements_zero = zeros_end - zeros_start;
1598
1599 if query_bit == 0 {
1600 if elements_zero == 0 {
1601 // if our query bit is zero in this level and suddenly our new interval is empty,
1602 // all elements that were in the previous interval are bigger than the query element,
1603 // because they all have a 1 in the current level.
1604 // this means the predecessor is the largest element in the last smaller interval,
1605 // i.e. the interval that has a 0 bit at the last level where our prefix had a 1 bit.
1606
1607 return next_smaller_range.map(|r| {
1608 let idx = r.end - r.start - 1;
1609 result_writer(0, last_one_level.unwrap(), &mut last_smaller_prefix);
1610 quantile_search(
1611 self,
1612 r,
1613 idx,
1614 last_one_level.unwrap() + 1,
1615 last_smaller_prefix,
1616 )
1617 });
1618 }
1619
1620 // update the prefix
1621 result_writer(0, level, &mut result_value);
1622
1623 // update the range to the interval of the new prefix
1624 range.start = zeros_start;
1625 range.end = zeros_end;
1626 } else {
1627 if elements_zero == range.end - range.start {
1628 // if our query element is 1 in this level and suddenly our new interval is empty,
1629 // all elements that were in the previous interval are smaller than the query element,
1630 // because they all have a 0 in the current level.
1631 // this means the predecessor is the largest element in the last level's interval.
1632
1633 let idx = range.end - range.start - 1;
1634 return Some(quantile_search(self, range, idx, level, result_value));
1635 }
1636
1637 // if the other interval is not empty, we update the last smaller interval to the last interval where we can switch to a 0 prefix
1638 if !(zeros_start..zeros_end).is_empty() {
1639 last_one_level = Some(level);
1640 next_smaller_range = Some(zeros_start..zeros_end);
1641 last_smaller_prefix = result_value.clone();
1642 }
1643
1644 // update the prefix
1645 result_writer(1, level, &mut result_value);
1646
1647 // update the range to the interval of the new prefix
1648 range.start = data.rank0 + (range.start - zeros_start);
1649 range.end = data.rank0 + (range.end - zeros_end);
1650 }
1651 }
1652
1653 Some(result_value)
1654 }
1655
1656 /// Get the predecessor of the given `symbol` in the given `range`.
1657 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1658 /// where the least significant bit is the first element, and `k` is the number of bits per element
1659 /// in the wavelet matrix.
1660 /// The `symbol` does not have to be in the encoded sequence.
1661 /// The predecessor is the largest element in the `range` that is smaller than or equal
1662 /// to the `symbol`.
1663 ///
1664 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1665 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1666 /// or if the `symbol` is smaller than all elements in the range.
1667 ///
1668 /// # Example
1669 /// ```
1670 /// use vers_vecs::{BitVec, WaveletMatrix};
1671 ///
1672 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1673 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1674 ///
1675 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1676 /// assert_eq!(wavelet_matrix.predecessor(0..3, &BitVec::pack_sequence_u8(&[4], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1677 /// assert_eq!(wavelet_matrix.predecessor(0..6, &BitVec::pack_sequence_u8(&[7], 3)), Some(BitVec::pack_sequence_u8(&[7], 3)));
1678 /// ```
1679 ///
1680 /// [`BitVec`]: BitVec
1681 #[must_use]
1682 pub fn predecessor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1683 if symbol.len() != self.bits_per_element()
1684 || range.is_empty()
1685 || self.is_empty()
1686 || range.end > self.len()
1687 {
1688 return None;
1689 }
1690
1691 self.predecessor_generic_unchecked(
1692 range,
1693 symbol,
1694 BitVec::from_zeros(self.bits_per_element()),
1695 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1696 |bit, level, result| {
1697 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1698 },
1699 Self::partial_quantile_search_unchecked,
1700 )
1701 }
1702
1703 /// Get the predecessor of the given `symbol` in the given `range`.
1704 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1705 /// where k is less than or equal to 64.
1706 /// The `symbol` does not have to be in the encoded sequence.
1707 /// The predecessor is the largest element in the `range` that is smaller than or equal
1708 /// to the `symbol`.
1709 ///
1710 /// Returns `None` if the number of bits in the matrix is greater than 64,
1711 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1712 /// or if the `symbol` is smaller than all elements in the range.
1713 ///
1714 /// # Example
1715 /// ```
1716 /// use vers_vecs::{BitVec, WaveletMatrix};
1717 ///
1718 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1719 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1720 ///
1721 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 7), Some(4));
1722 /// assert_eq!(wavelet_matrix.predecessor_u64(0..3, 4), Some(4));
1723 /// assert_eq!(wavelet_matrix.predecessor_u64(0..6, 7), Some(7));
1724 /// ```
1725 #[must_use]
1726 pub fn predecessor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1727 if self.bits_per_element() > 64
1728 || range.is_empty()
1729 || self.is_empty()
1730 || range.end > self.len()
1731 {
1732 return None;
1733 }
1734
1735 self.predecessor_generic_unchecked(
1736 range,
1737 &symbol,
1738 0,
1739 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1740 |bit, _level, result| {
1741 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1742 // we have to do that, because the quantile_search_u64 does the same.
1743 *result <<= 1;
1744 *result |= bit;
1745 },
1746 Self::partial_quantile_search_u64_unchecked,
1747 )
1748 }
1749
1750 #[inline(always)]
1751 fn successor_generic_unchecked<
1752 T: Clone,
1753 Reader: Fn(usize, &T) -> u64,
1754 Writer: Fn(u64, usize, &mut T),
1755 Quantile: Fn(&Self, Range<usize>, usize, usize, T) -> T,
1756 >(
1757 &self,
1758 mut range: Range<usize>,
1759 symbol: &T,
1760 mut result_value: T,
1761 bit_reader: Reader,
1762 result_writer: Writer,
1763 quantile_search: Quantile,
1764 ) -> Option<T> {
1765 // the bit-prefix at the last node where we could go to an interval with larger elements,
1766 // i.e. where we need to go if the current prefix has no elements larger than the query
1767 let mut last_larger_prefix = result_value.clone();
1768 // the level of the last node where we could go to an interval with larger elements
1769 let mut last_zero_level: Option<usize> = None;
1770 // the range of the last node where we could go to an interval with larger elements
1771 let mut next_larger_range: Option<Range<usize>> = None;
1772
1773 for (level, data) in self.data.iter().enumerate() {
1774 let query_bit = bit_reader(level, symbol);
1775
1776 // read the amount of elements with the current result-prefix plus one 0 bit.
1777 // if the query_bit is 1, we can calculate the amount of elements with the result-prefix
1778 // plus one 1 from there
1779 let zeros_start = data.rank0(range.start);
1780 let zeros_end = data.rank0(range.end);
1781 let elements_zero = zeros_end - zeros_start;
1782
1783 if query_bit == 0 {
1784 if elements_zero == 0 {
1785 // if our query element is 0 in this level and suddenly our new interval is empty,
1786 // all elements that were in the previous interval are larger than the query element,
1787 // because they all have a 1 in the current level.
1788 // this means the successor is the smallest element in the last level's interval.
1789
1790 return Some(quantile_search(self, range, 0, level, result_value));
1791 }
1792
1793 // if the other interval is not empty, we update the last interval where we can switch to a prefix with a 1
1794 if !(data.rank0 + (range.start - zeros_start)..data.rank0 + (range.end - zeros_end))
1795 .is_empty()
1796 {
1797 last_zero_level = Some(level);
1798 next_larger_range = Some(
1799 data.rank0 + (range.start - zeros_start)
1800 ..data.rank0 + (range.end - zeros_end),
1801 );
1802 last_larger_prefix = result_value.clone();
1803 }
1804
1805 // update the prefix
1806 result_writer(0, level, &mut result_value);
1807
1808 // update the range to the interval of the new prefix
1809 range.start = zeros_start;
1810 range.end = zeros_end;
1811 } else {
1812 if elements_zero == range.end - range.start {
1813 // if our query bit is 1 in this level and suddenly our new interval is empty,
1814 // all elements that were in the previous interval are smaller than the query element,
1815 // because they all have a 0 in the current level.
1816 // this means the successor is the smallest element in the last interval with a larger prefix,
1817 // i.e. the interval that has a 1 bit at the last level where our prefix had a 0 bit.
1818
1819 return next_larger_range.map(|r| {
1820 result_writer(1, last_zero_level.unwrap(), &mut last_larger_prefix);
1821 quantile_search(
1822 self,
1823 r,
1824 0,
1825 last_zero_level.unwrap() + 1,
1826 last_larger_prefix,
1827 )
1828 });
1829 }
1830
1831 // update the prefix
1832 result_writer(1, level, &mut result_value);
1833
1834 // update the range to the interval of the new prefix
1835 range.start = data.rank0 + (range.start - zeros_start);
1836 range.end = data.rank0 + (range.end - zeros_end);
1837 }
1838 }
1839
1840 Some(result_value)
1841 }
1842
1843 /// Get the successor of the given `symbol` in the given range.
1844 /// The `symbol` is a `k`-bit word encoded in a [`BitVec`],
1845 /// where the least significant bit is the first element, and `k` is the number of bits per element
1846 /// in the wavelet matrix.
1847 /// The `symbol` does not have to be in the encoded sequence.
1848 /// The successor is the smallest element in the range that is greater than or equal
1849 /// to the `symbol`.
1850 ///
1851 /// Returns `None` if the number of bits in the `symbol` is not equal to `k`,
1852 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1853 /// or if the `symbol` is greater than all elements in the range.
1854 ///
1855 /// # Example
1856 /// ```
1857 /// use vers_vecs::{BitVec, WaveletMatrix};
1858 ///
1859 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1860 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1861 ///
1862 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[4], 3)));
1863 /// assert_eq!(wavelet_matrix.successor(0..3, &BitVec::pack_sequence_u8(&[5], 3)), None);
1864 /// assert_eq!(wavelet_matrix.successor(0..6, &BitVec::pack_sequence_u8(&[2], 3)), Some(BitVec::pack_sequence_u8(&[2], 3)));
1865 /// ```
1866 ///
1867 /// [`BitVec`]: BitVec
1868 #[must_use]
1869 pub fn successor(&self, range: Range<usize>, symbol: &BitVec) -> Option<BitVec> {
1870 if symbol.len() != self.bits_per_element()
1871 || range.is_empty()
1872 || self.is_empty()
1873 || range.end > self.len()
1874 {
1875 return None;
1876 }
1877
1878 self.successor_generic_unchecked(
1879 range,
1880 symbol,
1881 BitVec::from_zeros(self.bits_per_element()),
1882 |level, symbol| symbol.get_unchecked((self.bits_per_element() - 1) - level),
1883 |bit, level, result| {
1884 result.set_unchecked((self.bits_per_element() - 1) - level, bit);
1885 },
1886 Self::partial_quantile_search_unchecked,
1887 )
1888 }
1889
1890 /// Get the successor of the given `symbol` in the given range.
1891 /// The `symbol` is a `k`-bit word encoded in a `u64` numeral,
1892 /// where k is less than or equal to 64.
1893 /// The `symbol` does not have to be in the encoded sequence.
1894 /// The successor is the smallest element in the range that is greater than or equal
1895 /// to the `symbol`.
1896 ///
1897 /// Returns `None` if the number of bits in the matrix is greater than 64,
1898 /// if the range is empty, if the wavelet matrix is empty, if the range is out of bounds,
1899 /// or if the `symbol` is greater than all elements in the range.
1900 ///
1901 /// # Example
1902 /// ```
1903 /// use vers_vecs::{BitVec, WaveletMatrix};
1904 ///
1905 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1906 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1907 ///
1908 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 2), Some(4));
1909 /// assert_eq!(wavelet_matrix.successor_u64(0..3, 5), None);
1910 /// assert_eq!(wavelet_matrix.successor_u64(0..6, 2), Some(2));
1911 /// ```
1912 #[must_use]
1913 pub fn successor_u64(&self, range: Range<usize>, symbol: u64) -> Option<u64> {
1914 if self.bits_per_element() > 64
1915 || range.is_empty()
1916 || self.is_empty()
1917 || range.end > self.len()
1918 {
1919 return None;
1920 }
1921
1922 self.successor_generic_unchecked(
1923 range,
1924 &symbol,
1925 0,
1926 |level, symbol| (symbol >> ((self.bits_per_element() - 1) - level)) & 1,
1927 |bit, _level, result| {
1928 // we ignore the level here, and instead rely on the fact that the bits are set in order.
1929 // we have to do that, because the quantile_search_u64 does the same.
1930 *result <<= 1;
1931 *result |= bit;
1932 },
1933 Self::partial_quantile_search_u64_unchecked,
1934 )
1935 }
1936
1937 /// Get an iterator over the elements of the encoded sequence.
1938 /// The iterator yields `u64` elements.
1939 /// If the number of bits per element exceeds 64, `None` is returned.
1940 ///
1941 /// # Example
1942 /// ```
1943 /// use vers_vecs::{BitVec, WaveletMatrix};
1944 ///
1945 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1946 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
1947 ///
1948 /// let mut iter = wavelet_matrix.iter_u64().unwrap();
1949 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 4, 4, 1, 2, 7]);
1950 /// ```
1951 #[must_use]
1952 pub fn iter_u64(&self) -> Option<WaveletNumRefIter<'_>> {
1953 if self.bits_per_element() > 64 {
1954 None
1955 } else {
1956 Some(WaveletNumRefIter::new(self))
1957 }
1958 }
1959
1960 /// Turn the encoded sequence into an iterator.
1961 /// The iterator yields `u64` elements.
1962 /// If the number of bits per element exceeds 64, `None` is returned.
1963 #[must_use]
1964 pub fn into_iter_u64(self) -> Option<WaveletNumIter> {
1965 if self.bits_per_element() > 64 {
1966 None
1967 } else {
1968 Some(WaveletNumIter::new(self))
1969 }
1970 }
1971
1972 /// Get an iterator over the sorted elements of the encoded sequence.
1973 /// The iterator yields `BitVec` elements.
1974 ///
1975 /// See also [`iter_sorted_u64`] for an iterator that yields `u64` elements.
1976 ///
1977 /// [`iter_sorted_u64`]: Self::iter_sorted_u64
1978 #[must_use]
1979 pub fn iter_sorted(&self) -> WaveletSortedRefIter<'_> {
1980 WaveletSortedRefIter::new(self)
1981 }
1982
1983 /// Turn the encoded sequence into an iterator over the sorted sequence.
1984 /// The iterator yields `BitVec` elements.
1985 #[must_use]
1986 pub fn into_iter_sorted(self) -> WaveletSortedIter {
1987 WaveletSortedIter::new(self)
1988 }
1989
1990 /// Get an iterator over the sorted elements of the encoded sequence.
1991 /// The iterator yields `u64` elements.
1992 /// If the number of bits per element exceeds 64, `None` is returned.
1993 ///
1994 /// # Example
1995 /// ```
1996 /// use vers_vecs::{BitVec, WaveletMatrix};
1997 ///
1998 /// let bit_vec = BitVec::pack_sequence_u8(&[1, 4, 4, 1, 2, 7], 3);
1999 /// let wavelet_matrix = WaveletMatrix::from_bit_vec(&bit_vec, 3);
2000 ///
2001 /// let mut iter = wavelet_matrix.iter_sorted_u64().unwrap();
2002 /// assert_eq!(iter.collect::<Vec<_>>(), vec![1, 1, 2, 4, 4, 7]);
2003 /// ```
2004 #[must_use]
2005 pub fn iter_sorted_u64(&self) -> Option<WaveletSortedNumRefIter<'_>> {
2006 if self.bits_per_element() > 64 {
2007 None
2008 } else {
2009 Some(WaveletSortedNumRefIter::new(self))
2010 }
2011 }
2012
2013 /// Turn the encoded sequence into an iterator over the sorted sequence.
2014 /// The iterator yields `u64` elements.
2015 /// If the number of bits per element exceeds 64, `None` is returned.
2016 #[must_use]
2017 pub fn into_iter_sorted_u64(self) -> Option<WaveletSortedNumIter> {
2018 if self.bits_per_element() > 64 {
2019 None
2020 } else {
2021 Some(WaveletSortedNumIter::new(self))
2022 }
2023 }
2024
2025 /// Get the number of bits per element in the alphabet of the encoded sequence.
2026 #[must_use]
2027 #[inline(always)]
2028 pub fn bits_per_element(&self) -> usize {
2029 self.data.len()
2030 }
2031
2032 /// Get the number of bits per element in the alphabet of the encoded sequence.
2033 #[must_use]
2034 #[deprecated(since = "1.5.1", note = "please use `bits_per_element` instead")]
2035 #[allow(clippy::cast_possible_truncation)]
2036 pub fn bit_len(&self) -> u16 {
2037 self.bits_per_element() as u16
2038 }
2039
2040 /// Get the number of elements stored in the encoded sequence.
2041 #[must_use]
2042 pub fn len(&self) -> usize {
2043 if self.data.is_empty() {
2044 0
2045 } else {
2046 self.data[0].len()
2047 }
2048 }
2049
2050 /// Check if the wavelet matrix is empty.
2051 #[must_use]
2052 pub fn is_empty(&self) -> bool {
2053 self.len() == 0
2054 }
2055
2056 /// Get the number of bytes allocated on the heap for the wavelet matrix.
2057 /// This does not include memory that is allocated but unused due to allocation policies of
2058 /// internal data structures.
2059 #[must_use]
2060 pub fn heap_size(&self) -> usize {
2061 self.data.iter().map(RsVec::heap_size).sum::<usize>()
2062 }
2063}
2064
2065impl_vector_iterator!(
2066 WaveletMatrix,
2067 WaveletIter,
2068 WaveletRefIter,
2069 get_value_unchecked,
2070 get_value,
2071 BitVec
2072);
2073
2074impl_vector_iterator!(
2075 WaveletMatrix,
2076 WaveletNumIter,
2077 WaveletNumRefIter,
2078 get_u64_unchecked,
2079 get_u64,
2080 u64,
2081 special
2082);
2083
2084impl_vector_iterator!(
2085 WaveletMatrix,
2086 WaveletSortedIter,
2087 WaveletSortedRefIter,
2088 get_sorted_unchecked,
2089 get_sorted,
2090 BitVec,
2091 special
2092);
2093
2094impl_vector_iterator!(
2095 WaveletMatrix,
2096 WaveletSortedNumIter,
2097 WaveletSortedNumRefIter,
2098 get_sorted_u64_unchecked,
2099 get_sorted_u64,
2100 u64,
2101 special
2102);
2103
2104#[cfg(test)]
2105mod tests;