simple_sds_sbwt/wavelet_matrix.rs
1//! An immutable integer vector supporting rank/select-type queries.
2//!
3//! The wavelet matrix was first described in:
4//!
5//! > Claude, Navarro, Ordóñez: The wavelet matrix: An efficient wavelet tree for large alphabets.
6//! > Information Systems, 2015.
7//! > DOI: [10.1016/j.is.2014.06.002](https://doi.org/10.1016/j.is.2014.06.002)
8//!
9//! See [`wm_core`] for a low-level description.
10//! As in wavelet trees, access and rank queries proceed down from level `0`, while select queries go up from level `width - 1`.
11
12use crate::int_vector::IntVector;
13use crate::ops::{Vector, Access, AccessIter, VectorIndex, Pack};
14use crate::serialize::Serialize;
15use crate::wavelet_matrix::wm_core::WMCore;
16
17use std::io::{Error, ErrorKind};
18use std::iter::FusedIterator;
19use std::io;
20
21pub mod wm_core;
22
23#[cfg(test)]
24mod tests;
25
26//-----------------------------------------------------------------------------
27
28/// An immutable integer vector supporting rank/select-type queries.
29///
30/// Each item consists of the lowest 1 to 64 bits of a [`u64`] value, as specified by the width of the vector.
31/// The vector is represented using [`WMCore`].
32/// There is also an [`IntVector`] storing the starting position of each possible item value after the reordering done by the core.
33/// Hence a `WaveletMatrix` should only be used when most values in `0..(1 << width)` are in use.
34/// The maximum length of the vector is approximately [`usize::MAX`] items.
35///
36/// A `WaveletMatrix` can be built from a [`Vec`] of unsigned integers using the [`From`] trait.
37/// The construction requires several passes over the input and uses the input vector as working space.
38/// Using smaller integer types helps reducing the space overhead during construction.
39///
40/// `WaveletMatrix` implements the following `simple_sds` traits:
41/// * Basic functionality: [`Vector`], [`Access`]
42/// * Queries and operations: [`VectorIndex`]
43/// * Serialization: [`Serialize`]
44///
45/// Overridden default implementations:
46/// * [`VectorIndex::contains`] has a simple constant-time implementation.
47/// * [`VectorIndex::inverse_select`] is effectively the same as [`Access::get`].
48///
49/// # Examples
50///
51/// ```
52/// use simple_sds_sbwt::ops::{Vector, Access, VectorIndex};
53/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
54///
55/// // Construction
56/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
57/// let wm = WaveletMatrix::from(source.clone());
58///
59/// // Access
60/// assert_eq!(wm.len(), source.len());
61/// assert_eq!(wm.width(), 3);
62/// for i in 0..wm.len() {
63/// assert_eq!(wm.get(i), source[i]);
64/// }
65/// assert!(wm.iter().eq(source.iter().cloned()));
66///
67/// // Rank
68/// assert_eq!(wm.rank(5, 3), 1);
69/// assert_eq!(wm.rank(10, 2), 2);
70///
71/// // Select
72/// assert_eq!(wm.select(2, 1), Some(4));
73/// assert!(wm.select(1, 7).is_none());
74/// assert_eq!(wm.select_iter(1, 2).next(), Some((1, 9)));
75///
76/// // Inverse select
77/// let index = 7;
78/// let inverse = wm.inverse_select(index).unwrap();
79/// assert_eq!(inverse, (0, 5));
80/// assert_eq!(wm.select(inverse.0, inverse.1), Some(index));
81///
82/// // Predecessor / successor
83/// assert!(wm.predecessor(1, 3).next().is_none());
84/// assert_eq!(wm.predecessor(2, 3).next(), Some((0, 2)));
85/// assert_eq!(wm.successor(12, 0).next(), Some((1, 12)));
86/// assert!(wm.successor(13, 0).next().is_none());
87/// ```
88///
89/// # Notes
90///
91/// * `WaveletMatrix` never panics from I/O errors.
92#[derive(Clone, Debug, PartialEq, Eq)]
93pub struct WaveletMatrix {
94 len: usize,
95 data: WMCore,
96 // Starting offset of each value after reordering by the wavelet matrix, or `len` if the value does not exist.
97 first: IntVector,
98}
99
100impl WaveletMatrix {
101 // Returns the starting offset of the value after reordering.
102 fn start(&self, value: <Self as Vector>::Item) -> usize {
103 self.first.get(value as usize) as usize
104 }
105
106 // Computes `first` from an iterator.
107 fn start_offsets<Iter: Iterator<Item = u64>>(iter: Iter, len: usize, max_value: u64) -> IntVector {
108 // Count the number of occurrences of each value.
109 let mut counts: Vec<(u64, usize)> = Vec::with_capacity((max_value + 1) as usize);
110 for i in 0..=max_value {
111 counts.push((i, 0));
112 }
113 for value in iter {
114 counts[value as usize].1 += 1;
115 }
116
117 // Sort the counts in reverse bit order to get the order below the final level.
118 counts.sort_unstable_by_key(|(value, _)| value.reverse_bits());
119
120 // Replace occurrence counts with the prefix sum in the sorted order.
121 let mut cumulative = 0;
122 for (_, count) in counts.iter_mut() {
123 if *count == 0 {
124 *count = len;
125 } else {
126 let increment = *count;
127 *count = cumulative;
128 cumulative += increment;
129 }
130 }
131
132 // Sort the prefix sums by symbol to get starting offsets and then return the offsets.
133 counts.sort_unstable_by_key(|(value, _)| *value);
134 let mut result: IntVector = counts.into_iter().map(|(_, offset)| offset).collect();
135 result.pack();
136 result
137 }
138}
139
140macro_rules! wavelet_matrix_from {
141 ($t:ident) => {
142 impl From<Vec<$t>> for WaveletMatrix {
143 fn from(source: Vec<$t>) -> Self {
144 let len = source.len();
145 let max_value = source.iter().cloned().max().unwrap_or(0);
146 let first = Self::start_offsets(source.iter().map(|x| *x as u64), source.len(), max_value as u64);
147 let data = WMCore::from(source);
148 WaveletMatrix { len, data, first, }
149 }
150 }
151 }
152}
153
154wavelet_matrix_from!(u8);
155wavelet_matrix_from!(u16);
156wavelet_matrix_from!(u32);
157wavelet_matrix_from!(u64);
158wavelet_matrix_from!(usize);
159
160//-----------------------------------------------------------------------------
161
162impl Vector for WaveletMatrix {
163 type Item = u64;
164
165 #[inline]
166 fn len(&self) -> usize {
167 self.len
168 }
169
170 #[inline]
171 fn width(&self) -> usize {
172 self.data.width()
173 }
174
175 #[inline]
176 fn max_len(&self) -> usize {
177 usize::MAX
178 }
179}
180
181impl<'a> Access<'a> for WaveletMatrix {
182 type Iter = AccessIter<'a, Self>;
183
184 fn get(&self, index: usize) -> <Self as Vector>::Item {
185 self.inverse_select(index).unwrap().1
186 }
187
188 fn iter(&'a self) -> Self::Iter {
189 Self::Iter::new(self)
190 }
191}
192
193impl<'a> VectorIndex<'a> for WaveletMatrix {
194 type ValueIter = ValueIter<'a>;
195
196 fn contains(&self, value: <Self as Vector>::Item) -> bool {
197 (value as usize) < self.first.len() && self.start(value) < self.len()
198 }
199
200 fn rank(&self, index: usize, value: <Self as Vector>::Item) -> usize {
201 if !self.contains(value) {
202 return 0;
203 }
204 self.data.map_down_with(index, value) - self.start(value)
205 }
206
207 fn inverse_select(&self, index: usize) -> Option<(usize, <Self as Vector>::Item)> {
208 self.data.map_down(index).map(|(index, value)| (index - self.start(value), value))
209 }
210
211 fn value_iter(&'a self, value: <Self as Vector>::Item) -> Self::ValueIter {
212 Self::ValueIter {
213 parent: self,
214 value,
215 rank: 0,
216 }
217 }
218
219 fn value_of(iter: &Self::ValueIter) -> <Self as Vector>::Item {
220 iter.value
221 }
222
223 fn select(&self, rank: usize, value: <Self as Vector>::Item) -> Option<usize> {
224 if !self.contains(value) {
225 return None;
226 }
227 self.data.map_up_with(self.start(value) + rank, value)
228 }
229
230 fn select_iter(&'a self, rank: usize, value: <Self as Vector>::Item) -> Self::ValueIter {
231 Self::ValueIter {
232 parent: self,
233 value,
234 rank,
235 }
236 }
237}
238
239impl Serialize for WaveletMatrix {
240 fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
241 self.len.serialize(writer)
242 }
243
244 fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
245 self.data.serialize(writer)?;
246 self.first.serialize(writer)?;
247 Ok(())
248 }
249
250 fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
251 let len = usize::load(reader)?;
252 let data = WMCore::load(reader)?;
253 if data.len() != len {
254 return Err(Error::new(ErrorKind::InvalidData, "Core length does not match wavelet matrix length"));
255 }
256
257 let first = IntVector::load(reader)?;
258 Ok(WaveletMatrix { len, data, first, })
259 }
260
261 fn size_in_elements(&self) -> usize {
262 let mut result = self.len.size_in_elements();
263 result += self.data.size_in_elements();
264 result += self.first.size_in_elements();
265 result
266 }
267}
268
269//-----------------------------------------------------------------------------
270
271/// A read-only iterator over the occurrences of a specific value in [`WaveletMatrix`].
272///
273/// The type of `Item` is [`(usize, usize)`] representing a pair (rank, index).
274/// The item at position `index` has the given value, and the rank of that value at that position is `rank`.
275///
276/// # Examples
277///
278/// ```
279/// use simple_sds_sbwt::ops::VectorIndex;
280/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
281///
282/// // Construction
283/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
284/// let wm = WaveletMatrix::from(source.clone());
285///
286/// // Iteration over values
287/// let mut iter = wm.value_iter(2);
288/// assert_eq!(WaveletMatrix::value_of(&iter), 2);
289/// assert_eq!(iter.next(), Some((0, 5)));
290/// assert_eq!(iter.next(), Some((1, 9)));
291/// assert!(iter.next().is_none());
292/// ```
293#[derive(Clone, Debug)]
294pub struct ValueIter<'a> {
295 parent: &'a WaveletMatrix,
296 value: <WaveletMatrix as Vector>::Item,
297 // The first rank we have not seen.
298 rank: usize,
299}
300
301impl<'a> Iterator for ValueIter<'a> {
302 type Item = (usize, usize);
303
304 fn next(&mut self) -> Option<Self::Item> {
305 if self.rank >= self.parent.len() {
306 return None;
307 }
308 if let Some(index) = self.parent.select(self.rank, self.value) {
309 let result = Some((self.rank, index));
310 self.rank += 1;
311 result
312 } else {
313 self.rank = self.parent.len();
314 None
315 }
316 }
317}
318
319impl<'a> FusedIterator for ValueIter<'a> {}
320
321//-----------------------------------------------------------------------------
322
323/// [`WaveletMatrix`] transformed into an iterator.
324///
325/// The type of `Item` is [`u64`].
326///
327/// # Examples
328///
329/// ```
330/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
331///
332/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
333/// let wm = WaveletMatrix::from(source.clone());
334/// let target: Vec<u64> = wm.into_iter().collect();
335/// assert_eq!(target, source);
336/// ```
337#[derive(Clone, Debug)]
338pub struct IntoIter {
339 parent: WaveletMatrix,
340 index: usize,
341}
342
343impl Iterator for IntoIter {
344 type Item = <WaveletMatrix as Vector>::Item;
345
346 fn next(&mut self) -> Option<Self::Item> {
347 if self.index >= self.parent.len() {
348 None
349 } else {
350 let result = Some(self.parent.get(self.index));
351 self.index += 1;
352 result
353 }
354 }
355
356 #[inline]
357 fn size_hint(&self) -> (usize, Option<usize>) {
358 let remaining = self.parent.len() - self.index;
359 (remaining, Some(remaining))
360 }
361}
362
363impl ExactSizeIterator for IntoIter {}
364
365impl FusedIterator for IntoIter {}
366
367impl IntoIterator for WaveletMatrix {
368 type Item = <Self as Vector>::Item;
369 type IntoIter = IntoIter;
370
371 fn into_iter(self) -> Self::IntoIter {
372 IntoIter {
373 parent: self,
374 index: 0,
375 }
376 }
377}
378
379//-----------------------------------------------------------------------------