simple_sds_sbwt/
wavelet_matrix.rs

1//! An immutable integer vector supporting rank/select-type queries.
2//!
3//! The wavelet matrix was first described in:
4//!
5//! > Claude, Navarro, Ordóñez: The wavelet matrix: An efficient wavelet tree for large alphabets.  
6//! > Information Systems, 2015.  
7//! > DOI: [10.1016/j.is.2014.06.002](https://doi.org/10.1016/j.is.2014.06.002)
8//!
9//! See [`wm_core`] for a low-level description.
10//! As in wavelet trees, access and rank queries proceed down from level `0`, while select queries go up from level `width - 1`.
11
12use crate::int_vector::IntVector;
13use crate::ops::{Vector, Access, AccessIter, VectorIndex, Pack};
14use crate::serialize::Serialize;
15use crate::wavelet_matrix::wm_core::WMCore;
16
17use std::io::{Error, ErrorKind};
18use std::iter::FusedIterator;
19use std::io;
20
21pub mod wm_core;
22
23#[cfg(test)]
24mod tests;
25
26//-----------------------------------------------------------------------------
27
28/// An immutable integer vector supporting rank/select-type queries.
29///
30/// Each item consists of the lowest 1 to 64 bits of a [`u64`] value, as specified by the width of the vector.
31/// The vector is represented using [`WMCore`].
32/// There is also an [`IntVector`] storing the starting position of each possible item value after the reordering done by the core.
33/// Hence a `WaveletMatrix` should only be used when most values in `0..(1 << width)` are in use.
34/// The maximum length of the vector is approximately [`usize::MAX`] items.
35///
36/// A `WaveletMatrix` can be built from a [`Vec`] of unsigned integers using the [`From`] trait.
37/// The construction requires several passes over the input and uses the input vector as working space.
38/// Using smaller integer types helps reducing the space overhead during construction.
39///
40/// `WaveletMatrix` implements the following `simple_sds` traits:
41/// * Basic functionality: [`Vector`], [`Access`]
42/// * Queries and operations: [`VectorIndex`]
43/// * Serialization: [`Serialize`]
44///
45/// Overridden default implementations:
46/// * [`VectorIndex::contains`] has a simple constant-time implementation.
47/// * [`VectorIndex::inverse_select`] is effectively the same as [`Access::get`].
48///
49/// # Examples
50///
51/// ```
52/// use simple_sds_sbwt::ops::{Vector, Access, VectorIndex};
53/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
54///
55/// // Construction
56/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
57/// let wm = WaveletMatrix::from(source.clone());
58///
59/// // Access
60/// assert_eq!(wm.len(), source.len());
61/// assert_eq!(wm.width(), 3);
62/// for i in 0..wm.len() {
63///     assert_eq!(wm.get(i), source[i]);
64/// }
65/// assert!(wm.iter().eq(source.iter().cloned()));
66///
67/// // Rank
68/// assert_eq!(wm.rank(5, 3), 1);
69/// assert_eq!(wm.rank(10, 2), 2);
70///
71/// // Select
72/// assert_eq!(wm.select(2, 1), Some(4));
73/// assert!(wm.select(1, 7).is_none());
74/// assert_eq!(wm.select_iter(1, 2).next(), Some((1, 9)));
75///
76/// // Inverse select
77/// let index = 7;
78/// let inverse = wm.inverse_select(index).unwrap();
79/// assert_eq!(inverse, (0, 5));
80/// assert_eq!(wm.select(inverse.0, inverse.1), Some(index));
81///
82/// // Predecessor / successor
83/// assert!(wm.predecessor(1, 3).next().is_none());
84/// assert_eq!(wm.predecessor(2, 3).next(), Some((0, 2)));
85/// assert_eq!(wm.successor(12, 0).next(), Some((1, 12)));
86/// assert!(wm.successor(13, 0).next().is_none());
87/// ```
88///
89/// # Notes
90///
91/// * `WaveletMatrix` never panics from I/O errors.
92#[derive(Clone, Debug, PartialEq, Eq)]
93pub struct WaveletMatrix {
94    len: usize,
95    data: WMCore,
96    // Starting offset of each value after reordering by the wavelet matrix, or `len` if the value does not exist.
97    first: IntVector,
98}
99
100impl WaveletMatrix {
101    // Returns the starting offset of the value after reordering.
102    fn start(&self, value: <Self as Vector>::Item) -> usize {
103        self.first.get(value as usize) as usize
104    }
105
106    // Computes `first` from an iterator.
107    fn start_offsets<Iter: Iterator<Item = u64>>(iter: Iter, len: usize, max_value: u64) -> IntVector {
108        // Count the number of occurrences of each value.
109        let mut counts: Vec<(u64, usize)> = Vec::with_capacity((max_value + 1) as usize);
110        for i in 0..=max_value {
111            counts.push((i, 0));
112        }
113        for value in iter {
114            counts[value as usize].1 += 1;
115        }
116
117        // Sort the counts in reverse bit order to get the order below the final level.
118        counts.sort_unstable_by_key(|(value, _)| value.reverse_bits());
119
120        // Replace occurrence counts with the prefix sum in the sorted order.
121        let mut cumulative = 0;
122        for (_, count) in counts.iter_mut() {
123            if *count == 0 {
124                *count = len;
125            } else {
126                let increment = *count;
127                *count = cumulative;
128                cumulative += increment;
129            }
130        }
131
132        // Sort the prefix sums by symbol to get starting offsets and then return the offsets.
133        counts.sort_unstable_by_key(|(value, _)| *value);
134        let mut result: IntVector = counts.into_iter().map(|(_, offset)| offset).collect();
135        result.pack();
136        result
137    }
138}
139
140macro_rules! wavelet_matrix_from {
141    ($t:ident) => {
142        impl From<Vec<$t>> for WaveletMatrix {
143            fn from(source: Vec<$t>) -> Self {
144                let len = source.len();
145                let max_value = source.iter().cloned().max().unwrap_or(0);
146                let first = Self::start_offsets(source.iter().map(|x| *x as u64), source.len(), max_value as u64);
147                let data = WMCore::from(source);
148                WaveletMatrix { len, data, first, }
149            }
150        }
151    }
152}
153
154wavelet_matrix_from!(u8);
155wavelet_matrix_from!(u16);
156wavelet_matrix_from!(u32);
157wavelet_matrix_from!(u64);
158wavelet_matrix_from!(usize);
159
160//-----------------------------------------------------------------------------
161
162impl Vector for WaveletMatrix {
163    type Item = u64;
164
165    #[inline]
166    fn len(&self) -> usize {
167        self.len
168    }
169
170    #[inline]
171    fn width(&self) -> usize {
172        self.data.width()
173    }
174
175    #[inline]
176    fn max_len(&self) -> usize {
177        usize::MAX
178    }
179}
180
181impl<'a> Access<'a> for WaveletMatrix {
182    type Iter = AccessIter<'a, Self>;
183
184    fn get(&self, index: usize) -> <Self as Vector>::Item {
185        self.inverse_select(index).unwrap().1
186    }
187
188    fn iter(&'a self) -> Self::Iter {
189        Self::Iter::new(self)
190    }
191}
192
193impl<'a> VectorIndex<'a> for WaveletMatrix {
194    type ValueIter = ValueIter<'a>;
195
196    fn contains(&self, value: <Self as Vector>::Item) -> bool {
197        (value as usize) < self.first.len() && self.start(value) < self.len()
198    }
199
200    fn rank(&self, index: usize, value: <Self as Vector>::Item) -> usize {
201        if !self.contains(value) {
202            return 0;
203        }
204        self.data.map_down_with(index, value) - self.start(value)
205    }
206
207    fn inverse_select(&self, index: usize) -> Option<(usize, <Self as Vector>::Item)> {
208        self.data.map_down(index).map(|(index, value)| (index - self.start(value), value))
209    }
210
211    fn value_iter(&'a self, value: <Self as Vector>::Item) -> Self::ValueIter {
212        Self::ValueIter {
213            parent: self,
214            value,
215            rank: 0,
216        }
217    }
218
219    fn value_of(iter: &Self::ValueIter) -> <Self as Vector>::Item {
220        iter.value
221    }
222
223    fn select(&self, rank: usize, value: <Self as Vector>::Item) -> Option<usize> {
224        if !self.contains(value) {
225            return None;
226        }
227        self.data.map_up_with(self.start(value) + rank, value)
228    }
229
230    fn select_iter(&'a self, rank: usize, value: <Self as Vector>::Item) -> Self::ValueIter {
231        Self::ValueIter {
232            parent: self,
233            value,
234            rank,
235        }
236    }
237}
238
239impl Serialize for WaveletMatrix {
240    fn serialize_header<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
241        self.len.serialize(writer)
242    }
243
244    fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
245        self.data.serialize(writer)?;
246        self.first.serialize(writer)?;
247        Ok(())
248    }
249
250    fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
251        let len = usize::load(reader)?;
252        let data = WMCore::load(reader)?;
253        if data.len() != len {
254            return Err(Error::new(ErrorKind::InvalidData, "Core length does not match wavelet matrix length"));
255        }
256
257        let first = IntVector::load(reader)?;
258        Ok(WaveletMatrix { len, data, first, })
259    }
260
261    fn size_in_elements(&self) -> usize {
262        let mut result = self.len.size_in_elements();
263        result += self.data.size_in_elements();
264        result += self.first.size_in_elements();
265        result
266    }
267}
268
269//-----------------------------------------------------------------------------
270
271/// A read-only iterator over the occurrences of a specific value in [`WaveletMatrix`].
272///
273/// The type of `Item` is [`(usize, usize)`] representing a pair (rank, index).
274/// The item at position `index` has the given value, and the rank of that value at that position is `rank`.
275///
276/// # Examples
277///
278/// ```
279/// use simple_sds_sbwt::ops::VectorIndex;
280/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
281///
282/// // Construction
283/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
284/// let wm = WaveletMatrix::from(source.clone());
285///
286/// // Iteration over values
287/// let mut iter = wm.value_iter(2);
288/// assert_eq!(WaveletMatrix::value_of(&iter), 2);
289/// assert_eq!(iter.next(), Some((0, 5)));
290/// assert_eq!(iter.next(), Some((1, 9)));
291/// assert!(iter.next().is_none());
292/// ```
293#[derive(Clone, Debug)]
294pub struct ValueIter<'a> {
295    parent: &'a WaveletMatrix,
296    value: <WaveletMatrix as Vector>::Item,
297    // The first rank we have not seen.
298    rank: usize,
299}
300
301impl<'a> Iterator for ValueIter<'a> {
302    type Item = (usize, usize);
303
304    fn next(&mut self) -> Option<Self::Item> {
305        if self.rank >= self.parent.len() {
306            return None;
307        }
308        if let Some(index) = self.parent.select(self.rank, self.value) {
309            let result = Some((self.rank, index));
310            self.rank += 1;
311            result
312        } else {
313            self.rank = self.parent.len();
314            None
315        }
316    }
317}
318
319impl<'a> FusedIterator for ValueIter<'a> {}
320
321//-----------------------------------------------------------------------------
322
323/// [`WaveletMatrix`] transformed into an iterator.
324///
325/// The type of `Item` is [`u64`].
326///
327/// # Examples
328///
329/// ```
330/// use simple_sds_sbwt::wavelet_matrix::WaveletMatrix;
331///
332/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
333/// let wm = WaveletMatrix::from(source.clone());
334/// let target: Vec<u64> = wm.into_iter().collect();
335/// assert_eq!(target, source);
336/// ```
337#[derive(Clone, Debug)]
338pub struct IntoIter {
339    parent: WaveletMatrix,
340    index: usize,
341}
342
343impl Iterator for IntoIter {
344    type Item = <WaveletMatrix as Vector>::Item;
345
346    fn next(&mut self) -> Option<Self::Item> {
347        if self.index >= self.parent.len() {
348            None
349        } else {
350            let result = Some(self.parent.get(self.index));
351            self.index += 1;
352            result
353        }
354    }
355
356    #[inline]
357    fn size_hint(&self) -> (usize, Option<usize>) {
358        let remaining = self.parent.len() - self.index;
359        (remaining, Some(remaining))
360    }
361}
362
363impl ExactSizeIterator for IntoIter {}
364
365impl FusedIterator for IntoIter {}
366
367impl IntoIterator for WaveletMatrix {
368    type Item = <Self as Vector>::Item;
369    type IntoIter = IntoIter;
370
371    fn into_iter(self) -> Self::IntoIter {
372        IntoIter {
373            parent: self,
374            index: 0,
375        }
376    }
377}
378
379//-----------------------------------------------------------------------------