simple_sds_sbwt/wavelet_matrix/
wm_core.rs

1//! The core wavelet matrix structure.
2//!
3//! A wavelet matrix core is a vector of [`u64`] items.
4//! It represents a bidirectional mapping between the items in the original vector and the same items in the reordered vector.
5//! The reordered vector is based on sorting the items stably by their reverse binary representations.
6//! Mapping down means mapping from the original vector to the reordered vector, while mapping up means the reverse.
7//!
8//! This structure can be understood as the positional BWT (PBWT) of the binary sequences corresponding to the integers.
9//! Each level in the core wavelet matrix corresponds to a column in the PBWT.
10//! Bitvector `bv[level]` on level `level` represent bit values
11//!
12//! > `1 << (width - 1 - level)`.
13//!
14//! If `bv[level][i] == 0`, position `i` on level `level` it maps to position
15//!
16//! > `bv[level].rank_zero(i)`
17//!
18//! on level `level + 1`.
19//! Otherwise it maps to position
20//!
21//! > `bv[level].count_zeros() + bv[level].rank(i)`.
22
23use crate::bit_vector::BitVector;
24use crate::ops::{BitVec, Rank, Select, SelectZero, PredSucc};
25use crate::raw_vector::{RawVector, PushRaw};
26use crate::serialize::Serialize;
27use crate::bits;
28
29use std::io::{Error, ErrorKind};
30use std::{cmp, io};
31
32//-----------------------------------------------------------------------------
33
34// FIXME Should this be generic over the bitvector type? How to handle construction?
35/// A bidirectional mapping between the original vector and a stably sorted vector of the same items.
36///
37/// Each item consists of the lowest 1 to 64 bits of a [`u64`] value, as specified by the width of the vector.
38/// The width determines the number of levels in the `WMCore`.
39///
40/// A `WMCore` can be built from a [`Vec`] of unsigned integers using the [`From`] trait.
41/// The construction requires several passes over the input and uses the input vector as working space.
42/// Using smaller integer types helps reducing the space overhead during construction.
43///
44/// # Examples
45///
46/// ```
47/// use simple_sds_sbwt::wavelet_matrix::wm_core::WMCore;
48///
49/// // Construction
50/// let source: Vec<u64> = vec![1, 0, 3, 1, 1, 2, 4, 5, 1, 2, 1, 7, 0, 1];
51/// let mut reordered: Vec<u64> = source.clone();
52/// reordered.sort_by_key(|x| x.reverse_bits());
53///
54/// // Construction
55/// let core = WMCore::from(source.clone());
56/// assert_eq!(core.len(), source.len());
57/// assert_eq!(core.width(), 3);
58///
59/// // Map down
60/// for i in 0..core.len() {
61///     let mapped = core.map_down_with(i, source[i]);
62///     assert_eq!(reordered[mapped], source[i]);
63///     assert_eq!(core.map_down(i), Some((mapped, source[i])));
64/// }
65///
66/// // Map up
67/// for i in 0..core.len() {
68///     let mapped = core.map_up_with(i, reordered[i]).unwrap();
69///     assert_eq!(source[mapped], reordered[i]);
70///     assert_eq!(core.map_down_with(mapped, source[mapped]), i);
71/// }
72/// ```
73#[derive(Clone, Debug, PartialEq, Eq)]
74pub struct WMCore {
75    levels: Vec<BitVector>,
76}
77
78impl WMCore {
79    /// Returns the length of each level in the structure.
80    pub fn len(&self) -> usize {
81        self.levels[0].len()
82    }
83
84    /// Returns the number of levels in the structure.
85    pub fn width(&self) -> usize {
86        self.levels.len()
87    }
88
89    /// Maps the item at the given position down.
90    ///
91    /// Returns the position of the item in the reordered vector and the value of the item, or [`None`] if the position is invalid.
92    pub fn map_down(&self, index: usize) -> Option<(usize, u64)> {
93        if index >= self.len() {
94            return None;
95        }
96
97        let mut index = index;
98        let mut value = 0;
99        for level in 0..self.width() {
100            if self.levels[level].get(index) {
101                index = self.map_down_one(index, level);
102                value += self.bit_value(level);
103            } else {
104                index = self.map_down_zero(index, level);
105            }
106        }
107
108        Some((index, value))
109    }
110
111    /// Maps down with the given value.
112    ///
113    /// Returns the position in the reordered vector.
114    /// This is the number of occurrences of `value` before `index` in the original vector, plus the number of items that map before `value` in the reordering.
115    ///
116    /// # Arguments
117    ///
118    /// * `index`: Position in the original vector.
119    /// * `value`: Value of an item.
120    pub fn map_down_with(&self, index: usize, value: u64) -> usize {
121        let mut index = cmp::min(index, self.len());
122        for level in 0..self.width() {
123            if value & self.bit_value(level) != 0 {
124                index = self.map_down_one(index, level);
125            } else {
126                index = self.map_down_zero(index, level);
127            }
128        }
129
130        index
131    }
132
133    /// Maps two positions down at once.
134    ///
135    /// Returns the positions in the reordered vector.
136    /// Because [`Self::map_down_with`] is constrained by memory latency, this can be faster than doing two independent queries.
137    ///
138    /// # Arguments
139    ///
140    /// * `first`: A position in the original vector.
141    /// * `second`: A position in the original vector.
142    /// * `value`: Value of an item.
143    pub fn map_down_with_two_positions(&self, first: usize, second: usize, value: u64) -> (usize, usize) {
144        let mut first = cmp::min(first, self.len());
145        let mut second = cmp::min(second, self.len());
146        for level in 0..self.width() {
147            if value & self.bit_value(level) != 0 {
148                first = self.map_down_one(first, level);
149                second = self.map_down_one(second, level);
150            } else {
151                first = self.map_down_zero(first, level);
152                second = self.map_down_zero(second, level);
153            }
154        }
155
156        (first, second)
157    }
158
159    /// Maps up with the given value.
160    ///
161    /// Returns the position in the original vector, or [`None`] if there is no such position.
162    /// This is the position where [`Self::map_down`] would return `(index, value)`.
163    ///
164    /// # Arguments
165    ///
166    /// * `index`: Position in the reordered vector.
167    /// * `value`: Value of an item.
168    pub fn map_up_with(&self, index: usize, value: u64) -> Option<usize> {
169        let mut index = index;
170        for level in (0..self.width()).rev() {
171            if value & self.bit_value(level) != 0 {
172                index = self.map_up_one(index, level)?;
173            } else {
174                index = self.map_up_zero(index, level)?;
175            }
176        }
177
178        Some(index)
179    }
180
181    // Returns the bit value for the given level.
182    fn bit_value(&self, level: usize) -> u64 {
183        1 << (self.width() - 1 - level)
184    }
185
186    // Maps the index to the next level with a set bit.
187    fn map_down_one(&self, index: usize, level: usize) -> usize {
188        self.levels[level].count_zeros() + self.levels[level].rank(index)
189    }
190
191    // Maps the index to the next level with an unset bit.
192    fn map_down_zero(&self, index: usize, level: usize) -> usize {
193        self.levels[level].rank_zero(index)
194    }
195
196    // Maps the index from the next level with a set bit.
197    fn map_up_one(&self, index: usize, level: usize) -> Option<usize> {
198        self.levels[level].select(index - self.levels[level].count_zeros())
199    }
200
201    // Maps the index from the next level with an unset bit.
202    fn map_up_zero(&self, index: usize, level: usize) -> Option<usize> {
203        self.levels[level].select_zero(index)
204    }
205
206    // Initializes the support structures for the bitvectors.
207    fn init_support(&mut self) {
208        for bv in self.levels.iter_mut() {
209            bv.enable_rank();
210            bv.enable_select();
211            bv.enable_select_zero();
212            bv.enable_pred_succ();
213        }
214    }
215}
216
217//-----------------------------------------------------------------------------
218
219macro_rules! wm_core_from {
220    ($t:ident) => {
221        impl From<Vec<$t>> for WMCore {
222            fn from(source: Vec<$t>) -> Self {
223                let mut source = source;
224                let max_value = source.iter().cloned().max().unwrap_or(0);
225                let width = bits::bit_len(max_value as u64);
226
227                let mut levels: Vec<BitVector> = Vec::new();
228                for level in 0..width {
229                    let bit_value: $t = 1 << (width - 1 - level);
230                    let mut zeros: Vec<$t> = Vec::new();
231                    let mut ones: Vec<$t> = Vec::new();
232                    let mut raw_data = RawVector::with_capacity(source.len());
233
234                    // Determine if the current bit is set in each value.
235                    for value in source.iter() {
236                        if value & bit_value != 0 {
237                            ones.push(*value);
238                            raw_data.push_bit(true);
239                        } else {
240                            zeros.push(*value);
241                            raw_data.push_bit(false);
242                        }
243                    }
244
245                    // Sort the values stably by the current bit.
246                    source.clear();
247                    source.extend(zeros);
248                    source.extend(ones);
249        
250                    // Create the bitvector for the current level.
251                    levels.push(BitVector::from(raw_data));
252                }
253
254                let mut result = WMCore { levels, };
255                result.init_support();
256                result
257            }
258        }
259    }
260}
261
262wm_core_from!(u8);
263wm_core_from!(u16);
264wm_core_from!(u32);
265wm_core_from!(u64);
266wm_core_from!(usize);
267
268//-----------------------------------------------------------------------------
269
270impl Serialize for WMCore {
271    fn serialize_header<T: io::Write>(&self, _: &mut T) -> io::Result<()> {
272        Ok(())
273    }
274
275    fn serialize_body<T: io::Write>(&self, writer: &mut T) -> io::Result<()> {
276        let width = self.width();
277        width.serialize(writer)?;
278        for bv in self.levels.iter() {
279            bv.serialize(writer)?;
280        }
281        Ok(())
282    }
283
284    fn load<T: io::Read>(reader: &mut T) -> io::Result<Self> {
285        let width = usize::load(reader)?;
286        if width == 0 || width > bits::WORD_BITS {
287            return Err(Error::new(ErrorKind::InvalidData, "Invalid width"));
288        }
289
290        let mut len: Option<usize> = None;
291        let mut levels: Vec<BitVector> = Vec::with_capacity(width);
292        for _ in 0..width {
293            let bv = BitVector::load(reader)?;
294            match len {
295                Some(len) => {
296                    if bv.len() != len {
297                        return Err(Error::new(ErrorKind::InvalidData, "Invalid bitvector length"));
298                    }
299                },
300                None => {
301                    len = Some(bv.len());
302                },
303            }
304            levels.push(bv);
305        }
306        let mut result = WMCore { levels, };
307        result.init_support();
308        Ok(result)
309    }
310
311    fn size_in_elements(&self) -> usize {
312        let mut result = 1; // Width.
313        for bv in self.levels.iter() {
314            result += bv.size_in_elements();
315        }
316        result
317    }
318}
319
320//-----------------------------------------------------------------------------
321
322#[cfg(test)]
323mod tests {
324    use super::*;
325
326    use crate::{internal, serialize};
327
328    fn reordered_vector(original: &[u64]) -> Vec<u64> {
329        let mut result = Vec::from(original);
330        result.sort_unstable_by_key(|x| x.reverse_bits());
331        result
332    }
333
334    fn check_core(core: &WMCore, original: &[u64], reordered: &[u64]) {
335        assert_eq!(core.len(), original.len(), "Invalid WMCore length");
336
337        for i in 0..core.len() {
338            let mapped = core.map_down_with(i, original[i]);
339            assert_eq!(reordered[mapped], original[i], "Invalid value at the mapped down position from {}", i);
340            assert_eq!(core.map_down(i), Some((mapped, original[i])), "The map down functions are inconsistent at {}", i);
341        }
342
343        for i in 0..core.len() {
344            let mapped = core.map_up_with(i, reordered[i]).unwrap();
345            assert_eq!(original[mapped], reordered[i], "Invalid value at the mapped up position from {}", i);
346            assert_eq!(core.map_down_with(mapped, original[mapped]), i, "Did not get the original position after mapping up and down from {}", i);
347        }
348
349        let first = internal::random_queries(core.len(), core.len());
350        let second = internal::random_queries(core.len(), core.len());
351        let values = internal::random_vector(core.len(), core.width());
352        for i in 0..core.len() {
353            let mapped = core.map_down_with_two_positions(first[i], second[i], values[i]);
354            assert_eq!(mapped.0, core.map_down_with(first[i], values[i]), "Invalid first mapped position at {}, value {}", first[i], values[i]);
355            assert_eq!(mapped.1, core.map_down_with(second[i], values[i]), "Invalid second mapped position at {}, value {}", second[i], values[i]);
356        }
357    }
358
359    #[test]
360    fn empty_core() {
361        let original: Vec<u64> = Vec::new();
362        let reordered = reordered_vector(&original);
363        let core = WMCore::from(original.clone());
364        check_core(&core, &original, &reordered);
365    }
366
367    #[test]
368    fn non_empty_core() {
369        let original = internal::random_vector(322, 7);
370        let reordered = reordered_vector(&original);
371        let core = WMCore::from(original.clone());
372        check_core(&core, &original, &reordered);
373    }
374
375    #[test]
376    fn serialize_core() {
377        let original = internal::random_vector(286, 6);
378        let core = WMCore::from(original);
379        serialize::test(&core, "wm-core", None, true);
380    }
381}
382
383//-----------------------------------------------------------------------------