kmers_rs/
list.rs

1use std::io::Read;
2
3use crate::{
4    vbyte::{Decoder8, Encoder8},
5    Kmer,
6};
7
8/// A compressed representation for a sorted list of k-mers.
9pub struct CompressedKmerList {
10    bytes: Vec<u8>,
11    count: usize,
12    last: Kmer,
13}
14
15impl CompressedKmerList {
16    /// Create an empty CompressedKmerList
17    pub fn new() -> CompressedKmerList {
18        CompressedKmerList {
19            bytes: Vec::new(),
20            count: 0,
21            last: Kmer(0),
22        }
23    }
24
25    /// Return the number of k-mers in the list
26    pub fn len(&self) -> usize {
27        self.count
28    }
29
30    /// Add a k-mer to the compressed list.
31    ///
32    /// The k-mer `x` must be greater than or equal to the last k-mer on the list.
33    pub fn push(&mut self, x: Kmer) {
34        assert!(x >= self.last);
35        let d = x.0 - self.last.0;
36        Encoder8::encode(d, &mut self.bytes).unwrap();
37        self.count += 1;
38        self.last = x;
39    }
40
41    /// Create an iterator over the compressed list.
42    pub fn iter(&self) -> CompressedKmerListIterator<'_, impl Iterator<Item = &'_ u8>> {
43        CompressedKmerListIterator::new(self.bytes.iter())
44    }
45
46    /// Create a new k-mer list from an iterator.
47    pub fn from_iter<Src>(src: Src) -> CompressedKmerList
48    where
49        Src: Iterator<Item = Kmer>,
50    {
51        let mut res = CompressedKmerList::new();
52        for x in src {
53            res.push(x);
54        }
55        res
56    }
57}
58
59/// Iterate over a compressed k-mer list
60pub struct CompressedKmerListIterator<'a, Src>
61where
62    Src: Iterator<Item = &'a u8>,
63{
64    bytes: IteratorReadAdaptor<'a, Src>,
65    last: Kmer,
66}
67
68impl<'a, Src> CompressedKmerListIterator<'a, Src>
69where
70    Src: Iterator<Item = &'a u8>,
71{
72    /// Create an iterator over the compressed list.
73    pub fn new(bytes: Src) -> CompressedKmerListIterator<'a, Src> {
74        CompressedKmerListIterator {
75            bytes: IteratorReadAdaptor::new(bytes),
76            last: Kmer(0),
77        }
78    }
79}
80
81impl<'a, Src> Iterator for CompressedKmerListIterator<'a, Src>
82where
83    Src: Iterator<Item = &'a u8>,
84{
85    type Item = Kmer;
86    fn next(&mut self) -> Option<Self::Item> {
87        match Decoder8::decode(&mut self.bytes).unwrap() {
88            None => None,
89            Some(d) => {
90                self.last.0 += d;
91                Some(self.last.clone())
92            }
93        }
94    }
95}
96
97/// A compressed representation for a sorted list of k-mer and count pairs.
98pub struct CompressedKmerFrequencyList {
99    bytes: Vec<u8>,
100    count: usize,
101    last: Kmer,
102}
103
104impl CompressedKmerFrequencyList {
105    /// Create an empty CompressedKmerFrequencyList
106    pub fn new() -> CompressedKmerFrequencyList {
107        CompressedKmerFrequencyList {
108            bytes: Vec::new(),
109            count: 0,
110            last: Kmer(0),
111        }
112    }
113
114    /// Return the number of k-mer, frequency pairs in the list
115    pub fn len(&self) -> usize {
116        self.count
117    }
118
119    /// Add a k-mer and count to the compressed list.
120    ///
121    /// The k-mer `x` must be greater than or equal to the last k-mer on the list.
122    pub fn push(&mut self, x_f: (Kmer, usize)) {
123        let (x, f) = x_f;
124        assert!(x >= self.last);
125        let d = x.0 - self.last.0;
126        Encoder8::encode(d, &mut self.bytes).unwrap();
127        Encoder8::encode(f as u64, &mut self.bytes).unwrap();
128        self.count += 1;
129        self.last = x;
130    }
131
132    /// Create an iterator over the compressed list.
133    pub fn iter(&self) -> CompressedKmerFrequencyListIterator<'_, impl Iterator<Item = &'_ u8>> {
134        CompressedKmerFrequencyListIterator::new(self.bytes.iter())
135    }
136
137    /// Create a new k-mer list from an iterator.
138    pub fn from_iter<Src>(src: Src) -> CompressedKmerFrequencyList
139    where
140        Src: Iterator<Item = (Kmer, usize)>,
141    {
142        let mut res = CompressedKmerFrequencyList::new();
143        for x_f in src {
144            res.push(x_f);
145        }
146        res
147    }
148}
149
150/// Iterate over a compressed k-mer list
151pub struct CompressedKmerFrequencyListIterator<'a, Src>
152where
153    Src: Iterator<Item = &'a u8>,
154{
155    bytes: IteratorReadAdaptor<'a, Src>,
156    last: Kmer,
157}
158
159impl<'a, Src> CompressedKmerFrequencyListIterator<'a, Src>
160where
161    Src: Iterator<Item = &'a u8>,
162{
163    /// Create an iterator over the compressed list.
164    pub fn new(bytes: Src) -> CompressedKmerFrequencyListIterator<'a, Src> {
165        CompressedKmerFrequencyListIterator {
166            bytes: IteratorReadAdaptor::new(bytes),
167            last: Kmer(0),
168        }
169    }
170}
171
172impl<'a, Src> Iterator for CompressedKmerFrequencyListIterator<'a, Src>
173where
174    Src: Iterator<Item = &'a u8>,
175{
176    type Item = (Kmer, usize);
177    fn next(&mut self) -> Option<Self::Item> {
178        match Decoder8::decode(&mut self.bytes).unwrap() {
179            None => None,
180            Some(d) => {
181                self.last.0 += d;
182                let f = Decoder8::decode(&mut self.bytes).unwrap().unwrap();
183                Some((self.last.clone(), f as usize))
184            }
185        }
186    }
187}
188
189/// Convert an iterator over bytes to a [`Read`](std::io::Read).
190pub struct IteratorReadAdaptor<'a, Src>
191where
192    Src: Iterator<Item = &'a u8>,
193{
194    itr: Src,
195}
196
197impl<'a, Src> IteratorReadAdaptor<'a, Src>
198where
199    Src: Iterator<Item = &'a u8>,
200{
201    /// Convert an iterator over bytes to a [`Read`](std::io::Read).
202    pub fn new(src: Src) -> IteratorReadAdaptor<'a, Src> {
203        IteratorReadAdaptor { itr: src }
204    }
205}
206
207impl<'a, Src> Read for IteratorReadAdaptor<'a, Src>
208where
209    Src: Iterator<Item = &'a u8>,
210{
211    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
212        if buf.len() == 0 {
213            return Ok(0);
214        }
215
216        let mut count = 0;
217        while let Some(b) = self.itr.next() {
218            buf[count] = *b;
219            count += 1;
220            if count == buf.len() {
221                break;
222            }
223        }
224        Ok(count)
225    }
226}
227
228#[cfg(test)]
229mod tests {
230    use super::*;
231
232    struct MiniRng {
233        x: u64,
234    }
235
236    impl MiniRng {
237        fn new(seed: u64) -> MiniRng {
238            MiniRng { x: seed }
239        }
240
241        fn rnd(&mut self) -> u64 {
242            self.x = self.x.wrapping_mul(2862933555777941757u64);
243            self.x = self.x.wrapping_add(3037000493u64);
244            self.x
245        }
246
247        fn freq(&mut self) -> u64 {
248            let mut f: f64 = 0.0;
249            let mut u = self.rnd();
250            while u > 0 {
251                if u & 1 == 1 {
252                    f += 1.0;
253                }
254                f /= 2.0;
255                u >>= 1;
256            }
257            (1.0 / f).floor() as u64
258        }
259    }
260
261    #[test]
262    fn test_1() {
263        let k = 7;
264        let m = (1u64 << (2 * k)) - 1;
265        let mut rng = MiniRng::new(19);
266        let mut xs = Vec::new();
267        for _i in 0..100 {
268            let x = Kmer(rng.rnd() & m);
269            xs.push(x);
270        }
271        xs.sort();
272        let c = CompressedKmerList::from_iter(xs.iter().map(|x| x.clone()));
273        assert_eq!(c.len(), xs.len());
274        let ys = Vec::from_iter(c.iter());
275        assert_eq!(ys, xs);
276    }
277
278    #[test]
279    fn test_2() {
280        let k = 7;
281        let m = (1u64 << (2 * k)) - 1;
282        let mut rng = MiniRng::new(19);
283        let mut xs = Vec::new();
284        for _i in 0..100 {
285            let x = Kmer(rng.rnd() & m);
286            let f = rng.freq();
287            xs.push((x, f as usize));
288        }
289        xs.sort();
290        let c = CompressedKmerFrequencyList::from_iter(xs.iter().map(|(x,f)| (x.clone(), *f)));
291        assert_eq!(c.len(), xs.len());
292        let ys = Vec::from_iter(c.iter());
293        assert_eq!(ys, xs);
294    }}