kmers_rs/
accumulator.rs

1use std::collections::BTreeMap;
2
3use crate::{
4    counting_kmer_frequency_iterator, frequency::frequency_vector_iter, merge, unique,
5    CompressedKmerFrequencyList, CompressedKmerList, Kmer,
6};
7
8/// Accumulate k-mer frequencies for small to medium k.
9pub struct DenseAccumulator {
10    k: usize,
11    counts: Vec<usize>,
12}
13
14impl DenseAccumulator {
15    /// Construct a new accumulator.
16    pub fn new(k: usize) -> DenseAccumulator {
17        let n = 1usize << (2 * k);
18        let mut counts = Vec::new();
19        counts.resize(n, 0);
20        DenseAccumulator { k, counts }
21    }
22
23    /// Add k-mers from the forward strand of the given sequence.
24    pub fn add<S>(&mut self, seq: &S)
25    where
26        S: AsRef<[u8]>,
27    {
28        Kmer::with_many(self.k, seq, |x| {
29            self.counts[x.0 as usize] += 1;
30        });
31    }
32
33    /// Add k-mers from both strands of the given sequence.
34    pub fn add_both<S>(&mut self, seq: &S)
35    where
36        S: AsRef<[u8]>,
37    {
38        Kmer::with_many_both(self.k, seq, |x, y| {
39            self.counts[x.0 as usize] += 1;
40            self.counts[y.0 as usize] += 1;
41        });
42    }
43
44    /// Get a k-mer frequency iterator over the accumulator.
45    pub fn kmer_frequencies(&self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
46        frequency_vector_iter(self.counts.iter())
47    }
48}
49
50/// A simple sparse accumulator
51pub struct SimpleSparseAccumulator {
52    k: usize,
53    counts: BTreeMap<Kmer, usize>,
54}
55
56impl SimpleSparseAccumulator {
57    /// Construct a new sparse kmer frequency accumulator
58    pub fn new(k: usize) -> SimpleSparseAccumulator {
59        SimpleSparseAccumulator {
60            k,
61            counts: BTreeMap::new(),
62        }
63    }
64
65    /// Add k-mers from the forward strand of the given sequence.
66    pub fn add<S>(&mut self, seq: &S)
67    where
68        S: AsRef<[u8]>,
69    {
70        Kmer::with_many(self.k, seq, |x| {
71            *self.counts.entry(x.clone()).or_insert(0) += 1;
72        });
73    }
74
75    /// Add k-mers from both strands of the given sequence.
76    pub fn add_both<S>(&mut self, seq: &S)
77    where
78        S: AsRef<[u8]>,
79    {
80        Kmer::with_many_both(self.k, seq, |x, y| {
81            *self.counts.entry(x.clone()).or_insert(0) += 1;
82            *self.counts.entry(y.clone()).or_insert(0) += 1;
83        });
84    }
85
86    /// Get a k-mer frequency iterator over the accumulator.
87    pub fn kmer_frequencies(&self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
88        self.counts.iter().map(|(x, f)| (x.clone(), *f))
89    }
90}
91
92/// A sparse k-mer accumulator for large set of k-mers.
93pub struct BigSparseAccumulator {
94    k: usize,
95    buffer_size: usize,
96    buffer: Vec<Kmer>,
97    unique_kmers: CompressedKmerList,
98    non_unique_kmers: CompressedKmerFrequencyList,
99}
100
101impl BigSparseAccumulator {
102    /// Create a new sparse k-mer accumulator, buffering `buffer_size` k-mers between compactions.
103    pub fn new(k: usize, buffer_size: usize) -> BigSparseAccumulator {
104        BigSparseAccumulator {
105            k,
106            buffer_size,
107            buffer: Vec::with_capacity(buffer_size),
108            unique_kmers: CompressedKmerList::new(),
109            non_unique_kmers: CompressedKmerFrequencyList::new(),
110        }
111    }
112
113    /// Add k-mers from the forward strand of the given sequence.
114    pub fn add<S>(&mut self, seq: &S)
115    where
116        S: AsRef<[u8]>,
117    {
118        Kmer::with_many(self.k, seq, |x| {
119            self.buffer.push(x.clone());
120            if self.buffer.len() + 2 >= self.buffer_size {
121                self.flush();
122            }
123        });
124    }
125
126    /// Add k-mers from both strands of the given sequence.
127    pub fn add_both<S>(&mut self, seq: &S)
128    where
129        S: AsRef<[u8]>,
130    {
131        Kmer::with_many_both(self.k, seq, |x, y| {
132            self.buffer.push(x.clone());
133            self.buffer.push(y.clone());
134            if self.buffer.len() + 2 >= self.buffer_size {
135                self.flush();
136            }
137        });
138    }
139
140    /// Get a k-mer frequency iterator over the accumulator.
141    pub fn kmer_frequencies(&mut self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
142        self.flush();
143        let unique_kmer_frequencies = unique(self.unique_kmers.iter());
144        let non_unique_kmer_frequencies = self.non_unique_kmers.iter();
145        merge(unique_kmer_frequencies, non_unique_kmer_frequencies)
146    }
147
148    fn flush(&mut self) {
149        if self.buffer.len() == 0 {
150            return;
151        }
152
153        self.buffer.sort();
154
155        let mut unique_kmers = CompressedKmerList::new();
156        let mut non_unique_kmers = CompressedKmerFrequencyList::new();
157        {
158            let new_kmer_frequencies = counting_kmer_frequency_iterator(self.buffer.iter());
159            let unique_kmer_frequencies = unique(self.unique_kmers.iter());
160            let non_unique_kmer_frequencies = self.non_unique_kmers.iter();
161
162            for (x, f) in merge(
163                merge(new_kmer_frequencies, unique_kmer_frequencies),
164                non_unique_kmer_frequencies,
165            ) {
166                if f == 1 {
167                    unique_kmers.push(x);
168                } else {
169                    non_unique_kmers.push((x, f));
170                }
171            }
172        }
173        self.buffer.clear();
174        std::mem::swap(&mut unique_kmers, &mut self.unique_kmers);
175        std::mem::swap(&mut non_unique_kmers, &mut self.non_unique_kmers);
176    }
177}
178
179#[cfg(test)]
180mod tests {
181    use super::*;
182
183    struct MiniRng {
184        x: u64,
185    }
186
187    impl MiniRng {
188        fn new(seed: u64) -> MiniRng {
189            MiniRng { x: seed }
190        }
191
192        fn rnd(&mut self) -> u64 {
193            self.x = self.x.wrapping_mul(2862933555777941757u64);
194            self.x = self.x.wrapping_add(3037000493u64);
195            self.x
196        }
197    }
198
199    fn make_some_sequence(rng: &mut MiniRng) -> String {
200        let mut s = String::new();
201        let k: usize = 17;
202        let m = (1u64 << (2 * k)) - 1;
203        for _i in 0..20 {
204            let u = rng.rnd() & m;
205            let x = Kmer::from_u64(u);
206            s += &x.render(k);
207        }
208        println!("{}", s);
209        s
210    }
211
212    #[test]
213    fn test_1() {
214        let k = 7;
215
216        let seq1 = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
217        let seq2 = "TTATATGCACATGATTTACCTCCAGTAAAGATTTCTAAAGTAAATGTAAGAAAAGGGACAAGTTCTCCAGGGAAAGAAAACCTGACAACAAAGTGTGTCTAAACTTGTGATCAAAGGTTAAATATCTACATCAAAGCCCTAATCAGCACA";
218
219        let counts1 = Kmer::frequency_vector(k, &seq1);
220        let counts2 = Kmer::frequency_vector_both(k, &seq2);
221        assert_eq!(counts1.len(), counts2.len());
222
223        let mut acc = DenseAccumulator::new(k);
224        acc.add(&seq1);
225        {
226            let mut i = 0;
227            for (x, f) in acc.kmer_frequencies() {
228                assert!(x.0 < counts1.len() as u64);
229                while (i as u64) < x.0 {
230                    assert_eq!(counts1[i], 0);
231                    i += 1;
232                }
233                assert_eq!(f, counts1[i]);
234                i += 1;
235            }
236            assert!(i <= counts1.len());
237        }
238        acc.add_both(&seq2);
239        {
240            let mut i = 0;
241            for (x, f) in acc.kmer_frequencies() {
242                assert!(x.0 < counts1.len() as u64);
243                while (i as u64) < x.0 {
244                    assert_eq!(counts1[i] + counts2[i], 0);
245                    i += 1;
246                }
247                assert_eq!(f, counts1[i] + counts2[i]);
248                i += 1;
249            }
250            assert!(i <= counts1.len());
251        }
252    }
253
254    #[test]
255    fn test_2() {
256        let k = 7;
257
258        let seq1 = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
259        let seq2 = "TTATATGCACATGATTTACCTCCAGTAAAGATTTCTAAAGTAAATGTAAGAAAAGGGACAAGTTCTCCAGGGAAAGAAAACCTGACAACAAAGTGTGTCTAAACTTGTGATCAAAGGTTAAATATCTACATCAAAGCCCTAATCAGCACA";
260
261        let counts1 = Kmer::frequency_vector(k, &seq1);
262        let counts2 = Kmer::frequency_vector_both(k, &seq2);
263        assert_eq!(counts1.len(), counts2.len());
264
265        let mut acc = SimpleSparseAccumulator::new(k);
266        acc.add(&seq1);
267        {
268            let mut i = 0;
269            for (x, f) in acc.kmer_frequencies() {
270                assert!(x.0 < counts1.len() as u64);
271                while (i as u64) < x.0 {
272                    assert_eq!(counts1[i], 0);
273                    i += 1;
274                }
275                assert_eq!(f, counts1[i]);
276                i += 1;
277            }
278            assert!(i <= counts1.len());
279        }
280        acc.add_both(&seq2);
281        {
282            let mut i = 0;
283            for (x, f) in acc.kmer_frequencies() {
284                assert!(x.0 < counts1.len() as u64);
285                while (i as u64) < x.0 {
286                    assert_eq!(counts1[i] + counts2[i], 0);
287                    i += 1;
288                }
289                assert_eq!(f, counts1[i] + counts2[i]);
290                i += 1;
291            }
292            assert!(i <= counts1.len());
293        }
294    }
295
296    #[test]
297    fn test_3() {
298        let k = 7;
299        let mut acc = BigSparseAccumulator::new(k, 1024);
300        let mut rng = MiniRng::new(19);
301        for _i in 0..10 {
302            let seq = make_some_sequence(&mut rng);
303            acc.add(&seq.as_bytes().iter());
304        }
305        for _i in 0..10 {
306            let seq = make_some_sequence(&mut rng);
307            acc.add_both(&seq.as_bytes().iter());
308        }
309        acc.flush();
310        let xfs = Vec::from_iter(acc.kmer_frequencies());
311        assert_eq!(xfs.len(), 7585);
312        let mut h = Vec::new();
313        for (_x, f) in xfs.iter() {
314            let c = *f;
315            while h.len() <= c {
316                h.push(0);
317            }
318            h[c] += 1;
319        }
320        assert_eq!(h, vec![0, 5553, 1680, 308, 37, 7]);
321    }
322}