1use std::collections::BTreeMap;
2
3use crate::{
4 counting_kmer_frequency_iterator, frequency::frequency_vector_iter, merge, unique,
5 CompressedKmerFrequencyList, CompressedKmerList, Kmer,
6};
7
8pub struct DenseAccumulator {
10 k: usize,
11 counts: Vec<usize>,
12}
13
14impl DenseAccumulator {
15 pub fn new(k: usize) -> DenseAccumulator {
17 let n = 1usize << (2 * k);
18 let mut counts = Vec::new();
19 counts.resize(n, 0);
20 DenseAccumulator { k, counts }
21 }
22
23 pub fn add<S>(&mut self, seq: &S)
25 where
26 S: AsRef<[u8]>,
27 {
28 Kmer::with_many(self.k, seq, |x| {
29 self.counts[x.0 as usize] += 1;
30 });
31 }
32
33 pub fn add_both<S>(&mut self, seq: &S)
35 where
36 S: AsRef<[u8]>,
37 {
38 Kmer::with_many_both(self.k, seq, |x, y| {
39 self.counts[x.0 as usize] += 1;
40 self.counts[y.0 as usize] += 1;
41 });
42 }
43
44 pub fn kmer_frequencies(&self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
46 frequency_vector_iter(self.counts.iter())
47 }
48}
49
50pub struct SimpleSparseAccumulator {
52 k: usize,
53 counts: BTreeMap<Kmer, usize>,
54}
55
56impl SimpleSparseAccumulator {
57 pub fn new(k: usize) -> SimpleSparseAccumulator {
59 SimpleSparseAccumulator {
60 k,
61 counts: BTreeMap::new(),
62 }
63 }
64
65 pub fn add<S>(&mut self, seq: &S)
67 where
68 S: AsRef<[u8]>,
69 {
70 Kmer::with_many(self.k, seq, |x| {
71 *self.counts.entry(x.clone()).or_insert(0) += 1;
72 });
73 }
74
75 pub fn add_both<S>(&mut self, seq: &S)
77 where
78 S: AsRef<[u8]>,
79 {
80 Kmer::with_many_both(self.k, seq, |x, y| {
81 *self.counts.entry(x.clone()).or_insert(0) += 1;
82 *self.counts.entry(y.clone()).or_insert(0) += 1;
83 });
84 }
85
86 pub fn kmer_frequencies(&self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
88 self.counts.iter().map(|(x, f)| (x.clone(), *f))
89 }
90}
91
92pub struct BigSparseAccumulator {
94 k: usize,
95 buffer_size: usize,
96 buffer: Vec<Kmer>,
97 unique_kmers: CompressedKmerList,
98 non_unique_kmers: CompressedKmerFrequencyList,
99}
100
101impl BigSparseAccumulator {
102 pub fn new(k: usize, buffer_size: usize) -> BigSparseAccumulator {
104 BigSparseAccumulator {
105 k,
106 buffer_size,
107 buffer: Vec::with_capacity(buffer_size),
108 unique_kmers: CompressedKmerList::new(),
109 non_unique_kmers: CompressedKmerFrequencyList::new(),
110 }
111 }
112
113 pub fn add<S>(&mut self, seq: &S)
115 where
116 S: AsRef<[u8]>,
117 {
118 Kmer::with_many(self.k, seq, |x| {
119 self.buffer.push(x.clone());
120 if self.buffer.len() + 2 >= self.buffer_size {
121 self.flush();
122 }
123 });
124 }
125
126 pub fn add_both<S>(&mut self, seq: &S)
128 where
129 S: AsRef<[u8]>,
130 {
131 Kmer::with_many_both(self.k, seq, |x, y| {
132 self.buffer.push(x.clone());
133 self.buffer.push(y.clone());
134 if self.buffer.len() + 2 >= self.buffer_size {
135 self.flush();
136 }
137 });
138 }
139
140 pub fn kmer_frequencies(&mut self) -> impl Iterator<Item = (Kmer, usize)> + '_ {
142 self.flush();
143 let unique_kmer_frequencies = unique(self.unique_kmers.iter());
144 let non_unique_kmer_frequencies = self.non_unique_kmers.iter();
145 merge(unique_kmer_frequencies, non_unique_kmer_frequencies)
146 }
147
148 fn flush(&mut self) {
149 if self.buffer.len() == 0 {
150 return;
151 }
152
153 self.buffer.sort();
154
155 let mut unique_kmers = CompressedKmerList::new();
156 let mut non_unique_kmers = CompressedKmerFrequencyList::new();
157 {
158 let new_kmer_frequencies = counting_kmer_frequency_iterator(self.buffer.iter());
159 let unique_kmer_frequencies = unique(self.unique_kmers.iter());
160 let non_unique_kmer_frequencies = self.non_unique_kmers.iter();
161
162 for (x, f) in merge(
163 merge(new_kmer_frequencies, unique_kmer_frequencies),
164 non_unique_kmer_frequencies,
165 ) {
166 if f == 1 {
167 unique_kmers.push(x);
168 } else {
169 non_unique_kmers.push((x, f));
170 }
171 }
172 }
173 self.buffer.clear();
174 std::mem::swap(&mut unique_kmers, &mut self.unique_kmers);
175 std::mem::swap(&mut non_unique_kmers, &mut self.non_unique_kmers);
176 }
177}
178
179#[cfg(test)]
180mod tests {
181 use super::*;
182
183 struct MiniRng {
184 x: u64,
185 }
186
187 impl MiniRng {
188 fn new(seed: u64) -> MiniRng {
189 MiniRng { x: seed }
190 }
191
192 fn rnd(&mut self) -> u64 {
193 self.x = self.x.wrapping_mul(2862933555777941757u64);
194 self.x = self.x.wrapping_add(3037000493u64);
195 self.x
196 }
197 }
198
199 fn make_some_sequence(rng: &mut MiniRng) -> String {
200 let mut s = String::new();
201 let k: usize = 17;
202 let m = (1u64 << (2 * k)) - 1;
203 for _i in 0..20 {
204 let u = rng.rnd() & m;
205 let x = Kmer::from_u64(u);
206 s += &x.render(k);
207 }
208 println!("{}", s);
209 s
210 }
211
212 #[test]
213 fn test_1() {
214 let k = 7;
215
216 let seq1 = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
217 let seq2 = "TTATATGCACATGATTTACCTCCAGTAAAGATTTCTAAAGTAAATGTAAGAAAAGGGACAAGTTCTCCAGGGAAAGAAAACCTGACAACAAAGTGTGTCTAAACTTGTGATCAAAGGTTAAATATCTACATCAAAGCCCTAATCAGCACA";
218
219 let counts1 = Kmer::frequency_vector(k, &seq1);
220 let counts2 = Kmer::frequency_vector_both(k, &seq2);
221 assert_eq!(counts1.len(), counts2.len());
222
223 let mut acc = DenseAccumulator::new(k);
224 acc.add(&seq1);
225 {
226 let mut i = 0;
227 for (x, f) in acc.kmer_frequencies() {
228 assert!(x.0 < counts1.len() as u64);
229 while (i as u64) < x.0 {
230 assert_eq!(counts1[i], 0);
231 i += 1;
232 }
233 assert_eq!(f, counts1[i]);
234 i += 1;
235 }
236 assert!(i <= counts1.len());
237 }
238 acc.add_both(&seq2);
239 {
240 let mut i = 0;
241 for (x, f) in acc.kmer_frequencies() {
242 assert!(x.0 < counts1.len() as u64);
243 while (i as u64) < x.0 {
244 assert_eq!(counts1[i] + counts2[i], 0);
245 i += 1;
246 }
247 assert_eq!(f, counts1[i] + counts2[i]);
248 i += 1;
249 }
250 assert!(i <= counts1.len());
251 }
252 }
253
254 #[test]
255 fn test_2() {
256 let k = 7;
257
258 let seq1 = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
259 let seq2 = "TTATATGCACATGATTTACCTCCAGTAAAGATTTCTAAAGTAAATGTAAGAAAAGGGACAAGTTCTCCAGGGAAAGAAAACCTGACAACAAAGTGTGTCTAAACTTGTGATCAAAGGTTAAATATCTACATCAAAGCCCTAATCAGCACA";
260
261 let counts1 = Kmer::frequency_vector(k, &seq1);
262 let counts2 = Kmer::frequency_vector_both(k, &seq2);
263 assert_eq!(counts1.len(), counts2.len());
264
265 let mut acc = SimpleSparseAccumulator::new(k);
266 acc.add(&seq1);
267 {
268 let mut i = 0;
269 for (x, f) in acc.kmer_frequencies() {
270 assert!(x.0 < counts1.len() as u64);
271 while (i as u64) < x.0 {
272 assert_eq!(counts1[i], 0);
273 i += 1;
274 }
275 assert_eq!(f, counts1[i]);
276 i += 1;
277 }
278 assert!(i <= counts1.len());
279 }
280 acc.add_both(&seq2);
281 {
282 let mut i = 0;
283 for (x, f) in acc.kmer_frequencies() {
284 assert!(x.0 < counts1.len() as u64);
285 while (i as u64) < x.0 {
286 assert_eq!(counts1[i] + counts2[i], 0);
287 i += 1;
288 }
289 assert_eq!(f, counts1[i] + counts2[i]);
290 i += 1;
291 }
292 assert!(i <= counts1.len());
293 }
294 }
295
296 #[test]
297 fn test_3() {
298 let k = 7;
299 let mut acc = BigSparseAccumulator::new(k, 1024);
300 let mut rng = MiniRng::new(19);
301 for _i in 0..10 {
302 let seq = make_some_sequence(&mut rng);
303 acc.add(&seq.as_bytes().iter());
304 }
305 for _i in 0..10 {
306 let seq = make_some_sequence(&mut rng);
307 acc.add_both(&seq.as_bytes().iter());
308 }
309 acc.flush();
310 let xfs = Vec::from_iter(acc.kmer_frequencies());
311 assert_eq!(xfs.len(), 7585);
312 let mut h = Vec::new();
313 for (_x, f) in xfs.iter() {
314 let c = *f;
315 while h.len() <= c {
316 h.push(0);
317 }
318 h[c] += 1;
319 }
320 assert_eq!(h, vec![0, 5553, 1680, 308, 37, 7]);
321 }
322}