1
2#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Debug)]
4pub struct Kmer(pub u64);
5
6impl Kmer {
7 pub fn from_u64(value: u64) -> Kmer {
9 Kmer(value)
10 }
11
12 pub fn make(seq: &str) -> Option<Kmer> {
17 if seq.len() > 32 {
18 return None;
19 }
20 let mut x = Kmer(0);
21 for c in seq.chars() {
22 let b = Kmer::base(c)?;
23 x.0 = (x.0 << 2) | b.0;
24 }
25 Some(x)
26 }
27
28 pub fn make_many(k: usize, seq: &str) -> Vec<Kmer> {
33 let mut xs: Vec<Kmer> = Vec::new();
34 Self::with_many(k, &seq, |x| xs.push(x.clone()));
35 xs
36 }
37
38 pub fn with_many<S, F>(k: usize, seq: &S, mut f: F) -> ()
41 where
42 S: AsRef<[u8]>,
43 F: FnMut(&Kmer),
44 {
45 let mut i = 0;
46 let msk: u64 = (1 << (2 * k)) - 1;
47 let mut x = Kmer(0);
48 for c in seq.as_ref() {
49 match Kmer::byte(*c) {
50 None => {
51 i = 0;
52 x.0 = 0;
53 }
54 Some(b) => {
55 x.0 = (x.0 << 2) | b.0;
56 i += 1;
57 if i == k {
58 x.0 &= msk;
59 f(&x);
60 i -= 1;
61 }
62 }
63 }
64 }
65 }
66
67 pub fn with_many_both<S, F>(k: usize, seq: &S, mut f: F) -> ()
70 where
71 S: AsRef<[u8]>,
72 F: FnMut(&Kmer, &Kmer),
73 {
74 let shift = 2 * (k - 1);
75 let msk: u64 = (1 << (2 * k)) - 1;
76 let mut x = Kmer(0);
77 let mut y = Kmer(0);
78 let mut i = 0;
79 for c in seq.as_ref() {
80 match Kmer::byte(*c) {
81 None => {
82 i = 0;
83 x.0 = 0;
84 y.0 = 0;
85 }
86 Some(b) => {
87 x.0 = (x.0 << 2) | b.0;
88 y.0 = (y.0 >> 2) | ((3 - b.0) << shift);
89 i += 1;
90 if i == k {
91 x.0 &= msk;
92 f(&x, &y);
93 i -= 1;
94 }
95 }
96 }
97 }
98 }
99
100 #[inline]
102 pub fn base(c: char) -> Option<Kmer> {
103 match c {
104 'A' | 'a' => Some(Kmer(0)),
105 'C' | 'c' => Some(Kmer(1)),
106 'G' | 'g' => Some(Kmer(2)),
107 'T' | 't' | 'U' | 'u' => Some(Kmer(3)),
108 _ => None,
109 }
110 }
111
112 #[inline]
114 pub fn byte(c: u8) -> Option<Kmer> {
115 match c {
116 b'A' | b'a' => Some(Kmer(0)),
117 b'C' | b'c' => Some(Kmer(1)),
118 b'G' | b'g' => Some(Kmer(2)),
119 b'T' | b't' | b'U' | b'u' => Some(Kmer(3)),
120 _ => None,
121 }
122 }
123
124 #[inline]
126 pub fn rev(&self, k: usize) -> Kmer {
127 const M2: u64 = 0x3333333333333333;
128 const M3: u64 = 0x0F0F0F0F0F0F0F0F;
129 const M4: u64 = 0x00FF00FF00FF00FF;
130 const M5: u64 = 0x0000FFFF0000FFFF;
131 const M6: u64 = 0x00000000FFFFFFFF;
132
133 let mut x = Kmer(self.0);
134 x.0 = ((x.0 >> 2) & M2) | ((x.0 & M2) << 2);
135 x.0 = ((x.0 >> 4) & M3) | ((x.0 & M3) << 4);
136 x.0 = ((x.0 >> 8) & M4) | ((x.0 & M4) << 8);
137 x.0 = ((x.0 >> 16) & M5) | ((x.0 & M5) << 16);
138 x.0 = ((x.0 >> 32) & M6) | ((x.0 & M6) << 32);
139 x.0 >>= 64 - 2 * k;
140 x
141 }
142
143 #[inline]
145 pub fn rev_comp(&self, k: usize) -> Kmer {
146 Kmer(!self.0).rev(k)
147 }
148
149 pub fn render(&self, k: usize) -> String {
151 let mut s = String::new();
152 let mut y = self.rev(k);
153 for _i in 0..k {
154 match y.0 & 3 {
155 0 => {
156 s.push('A');
157 }
158 1 => {
159 s.push('C');
160 }
161 2 => {
162 s.push('G');
163 }
164 3 => {
165 s.push('T');
166 }
167 _ => {
168 unreachable!();
169 }
170 }
171 y.0 >>= 2;
172 }
173 s
174 }
175
176 pub fn ham(x: &Kmer, y: &Kmer) -> usize {
179 const M1: u64 = 0x5555555555555555;
180 let z = x.0 ^ y.0;
181 let v = (z | (z >> 1)) & M1;
182 v.count_ones() as usize
183 }
184
185 pub fn frequency_vector<S>(k: usize, seq: &S) -> Vec<usize>
193 where
194 S: AsRef<[u8]>,
195 {
196 let n: usize = 1 << (2 * k);
197 let mut v: Vec<usize> = Vec::new();
198 v.resize(n, 0);
199 Kmer::with_many(k, seq, |x| {
200 v[x.0 as usize] += 1;
201 });
202 v
203 }
204
205 pub fn frequency_vector_both<S>(k: usize, seq: &S) -> Vec<usize>
213 where
214 S: AsRef<[u8]>,
215 {
216 let n: usize = 1 << (2 * k);
217 let mut v: Vec<usize> = Vec::new();
218 v.resize(n, 0);
219 Kmer::with_many_both(k, seq, |x, y| {
220 v[x.0 as usize] += 1;
221 v[y.0 as usize] += 1;
222 });
223 v
224 }
225}
226
227pub struct KmerIterator<'a, Src>
229where
230Src: Iterator<Item = &'a u8>
231{
232 k: usize,
233 shift: usize,
234 mask: u64,
235 x: Kmer,
236 y: Kmer,
237 i: usize,
238 src: Src
239}
240
241impl<'a, Src> KmerIterator<'a, Src>
242where
243Src: Iterator<Item = &'a u8>
244{
245 pub fn new(k: usize, src: Src) -> KmerIterator<'a, Src> {
247 let shift: usize = 2 * (k - 1);
248 let mask: u64 = (1 << (2 * k)) - 1;
249 let x: Kmer = Kmer(0);
250 let y: Kmer = Kmer(0);
251 let i: usize = 0;
252 KmerIterator { k, shift, mask, x, y, i, src }
253 }
254}
255
256impl<'a, Src> Iterator for KmerIterator<'a, Src>
257where
258Src: Iterator<Item = &'a u8>
259{
260 type Item = (Kmer, Kmer);
261 fn next(&mut self) -> Option<Self::Item> {
262 while let Some(c) = self.src.next() {
263 match Kmer::byte(*c) {
264 None => {
265 self.i = 0;
266 self.x.0 = 0;
267 self.y.0 = 0;
268 }
269 Some(b) => {
270 self.x.0 = (self.x.0 << 2) | b.0;
271 self.y.0 = (self.y.0 >> 2) | ((3 - b.0) << self.shift);
272 self.i += 1;
273 if self.i == self.k {
274 self.x.0 &= self.mask;
275 self.i -= 1;
276 return Some((self.x.clone(), self.y.clone()));
277 }
278 }
279 }
280 }
281 None
282 }
283}
284
285#[cfg(test)]
286mod tests {
287 use super::*;
288
289 #[test]
290 fn test_0a() {
291 let seq = "CGAT";
292 let ox = Kmer::make(seq);
293 assert_eq!(ox, Some(Kmer(0b01100011u64)));
294 }
295
296 #[test]
297 fn test_0b() {
298 let seq = "CGXAT";
299 let ox = Kmer::make(seq);
300 assert_eq!(ox, None);
301 }
302
303 #[test]
304 fn test_0c() {
305 let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACA";
306 assert_eq!(seq.len(), 33);
307 let ox = Kmer::make(seq);
308 assert_eq!(ox, None);
309 }
310
311 #[test]
312 fn test_1() {
313 let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAANGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
314 let k = 11;
315 let xs = Kmer::make_many(k, seq);
316 assert_eq!(xs.len(), seq.len() - k + 1 - k);
317
318 let ys = vec![
319 "CTTTCTGGGGC",
320 "TTTCTGGGGCT",
321 "TTCTGGGGCTA",
322 "TCTGGGGCTAG",
323 "CTGGGGCTAGA",
324 "TGGGGCTAGAG",
325 "GGGGCTAGAGC",
326 "GGGCTAGAGCA",
327 "GGCTAGAGCAG",
328 "GCTAGAGCAGG",
329 "CTAGAGCAGGC",
330 "TAGAGCAGGCA",
331 "AGAGCAGGCAA",
332 "GAGCAGGCAAA",
333 "AGCAGGCAAAC",
334 "GCAGGCAAACG",
335 "CAGGCAAACGT",
336 "AGGCAAACGTG",
337 "GGCAAACGTGG",
338 "GCAAACGTGGT",
339 "CAAACGTGGTA",
340 "AAACGTGGTAC",
341 "AACGTGGTACA",
342 "ACGTGGTACAG",
343 "CGTGGTACAGT",
344 "GTGGTACAGTC",
345 "TGGTACAGTCG",
346 "GGTACAGTCGA",
347 "GTACAGTCGAC",
348 "TACAGTCGACT",
349 "ACAGTCGACTC",
350 "CAGTCGACTCC",
351 "AGTCGACTCCA",
352 "GTCGACTCCAT",
353 "TCGACTCCATT",
354 "CGACTCCATTC",
355 "GACTCCATTCT",
356 "ACTCCATTCTT",
357 "CTCCATTCTTT",
358 "TCCATTCTTTC",
359 "CCATTCTTTCT",
360 "CATTCTTTCTT",
361 "ATTCTTTCTTC",
362 "TTCTTTCTTCC",
363 "TCTTTCTTCCT",
364 "CTTTCTTCCTC",
365 "TTTCTTCCTCT",
366 "TTCTTCCTCTG",
367 "TCTTCCTCTGA",
368 "CTTCCTCTGAG",
369 "TTCCTCTGAGA",
370 "TCCTCTGAGAC",
371 "CCTCTGAGACC",
372 "CTCTGAGACCC",
373 "TCTGAGACCCC",
374 "CTGAGACCCCT",
375 "TGAGACCCCTT",
376 "GAGACCCCTTC",
377 "AGACCCCTTCC",
378 "GACCCCTTCCA",
379 "ACCCCTTCCAG",
380 "CCCCTTCCAGG",
381 "CCCTTCCAGGA",
382 "CCTTCCAGGAA",
383 "CTTCCAGGAAT",
384 "TTCCAGGAATT",
385 "TCCAGGAATTC",
386 "CCAGGAATTCA",
387 "CAGGAATTCAA",
388 "GGCGCTGGTGA",
389 "GCGCTGGTGAG",
390 "CGCTGGTGAGT",
391 "GCTGGTGAGTC",
392 "CTGGTGAGTCA",
393 "TGGTGAGTCAT",
394 "GGTGAGTCATG",
395 "GTGAGTCATGA",
396 "TGAGTCATGAG",
397 "GAGTCATGAGG",
398 "AGTCATGAGGC",
399 "GTCATGAGGCC",
400 "TCATGAGGCCT",
401 "CATGAGGCCTC",
402 "ATGAGGCCTCG",
403 "TGAGGCCTCGG",
404 "GAGGCCTCGGA",
405 "AGGCCTCGGAG",
406 "GGCCTCGGAGC",
407 "GCCTCGGAGCA",
408 "CCTCGGAGCAG",
409 "CTCGGAGCAGG",
410 "TCGGAGCAGGG",
411 "CGGAGCAGGGA",
412 "GGAGCAGGGAG",
413 "GAGCAGGGAGT",
414 "AGCAGGGAGTG",
415 "GCAGGGAGTGG",
416 "CAGGGAGTGGT",
417 "AGGGAGTGGTG",
418 "GGGAGTGGTGG",
419 "GGAGTGGTGGT",
420 "GAGTGGTGGTG",
421 "AGTGGTGGTGG",
422 "GTGGTGGTGGT",
423 "TGGTGGTGGTT",
424 "GGTGGTGGTTA",
425 "GTGGTGGTTAC",
426 "TGGTGGTTACA",
427 "GGTGGTTACAT",
428 "GTGGTTACATA",
429 "TGGTTACATAA",
430 "GGTTACATAAT",
431 "GTTACATAATT",
432 "TTACATAATTC",
433 "TACATAATTCA",
434 "ACATAATTCAG",
435 "CATAATTCAGA",
436 "ATAATTCAGAT",
437 "TAATTCAGATT",
438 "AATTCAGATTA",
439 "ATTCAGATTAA",
440 "TTCAGATTAAC",
441 "TCAGATTAACT",
442 "CAGATTAACTC",
443 "AGATTAACTCT",
444 "GATTAACTCTC",
445 "ATTAACTCTCA",
446 "TTAACTCTCAG",
447 "TAACTCTCAGT",
448 ];
449 assert_eq!(xs.len(), ys.len());
450 for i in 0..xs.len() {
451 assert_eq!(xs[i].render(k), ys[i]);
452 }
453 }
454
455 #[test]
456 fn test_2() {
457 let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAANGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
458 let k = 11;
459 let mut xs = Vec::new();
460 Kmer::with_many_both(k, &seq, |x, y| {
461 xs.push(x.clone());
462 xs.push(y.clone())
463 });
464 assert_eq!(xs.len(), 2 * (seq.len() - k + 1 - k));
465 }
466
467 #[test]
468 fn test_3() {
469 let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAAAGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
470 let xs = Kmer::frequency_vector(3, &seq);
471 assert_eq!(
472 xs,
473 vec![
474 2, 2, 1, 2, 2, 1, 1, 2, 3, 2, 5, 4, 1, 0, 1, 4, 2, 0, 6, 3, 2, 2, 0, 3, 1, 1, 1, 1,
475 1, 5, 3, 4, 1, 2, 6, 1, 3, 1, 1, 2, 3, 4, 3, 5, 1, 2, 5, 1, 2, 2, 1, 0, 4, 3, 2, 5,
476 3, 0, 6, 0, 2, 7, 0, 2
477 ]
478 );
479 }
480
481 #[test]
482 fn test_4() {
483 let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAAAGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
484 let xs = Kmer::frequency_vector_both(3, &seq);
485 assert_eq!(
486 xs,
487 vec![
488 4, 3, 5, 6, 2, 6, 2, 6, 8, 4, 8, 6, 1, 1, 4, 6, 2, 5, 9, 4, 8, 5, 1, 8, 3, 2, 1, 2,
489 2, 11, 9, 5, 8, 4, 11, 1, 3, 5, 2, 4, 6, 5, 5, 6, 3, 4, 5, 3, 4, 3, 2, 1, 7, 6, 3,
490 8, 7, 3, 8, 2, 4, 8, 2, 4
491 ]
492 );
493 }
494
495 #[test]
496 fn test_5() {
497 let seq = "CTTTCTGGGGC";
498 let k = seq.len();
499 let x = Kmer::make(seq).unwrap();
500 let y = x.rev_comp(k);
501 assert_eq!(y.rev_comp(k), x);
502 assert_eq!(y.render(k), "GCCCCAGAAAG");
503 }
504
505 #[test]
506 fn test_6() {
507 let x = Kmer::make("CTTTCTGGGGC").unwrap();
508 let y = Kmer::make("CTATGTGGCGC").unwrap();
509 assert_eq!(Kmer::ham(&x, &x), 0);
510 assert_eq!(Kmer::ham(&y, &y), 0);
511 assert_eq!(Kmer::ham(&x, &y), 3);
512 assert_eq!(Kmer::ham(&y, &x), 3);
513 }
514
515 #[test]
516 fn test_7() {
517 let seq = "CTTTCTGGGGCTANGGCAAACGTGG";
518 let k = 11;
519 let mut itr = KmerIterator::new(k, seq.as_bytes().iter());
520 let x1 = Kmer::make("CTTTCTGGGGC").unwrap();
521 assert_eq!(itr.next(), Some((x1.clone(), x1.rev_comp(k))));
522 let x2 = Kmer::make("TTTCTGGGGCT").unwrap();
523 assert_eq!(itr.next(), Some((x2.clone(), x2.rev_comp(k))));
524 let x3 = Kmer::make("TTCTGGGGCTA").unwrap();
525 assert_eq!(itr.next(), Some((x3.clone(), x3.rev_comp(k))));
526 let x4 = Kmer::make("GGCAAACGTGG").unwrap();
527 assert_eq!(itr.next(), Some((x4.clone(), x4.rev_comp(k))));
528 assert_eq!(itr.next(), None);
529 }
530}