kmers_rs/
basics.rs

1
2/// A k-length nucleotide sequence represented as a 64-bit integer.
3#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Debug)]
4pub struct Kmer(pub u64);
5
6impl Kmer {
7    /// Construct a k-mer from an unsigned integer
8    pub fn from_u64(value: u64) -> Kmer {
9        Kmer(value)
10    }
11
12    /// convert a k-length string into a k-mer.
13    ///
14    /// If k > 32 or the string contains letters other than
15    /// "A", "C", "G", "T", or "U", `None` is returned.
16    pub fn make(seq: &str) -> Option<Kmer> {
17        if seq.len() > 32 {
18            return None;
19        }
20        let mut x = Kmer(0);
21        for c in seq.chars() {
22            let b = Kmer::base(c)?;
23            x.0 = (x.0 << 2) | b.0;
24        }
25        Some(x)
26    }
27
28    /// Extract k-mers from a string.
29    ///
30    /// Traverse the string `seq` and put all the valid k-mers in a vector.
31    ///
32    pub fn make_many(k: usize, seq: &str) -> Vec<Kmer> {
33        let mut xs: Vec<Kmer> = Vec::new();
34        Self::with_many(k, &seq, |x| xs.push(x.clone()));
35        xs
36    }
37
38    /// Extract k-mers from a byte sequence and pass them to a closure.
39    ///
40    pub fn with_many<S, F>(k: usize, seq: &S, mut f: F) -> ()
41    where
42        S: AsRef<[u8]>,
43        F: FnMut(&Kmer),
44    {
45        let mut i = 0;
46        let msk: u64 = (1 << (2 * k)) - 1;
47        let mut x = Kmer(0);
48        for c in seq.as_ref() {
49            match Kmer::byte(*c) {
50                None => {
51                    i = 0;
52                    x.0 = 0;
53                }
54                Some(b) => {
55                    x.0 = (x.0 << 2) | b.0;
56                    i += 1;
57                    if i == k {
58                        x.0 &= msk;
59                        f(&x);
60                        i -= 1;
61                    }
62                }
63            }
64        }
65    }
66
67    /// Extract k-mers from both strands of a byte sequence.
68    ///
69    pub fn with_many_both<S, F>(k: usize, seq: &S, mut f: F) -> ()
70    where
71        S: AsRef<[u8]>,
72        F: FnMut(&Kmer, &Kmer),
73    {
74        let shift = 2 * (k - 1);
75        let msk: u64 = (1 << (2 * k)) - 1;
76        let mut x = Kmer(0);
77        let mut y = Kmer(0);
78        let mut i = 0;
79        for c in seq.as_ref() {
80            match Kmer::byte(*c) {
81                None => {
82                    i = 0;
83                    x.0 = 0;
84                    y.0 = 0;
85                }
86                Some(b) => {
87                    x.0 = (x.0 << 2) | b.0;
88                    y.0 = (y.0 >> 2) | ((3 - b.0) << shift);
89                    i += 1;
90                    if i == k {
91                        x.0 &= msk;
92                        f(&x, &y);
93                        i -= 1;
94                    }
95                }
96            }
97        }
98    }
99
100    /// Convert a character to a 1-mer.
101    #[inline]
102    pub fn base(c: char) -> Option<Kmer> {
103        match c {
104            'A' | 'a' => Some(Kmer(0)),
105            'C' | 'c' => Some(Kmer(1)),
106            'G' | 'g' => Some(Kmer(2)),
107            'T' | 't' | 'U' | 'u' => Some(Kmer(3)),
108            _ => None,
109        }
110    }
111
112    /// Convert an ASCII byte to a 1-mer.
113    #[inline]
114    pub fn byte(c: u8) -> Option<Kmer> {
115        match c {
116            b'A' | b'a' => Some(Kmer(0)),
117            b'C' | b'c' => Some(Kmer(1)),
118            b'G' | b'g' => Some(Kmer(2)),
119            b'T' | b't' | b'U' | b'u' => Some(Kmer(3)),
120            _ => None,
121        }
122    }
123
124    /// Create a reversed k-mer.
125    #[inline]
126    pub fn rev(&self, k: usize) -> Kmer {
127        const M2: u64 = 0x3333333333333333;
128        const M3: u64 = 0x0F0F0F0F0F0F0F0F;
129        const M4: u64 = 0x00FF00FF00FF00FF;
130        const M5: u64 = 0x0000FFFF0000FFFF;
131        const M6: u64 = 0x00000000FFFFFFFF;
132
133        let mut x = Kmer(self.0);
134        x.0 = ((x.0 >> 2) & M2) | ((x.0 & M2) << 2);
135        x.0 = ((x.0 >> 4) & M3) | ((x.0 & M3) << 4);
136        x.0 = ((x.0 >> 8) & M4) | ((x.0 & M4) << 8);
137        x.0 = ((x.0 >> 16) & M5) | ((x.0 & M5) << 16);
138        x.0 = ((x.0 >> 32) & M6) | ((x.0 & M6) << 32);
139        x.0 >>= 64 - 2 * k;
140        x
141    }
142
143    /// Create a reverse-complement k-mer.
144    #[inline]
145    pub fn rev_comp(&self, k: usize) -> Kmer {
146        Kmer(!self.0).rev(k)
147    }
148
149    /// Convert a k-mer to a string.
150    pub fn render(&self, k: usize) -> String {
151        let mut s = String::new();
152        let mut y = self.rev(k);
153        for _i in 0..k {
154            match y.0 & 3 {
155                0 => {
156                    s.push('A');
157                }
158                1 => {
159                    s.push('C');
160                }
161                2 => {
162                    s.push('G');
163                }
164                3 => {
165                    s.push('T');
166                }
167                _ => {
168                    unreachable!();
169                }
170            }
171            y.0 >>= 2;
172        }
173        s
174    }
175
176    /// Compute the Hamming distance between two k-mers.
177    ///
178    pub fn ham(x: &Kmer, y: &Kmer) -> usize {
179        const M1: u64 = 0x5555555555555555;
180        let z = x.0 ^ y.0;
181        let v = (z | (z >> 1)) & M1;
182        v.count_ones() as usize
183    }
184
185    /// Compute the frequency of k-mers on the forward strand of a sequence.
186    ///
187    /// Compute a frequency vector with the number of instances in `seq` of
188    /// each of the 4**k possible k-mers.
189    ///
190    /// Note that this is a dense representation, so the vector will have
191    /// 4**k elements.
192    pub fn frequency_vector<S>(k: usize, seq: &S) -> Vec<usize>
193    where
194        S: AsRef<[u8]>,
195    {
196        let n: usize = 1 << (2 * k);
197        let mut v: Vec<usize> = Vec::new();
198        v.resize(n, 0);
199        Kmer::with_many(k, seq, |x| {
200            v[x.0 as usize] += 1;
201        });
202        v
203    }
204
205    /// Compute the frequency of k-mers on both strands of a sequence.
206    ///
207    /// Compute a frequency vector with the number of instances in `seq` of
208    /// each of the 4**k possible k-mers.
209    ///
210    /// Note that this is a dense representation, so the vector will have
211    /// 4**k elements.
212    pub fn frequency_vector_both<S>(k: usize, seq: &S) -> Vec<usize>
213    where
214        S: AsRef<[u8]>,
215    {
216        let n: usize = 1 << (2 * k);
217        let mut v: Vec<usize> = Vec::new();
218        v.resize(n, 0);
219        Kmer::with_many_both(k, seq, |x, y| {
220            v[x.0 as usize] += 1;
221            v[y.0 as usize] += 1;
222        });
223        v
224    }
225}
226
227/// An iterator over the k-mers drawn from a sequence.
228pub struct KmerIterator<'a, Src> 
229where
230Src: Iterator<Item = &'a u8>
231{
232    k: usize,
233    shift: usize,
234    mask: u64,
235    x: Kmer,
236    y: Kmer,
237    i: usize,
238    src: Src
239}
240
241impl<'a, Src> KmerIterator<'a, Src>
242where
243Src: Iterator<Item = &'a u8>
244{
245    /// Create a new iterator yields the k-mers from both strands of a sequence.
246    pub fn new(k: usize, src: Src) -> KmerIterator<'a, Src> {
247        let shift: usize = 2 * (k - 1);
248        let mask: u64 = (1 << (2 * k)) - 1;
249        let x: Kmer = Kmer(0);
250        let y: Kmer = Kmer(0);
251        let i: usize = 0;
252        KmerIterator { k, shift, mask, x, y, i, src }
253    }
254}
255
256impl<'a, Src> Iterator for KmerIterator<'a, Src>
257where
258Src: Iterator<Item = &'a u8>
259{
260    type Item = (Kmer, Kmer);
261    fn next(&mut self) -> Option<Self::Item> {
262        while let Some(c) = self.src.next() {
263            match Kmer::byte(*c) {
264                None => {
265                    self.i = 0;
266                    self.x.0 = 0;
267                    self.y.0 = 0;
268                }
269                Some(b) => {
270                    self.x.0 = (self.x.0 << 2) | b.0;
271                    self.y.0 = (self.y.0 >> 2) | ((3 - b.0) << self.shift);
272                    self.i += 1;
273                    if self.i == self.k {
274                        self.x.0 &= self.mask;
275                        self.i -= 1;
276                        return Some((self.x.clone(), self.y.clone()));
277                    }
278                }
279            }
280        }
281        None
282    }
283}
284
285#[cfg(test)]
286mod tests {
287    use super::*;
288
289    #[test]
290    fn test_0a() {
291        let seq = "CGAT";
292        let ox = Kmer::make(seq);
293        assert_eq!(ox, Some(Kmer(0b01100011u64)));
294    }
295
296    #[test]
297    fn test_0b() {
298        let seq = "CGXAT";
299        let ox = Kmer::make(seq);
300        assert_eq!(ox, None);
301    }
302
303    #[test]
304    fn test_0c() {
305        let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACA";
306        assert_eq!(seq.len(), 33);
307        let ox = Kmer::make(seq);
308        assert_eq!(ox, None);
309    }
310
311    #[test]
312    fn test_1() {
313        let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAANGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
314        let k = 11;
315        let xs = Kmer::make_many(k, seq);
316        assert_eq!(xs.len(), seq.len() - k + 1 - k);
317
318        let ys = vec![
319            "CTTTCTGGGGC",
320            "TTTCTGGGGCT",
321            "TTCTGGGGCTA",
322            "TCTGGGGCTAG",
323            "CTGGGGCTAGA",
324            "TGGGGCTAGAG",
325            "GGGGCTAGAGC",
326            "GGGCTAGAGCA",
327            "GGCTAGAGCAG",
328            "GCTAGAGCAGG",
329            "CTAGAGCAGGC",
330            "TAGAGCAGGCA",
331            "AGAGCAGGCAA",
332            "GAGCAGGCAAA",
333            "AGCAGGCAAAC",
334            "GCAGGCAAACG",
335            "CAGGCAAACGT",
336            "AGGCAAACGTG",
337            "GGCAAACGTGG",
338            "GCAAACGTGGT",
339            "CAAACGTGGTA",
340            "AAACGTGGTAC",
341            "AACGTGGTACA",
342            "ACGTGGTACAG",
343            "CGTGGTACAGT",
344            "GTGGTACAGTC",
345            "TGGTACAGTCG",
346            "GGTACAGTCGA",
347            "GTACAGTCGAC",
348            "TACAGTCGACT",
349            "ACAGTCGACTC",
350            "CAGTCGACTCC",
351            "AGTCGACTCCA",
352            "GTCGACTCCAT",
353            "TCGACTCCATT",
354            "CGACTCCATTC",
355            "GACTCCATTCT",
356            "ACTCCATTCTT",
357            "CTCCATTCTTT",
358            "TCCATTCTTTC",
359            "CCATTCTTTCT",
360            "CATTCTTTCTT",
361            "ATTCTTTCTTC",
362            "TTCTTTCTTCC",
363            "TCTTTCTTCCT",
364            "CTTTCTTCCTC",
365            "TTTCTTCCTCT",
366            "TTCTTCCTCTG",
367            "TCTTCCTCTGA",
368            "CTTCCTCTGAG",
369            "TTCCTCTGAGA",
370            "TCCTCTGAGAC",
371            "CCTCTGAGACC",
372            "CTCTGAGACCC",
373            "TCTGAGACCCC",
374            "CTGAGACCCCT",
375            "TGAGACCCCTT",
376            "GAGACCCCTTC",
377            "AGACCCCTTCC",
378            "GACCCCTTCCA",
379            "ACCCCTTCCAG",
380            "CCCCTTCCAGG",
381            "CCCTTCCAGGA",
382            "CCTTCCAGGAA",
383            "CTTCCAGGAAT",
384            "TTCCAGGAATT",
385            "TCCAGGAATTC",
386            "CCAGGAATTCA",
387            "CAGGAATTCAA",
388            "GGCGCTGGTGA",
389            "GCGCTGGTGAG",
390            "CGCTGGTGAGT",
391            "GCTGGTGAGTC",
392            "CTGGTGAGTCA",
393            "TGGTGAGTCAT",
394            "GGTGAGTCATG",
395            "GTGAGTCATGA",
396            "TGAGTCATGAG",
397            "GAGTCATGAGG",
398            "AGTCATGAGGC",
399            "GTCATGAGGCC",
400            "TCATGAGGCCT",
401            "CATGAGGCCTC",
402            "ATGAGGCCTCG",
403            "TGAGGCCTCGG",
404            "GAGGCCTCGGA",
405            "AGGCCTCGGAG",
406            "GGCCTCGGAGC",
407            "GCCTCGGAGCA",
408            "CCTCGGAGCAG",
409            "CTCGGAGCAGG",
410            "TCGGAGCAGGG",
411            "CGGAGCAGGGA",
412            "GGAGCAGGGAG",
413            "GAGCAGGGAGT",
414            "AGCAGGGAGTG",
415            "GCAGGGAGTGG",
416            "CAGGGAGTGGT",
417            "AGGGAGTGGTG",
418            "GGGAGTGGTGG",
419            "GGAGTGGTGGT",
420            "GAGTGGTGGTG",
421            "AGTGGTGGTGG",
422            "GTGGTGGTGGT",
423            "TGGTGGTGGTT",
424            "GGTGGTGGTTA",
425            "GTGGTGGTTAC",
426            "TGGTGGTTACA",
427            "GGTGGTTACAT",
428            "GTGGTTACATA",
429            "TGGTTACATAA",
430            "GGTTACATAAT",
431            "GTTACATAATT",
432            "TTACATAATTC",
433            "TACATAATTCA",
434            "ACATAATTCAG",
435            "CATAATTCAGA",
436            "ATAATTCAGAT",
437            "TAATTCAGATT",
438            "AATTCAGATTA",
439            "ATTCAGATTAA",
440            "TTCAGATTAAC",
441            "TCAGATTAACT",
442            "CAGATTAACTC",
443            "AGATTAACTCT",
444            "GATTAACTCTC",
445            "ATTAACTCTCA",
446            "TTAACTCTCAG",
447            "TAACTCTCAGT",
448        ];
449        assert_eq!(xs.len(), ys.len());
450        for i in 0..xs.len() {
451            assert_eq!(xs[i].render(k), ys[i]);
452        }
453    }
454
455    #[test]
456    fn test_2() {
457        let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAANGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
458        let k = 11;
459        let mut xs = Vec::new();
460        Kmer::with_many_both(k, &seq, |x, y| {
461            xs.push(x.clone());
462            xs.push(y.clone())
463        });
464        assert_eq!(xs.len(), 2 * (seq.len() - k + 1 - k));
465    }
466
467    #[test]
468    fn test_3() {
469        let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAAAGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
470        let xs = Kmer::frequency_vector(3, &seq);
471        assert_eq!(
472            xs,
473            vec![
474                2, 2, 1, 2, 2, 1, 1, 2, 3, 2, 5, 4, 1, 0, 1, 4, 2, 0, 6, 3, 2, 2, 0, 3, 1, 1, 1, 1,
475                1, 5, 3, 4, 1, 2, 6, 1, 3, 1, 1, 2, 3, 4, 3, 5, 1, 2, 5, 1, 2, 2, 1, 0, 4, 3, 2, 5,
476                3, 0, 6, 0, 2, 7, 0, 2
477            ]
478        );
479    }
480
481    #[test]
482    fn test_4() {
483        let seq = "CTTTCTGGGGCTAGAGCAGGCAAACGTGGTACAGTCGACTCCATTCTTTCTTCCTCTGAGACCCCTTCCAGGAATTCAAAGGCGCTGGTGAGTCATGAGGCCTCGGAGCAGGGAGTGGTGGTGGTTACATAATTCAGATTAACTCTCAGT";
484        let xs = Kmer::frequency_vector_both(3, &seq);
485        assert_eq!(
486            xs,
487            vec![
488                4, 3, 5, 6, 2, 6, 2, 6, 8, 4, 8, 6, 1, 1, 4, 6, 2, 5, 9, 4, 8, 5, 1, 8, 3, 2, 1, 2,
489                2, 11, 9, 5, 8, 4, 11, 1, 3, 5, 2, 4, 6, 5, 5, 6, 3, 4, 5, 3, 4, 3, 2, 1, 7, 6, 3,
490                8, 7, 3, 8, 2, 4, 8, 2, 4
491            ]
492        );
493    }
494
495    #[test]
496    fn test_5() {
497        let seq = "CTTTCTGGGGC";
498        let k = seq.len();
499        let x = Kmer::make(seq).unwrap();
500        let y = x.rev_comp(k);
501        assert_eq!(y.rev_comp(k), x);
502        assert_eq!(y.render(k), "GCCCCAGAAAG");
503    }
504
505    #[test]
506    fn test_6() {
507        let x = Kmer::make("CTTTCTGGGGC").unwrap();
508        let y = Kmer::make("CTATGTGGCGC").unwrap();
509        assert_eq!(Kmer::ham(&x, &x), 0);
510        assert_eq!(Kmer::ham(&y, &y), 0);
511        assert_eq!(Kmer::ham(&x, &y), 3);
512        assert_eq!(Kmer::ham(&y, &x), 3);
513    }
514
515    #[test]
516    fn test_7() {
517        let seq = "CTTTCTGGGGCTANGGCAAACGTGG";
518        let k = 11;
519        let mut itr = KmerIterator::new(k, seq.as_bytes().iter());
520        let x1 = Kmer::make("CTTTCTGGGGC").unwrap();
521        assert_eq!(itr.next(), Some((x1.clone(), x1.rev_comp(k))));
522        let x2 = Kmer::make("TTTCTGGGGCT").unwrap();
523        assert_eq!(itr.next(), Some((x2.clone(), x2.rev_comp(k))));
524        let x3 = Kmer::make("TTCTGGGGCTA").unwrap();
525        assert_eq!(itr.next(), Some((x3.clone(), x3.rev_comp(k))));
526        let x4 = Kmer::make("GGCAAACGTGG").unwrap();
527        assert_eq!(itr.next(), Some((x4.clone(), x4.rev_comp(k))));
528        assert_eq!(itr.next(), None);
529    }
530}