kmers/
canonical.rs

1use std::collections::hash_map::DefaultHasher;
2use std::hash::Hasher;
3
4use crate::Kmer;
5
6/// A Trait for capturing k-mer canonicalisation.
7///
8/// K-mers come in pairs over reverse complementation. This trait generalises
9/// an interface for implementing a variety of normalisation methods.
10pub trait Canonical {
11    /// Return true iff `x` is the canonical choice out of `x` and it's reverse complement.
12    fn is_canonical(&self, x: &Kmer) -> bool {
13        *x == self.canonical(x)
14    }
15
16    /// Return the canonical form for `x` which will be either `x` or its reverse complement.
17    fn canonical(&self, x: &Kmer) -> Kmer;
18}
19
20/// A simple canonicalisation based on lexicographic ordering
21pub struct CanonicalLex {
22    k: usize,
23}
24
25impl CanonicalLex {
26    /// Create a new lexicographic canonicalisation object.
27    pub fn new(k: usize) -> CanonicalLex {
28        CanonicalLex { k }
29    }
30}
31
32impl Canonical for CanonicalLex {
33    fn is_canonical(&self, x: &Kmer) -> bool {
34        let s = 2 * (self.k - 1);
35        let msb = x.0 >> s;
36        let lsb = 3 - (x.0 & 3);
37        println!("x = {}", x.render(self.k));
38        println!("msb = {}, lsb = {}", msb, lsb);
39        if msb != lsb {
40            return msb < lsb;
41        }
42        *x == self.canonical(x)
43    }
44
45    fn canonical(&self, x: &Kmer) -> Kmer {
46        let x_bar = x.rev_comp(self.k);
47        if *x <= x_bar {
48            x.clone()
49        } else {
50            x_bar
51        }
52    }
53}
54
55/// Canonicalisation based on hashing.
56pub struct CanonicalHash {
57    k: usize,
58}
59
60impl CanonicalHash {
61    /// Create a new hash based canonicalisation object.
62    pub fn new(k: usize) -> CanonicalHash {
63        CanonicalHash { k }
64    }
65
66    /// hash a value.
67    fn hash(x: &Kmer) -> u64 {
68        let mut h = DefaultHasher::new();
69        h.write_u64(x.0);
70        h.finish()
71    }
72}
73
74impl Canonical for CanonicalHash {
75    fn canonical(&self, x: &Kmer) -> Kmer {
76        let y = x.rev_comp(self.k);
77        let x_hash = CanonicalHash::hash(x);
78        let y_hash = CanonicalHash::hash(&y);
79        if x_hash <= y_hash {
80            x.clone()
81        } else {
82            y
83        }
84    }
85}
86
87#[cfg(test)]
88mod tests {
89    use super::*;
90
91    #[test]
92    fn test_lex_1() {
93        let k = 11;
94        let seq = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
95
96        Kmer::with_many_both(k, &seq, |x, y| {
97            let c = CanonicalLex::new(k);
98            let x_can = c.canonical(x);
99            let y_can = c.canonical(y);
100            assert_eq!(x_can, y_can);
101            assert_eq!(c.is_canonical(x), (*x <= *y));
102            assert_eq!(c.is_canonical(y), (*y <= *x));
103        });
104    }
105
106    #[test]
107    fn test_hash_1() {
108        let k = 11;
109        let seq = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
110
111        Kmer::with_many_both(k, &seq, |x, y| {
112            let c = CanonicalHash::new(k);
113            let x_can = c.canonical(x);
114            let y_can = c.canonical(y);
115            assert_eq!(x_can, y_can);
116        });
117    }
118
119    struct MiniRng {
120        x: u64,
121    }
122
123    impl MiniRng {
124        fn new(seed: u64) -> MiniRng {
125            MiniRng { x: seed }
126        }
127
128        fn rnd(&mut self) -> u64 {
129            self.x = self.x.wrapping_mul(2862933555777941757u64);
130            self.x = self.x.wrapping_add(3037000493u64);
131            self.x
132        }
133    }
134
135    fn make_some_sequence(rng: &mut MiniRng) -> String {
136        let mut s = String::new();
137        let k: usize = 17;
138        let m = (1u64 << (2 * k)) - 1;
139        for _i in 0..20 {
140            let u = rng.rnd() & m;
141            let x = Kmer::from_u64(u);
142            s += &x.render(k);
143        }
144        s
145    }
146
147
148    /*
149
150    This one is expected to fail - the whole point of using
151    hash based canonicalisation is to even out the distribution
152    where you get a skewed distribution from lexicographic
153    canonicalisation.
154
155    #[test]
156    fn test_lex_2() {
157        let k = 15;
158
159        let mut xs = Vec::new();
160        let mut rng = MiniRng::new(19);
161        let c = CanonicalLex::new(k);
162        while xs.len() < 1024*1024 {
163            let seq = make_some_sequence(&mut rng);
164            Kmer::with_many(k, &seq, |x| {
165                let y = c.canonical(x);
166                xs.push(y);
167            });
168        }
169        xs.sort();
170        xs.dedup();
171        let mut n = 0;
172        let mut sx = 0;
173        let mut sx2 = 0;
174        for i in 1..xs.len() {
175            let d = xs[i].0 - xs[i-1].0;
176            n += 1;
177            sx += d;
178            sx2 += d*d;
179        }
180        let m = sx / n;
181        let v = sx2 / n - m*m;
182        let sd = (v as f64).sqrt();
183        println!("mean = {}", m);
184        println!("std.dev = {}", sd);
185        assert_eq!(m, 1024);
186        assert!((sd - 1024.0).abs() < 10.0);
187
188    }
189    */
190
191    #[test]
192    fn test_hash_2() {
193        let k = 15;
194
195        let mut xs = Vec::new();
196        let mut rng = MiniRng::new(19);
197        let c = CanonicalHash::new(k);
198        while xs.len() < 1024*1024 {
199            let seq = make_some_sequence(&mut rng);
200            Kmer::with_many(k, &seq, |x| {
201                let y = c.canonical(x);
202                xs.push(y);
203            });
204        }
205        xs.sort();
206        xs.dedup();
207        let mut n = 0;
208        let mut sx = 0;
209        let mut sx2 = 0;
210        for i in 1..xs.len() {
211            let d = xs[i].0 - xs[i-1].0;
212            n += 1;
213            sx += d;
214            sx2 += d*d;
215        }
216        let m = sx / n;
217        let v = sx2 / n - m*m;
218        let sd = (v as f64).sqrt();
219        assert_eq!(m, 1024);
220        assert!((sd - 1024.0).abs() < 10.0);
221    }
222}