1use std::collections::hash_map::DefaultHasher;
2use std::hash::Hasher;
3
4use crate::Kmer;
5
6pub trait Canonical {
11 fn is_canonical(&self, x: &Kmer) -> bool {
13 *x == self.canonical(x)
14 }
15
16 fn canonical(&self, x: &Kmer) -> Kmer;
18}
19
20pub struct CanonicalLex {
22 k: usize,
23}
24
25impl CanonicalLex {
26 pub fn new(k: usize) -> CanonicalLex {
28 CanonicalLex { k }
29 }
30}
31
32impl Canonical for CanonicalLex {
33 fn is_canonical(&self, x: &Kmer) -> bool {
34 let s = 2 * (self.k - 1);
35 let msb = x.0 >> s;
36 let lsb = 3 - (x.0 & 3);
37 println!("x = {}", x.render(self.k));
38 println!("msb = {}, lsb = {}", msb, lsb);
39 if msb != lsb {
40 return msb < lsb;
41 }
42 *x == self.canonical(x)
43 }
44
45 fn canonical(&self, x: &Kmer) -> Kmer {
46 let x_bar = x.rev_comp(self.k);
47 if *x <= x_bar {
48 x.clone()
49 } else {
50 x_bar
51 }
52 }
53}
54
55pub struct CanonicalHash {
57 k: usize,
58}
59
60impl CanonicalHash {
61 pub fn new(k: usize) -> CanonicalHash {
63 CanonicalHash { k }
64 }
65
66 fn hash(x: &Kmer) -> u64 {
68 let mut h = DefaultHasher::new();
69 h.write_u64(x.0);
70 h.finish()
71 }
72}
73
74impl Canonical for CanonicalHash {
75 fn canonical(&self, x: &Kmer) -> Kmer {
76 let y = x.rev_comp(self.k);
77 let x_hash = CanonicalHash::hash(x);
78 let y_hash = CanonicalHash::hash(&y);
79 if x_hash <= y_hash {
80 x.clone()
81 } else {
82 y
83 }
84 }
85}
86
87#[cfg(test)]
88mod tests {
89 use super::*;
90
91 #[test]
92 fn test_lex_1() {
93 let k = 11;
94 let seq = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
95
96 Kmer::with_many_both(k, &seq, |x, y| {
97 let c = CanonicalLex::new(k);
98 let x_can = c.canonical(x);
99 let y_can = c.canonical(y);
100 assert_eq!(x_can, y_can);
101 assert_eq!(c.is_canonical(x), (*x <= *y));
102 assert_eq!(c.is_canonical(y), (*y <= *x));
103 });
104 }
105
106 #[test]
107 fn test_hash_1() {
108 let k = 11;
109 let seq = "CCACGACACATCATAGTCAAATTTCTAAAAATTAAGGACCACACAAACACCTTAAAAACAGAGAAAGAAAAATGACACATTACCTATACTGCAAAAACAATTCAAATGACAAAGTATTATTCATTGGAAACCACAGAGTTCAGAATGAAG";
110
111 Kmer::with_many_both(k, &seq, |x, y| {
112 let c = CanonicalHash::new(k);
113 let x_can = c.canonical(x);
114 let y_can = c.canonical(y);
115 assert_eq!(x_can, y_can);
116 });
117 }
118
119 struct MiniRng {
120 x: u64,
121 }
122
123 impl MiniRng {
124 fn new(seed: u64) -> MiniRng {
125 MiniRng { x: seed }
126 }
127
128 fn rnd(&mut self) -> u64 {
129 self.x = self.x.wrapping_mul(2862933555777941757u64);
130 self.x = self.x.wrapping_add(3037000493u64);
131 self.x
132 }
133 }
134
135 fn make_some_sequence(rng: &mut MiniRng) -> String {
136 let mut s = String::new();
137 let k: usize = 17;
138 let m = (1u64 << (2 * k)) - 1;
139 for _i in 0..20 {
140 let u = rng.rnd() & m;
141 let x = Kmer::from_u64(u);
142 s += &x.render(k);
143 }
144 s
145 }
146
147
148 #[test]
192 fn test_hash_2() {
193 let k = 15;
194
195 let mut xs = Vec::new();
196 let mut rng = MiniRng::new(19);
197 let c = CanonicalHash::new(k);
198 while xs.len() < 1024*1024 {
199 let seq = make_some_sequence(&mut rng);
200 Kmer::with_many(k, &seq, |x| {
201 let y = c.canonical(x);
202 xs.push(y);
203 });
204 }
205 xs.sort();
206 xs.dedup();
207 let mut n = 0;
208 let mut sx = 0;
209 let mut sx2 = 0;
210 for i in 1..xs.len() {
211 let d = xs[i].0 - xs[i-1].0;
212 n += 1;
213 sx += d;
214 sx2 += d*d;
215 }
216 let m = sx / n;
217 let v = sx2 / n - m*m;
218 let sd = (v as f64).sqrt();
219 assert_eq!(m, 1024);
220 assert!((sd - 1024.0).abs() < 10.0);
221 }
222}