hyper_gen/
types.rs

1use std::arch::x86_64::*;
2use std::collections::HashSet;
3use std::path::PathBuf;
4
5use serde::{Deserialize, Serialize};
6use t1ha;
7
/// Hashes one native-endian machine word taken from `bytes`.
///
/// The word is run through a fixed shift/add/xor mixing sequence using
/// wrapping arithmetic, so the function itself never overflows.
///
/// # Panics
///
/// Panics if `bytes.len()` is not exactly `size_of::<usize>()`, because the
/// slice is converted into a fixed-size array before it is decoded.
#[inline]
pub fn mm_hash(bytes: &[u8]) -> usize {
    let word: [u8; std::mem::size_of::<usize>()] = bytes.try_into().unwrap();
    let start = usize::from_ne_bytes(word);

    // h = !(h + (h << 21))
    let h = !start.wrapping_add(start << 21);
    let h = h ^ (h >> 24);
    // h + (h << 3) + (h << 8) == h * 265 under wrapping arithmetic
    let h = h.wrapping_mul(265);
    let h = h ^ (h >> 14);
    // h + (h << 2) + (h << 4) == h * 21 under wrapping arithmetic
    let h = h.wrapping_mul(21);
    let h = h ^ (h >> 28);
    h.wrapping_add(h << 31)
}
20
/// Mixes a 64-bit k-mer encoding into a 64-bit hash.
///
/// Every addition uses wrapping arithmetic. The mixing steps rely on
/// modulo-2^64 behaviour, and plain `+` would panic on almost every input
/// (including `0`) in builds that have overflow checks enabled, such as
/// debug and test builds. Results are identical to what the unchecked
/// arithmetic produces in release builds.
///
/// NOTE(review): the first step here is `(!key) + (key << 21)`, whereas
/// `mm_hash` and `mm_hash64_avx2` compute `!(key + (key << 21))`. The scalar
/// and AVX2 64-bit paths therefore yield different hashes for the same
/// k-mer; confirm which form is intended before unifying them.
#[inline]
pub fn mm_hash64(kmer: u64) -> u64 {
    let mut key = kmer;
    key = (!key).wrapping_add(key << 21); // (key << 21) - key - 1
    key ^= key >> 24;
    key = key.wrapping_add(key << 3).wrapping_add(key << 8); // key * 265
    key ^= key >> 14;
    key = key.wrapping_add(key << 2).wrapping_add(key << 4); // key * 21
    key ^= key >> 28;
    key.wrapping_add(key << 31)
}
33
/// Applies the 64-bit integer mix to each of the four 64-bit lanes of `kmer`.
///
/// Follows the AVX2 mmhash from skani:
/// https://github.com/bluenote-1577/skani/blob/main/src/avx2_seeding.rs
///
/// Per lane the steps are: `x = !(x + (x << 21))`, `x ^= x >> 24`,
/// `x = x + (x << 3) + (x << 8)`, `x ^= x >> 14`,
/// `x = x + (x << 2) + (x << 4)`, `x ^= x >> 28`, `x = x + (x << 31)`.
///
/// # Safety
///
/// The function is compiled with the `avx2` target feature enabled, so the
/// caller must ensure the running CPU supports AVX2.
#[inline]
#[target_feature(enable = "avx2")]
pub unsafe fn mm_hash64_avx2(kmer: __m256i) -> __m256i {
    // Bitwise NOT is done by XOR-ing with an all-ones mask; comparing a
    // register with itself sets every bit.
    let sum = _mm256_add_epi64(kmer, _mm256_slli_epi64(kmer, 21));
    let all_ones = _mm256_cmpeq_epi64(sum, sum);
    let x = _mm256_xor_si256(sum, all_ones);

    let x = _mm256_xor_si256(x, _mm256_srli_epi64(x, 24));

    // Both shifted terms are taken from the value before either addition.
    let x = _mm256_add_epi64(
        _mm256_add_epi64(x, _mm256_slli_epi64(x, 3)),
        _mm256_slli_epi64(x, 8),
    );
    let x = _mm256_xor_si256(x, _mm256_srli_epi64(x, 14));

    let x = _mm256_add_epi64(
        _mm256_add_epi64(x, _mm256_slli_epi64(x, 2)),
        _mm256_slli_epi64(x, 4),
    );
    let x = _mm256_xor_si256(x, _mm256_srli_epi64(x, 28));

    _mm256_add_epi64(x, _mm256_slli_epi64(x, 31))
}
61
/// Command-line parameters (going by the name; the code that fills this
/// struct is not part of this file).
///
/// `SketchParams::new` and `SketchDist::new` each copy the subset of fields
/// they need out of a `CliParams`.
pub struct CliParams {
    /// Not copied by either `SketchParams::new` or `SketchDist::new`.
    pub mode: String,
    /// Copied into `SketchParams::path`.
    pub path: PathBuf,
    /// Copied into `SketchDist::path_ref_sketch`.
    pub path_ref_sketch: PathBuf,
    /// Copied into `SketchDist::path_query_sketch`.
    pub path_query_sketch: PathBuf,
    /// Copied into both `SketchParams::out_file` and `SketchDist::out_file`.
    pub out_file: PathBuf,

    /// Copied into both `SketchParams::ksize` and `SketchDist::ksize`.
    pub ksize: u8,
    /// Copied into `SketchParams::seed`.
    pub seed: u64,
    /// Copied into `SketchParams::sketch_method`.
    pub sketch_method: String,
    /// Copied into `SketchParams::canonical`.
    pub canonical: bool,
    /// Copied into `SketchParams::device`.
    pub device: String,
    /// Copied into `SketchParams::scaled`.
    pub scaled: u64,
    /// Copied into both `SketchParams::hv_d` and `SketchDist::hv_d`.
    pub hv_d: usize,
    /// Copied into `SketchParams::hv_quant_scale`.
    pub hv_quant_scale: f32,
    /// Copied into `SketchDist::ani_threshold`.
    pub ani_threshold: f32,
    /// Copied into `SketchParams::if_compressed`.
    pub if_compressed: bool,

    /// Not copied by either `SketchParams::new` or `SketchDist::new`.
    pub threads: u8,
}
82
/// Parameters for building sketches.
///
/// Filled from a `CliParams` by `SketchParams::new`; `Sketch::new` then reads
/// the hashing-related fields from it.
pub struct SketchParams {
    /// Copied from `CliParams::path`; not read by `Sketch::new`.
    pub path: PathBuf,
    /// Copied from `CliParams::out_file`; not read by `Sketch::new`.
    pub out_file: PathBuf,
    /// Hash selector; copied into `Sketch::sketch_method`.
    pub sketch_method: String,
    /// Copied into `Sketch::canonical`.
    pub canonical: bool,
    /// Copied from `CliParams::device`; not read by `Sketch::new`.
    pub device: String,
    /// Copied into `Sketch::ksize`.
    pub ksize: u8,
    /// Copied into `Sketch::seed`.
    pub seed: u64,
    /// Copied into `Sketch::scaled`; `Sketch::new` also divides `u64::MAX`
    /// by this value to obtain the hash threshold.
    pub scaled: u64,
    /// Copied into `Sketch::hv_d`.
    pub hv_d: usize,
    /// Copied into `Sketch::hv_quant_scale`.
    pub hv_quant_scale: f32,
    /// Copied from `CliParams::if_compressed`; not read by `Sketch::new`.
    pub if_compressed: bool,
}
96
97impl Default for SketchParams {
98    fn default() -> Self {
99        SketchParams {
100            path: (PathBuf::new()),
101            out_file: (PathBuf::new()),
102            sketch_method: String::from("t1ha2"),
103            canonical: (true),
104            device: String::from("cpu"),
105            ksize: (21),
106            seed: (123),
107            scaled: (1500),
108            hv_d: (4096),
109            hv_quant_scale: (1.0),
110            if_compressed: (true),
111        }
112    }
113}
114
115impl SketchParams {
116    pub fn new(params: &CliParams) -> SketchParams {
117        let mut new_sketch = SketchParams::default();
118        new_sketch.path = params.path.clone();
119        new_sketch.out_file = params.out_file.clone();
120        new_sketch.sketch_method = params.sketch_method.clone();
121        new_sketch.canonical = params.canonical.clone();
122        new_sketch.device = params.device.clone();
123        new_sketch.ksize = params.ksize;
124        new_sketch.seed = params.seed;
125        new_sketch.scaled = params.scaled;
126        new_sketch.hv_d = params.hv_d;
127        new_sketch.hv_quant_scale = params.hv_quant_scale;
128        new_sketch.if_compressed = params.if_compressed;
129        new_sketch
130    }
131}
132
/// In-memory sketch of one input: the configuration it was built with, the
/// set of retained hashes, and hypervector fields.
pub struct Sketch {
    /// Set from the `file` argument of `Sketch::new`.
    pub file_name: String,
    /// Hash selector matched by the `insert_kmer*` methods.
    pub sketch_method: String,
    /// Copied from `SketchParams::canonical`.
    pub canonical: bool,
    /// Copied from `SketchParams::ksize`.
    pub ksize: u8,
    /// Seed passed to the t1ha2 hash by `insert_kmer`.
    pub seed: u64,
    /// Copied from `SketchParams::scaled`.
    pub scaled: u64,
    /// Upper bound (exclusive) for retained hashes; `Sketch::new` sets it to
    /// `u64::MAX / scaled`.
    pub threshold: u64,
    /// Hashes kept by the `insert_kmer*` methods (those below `threshold`).
    pub hash_set: HashSet<u64>,
    /// Copied from `SketchParams::hv_quant_scale`.
    pub hv_quant_scale: f32,
    /// Not written by any method in this file beyond its default of 0.
    pub hv_quant_bits: u8,
    /// Copied from `SketchParams::hv_d`.
    pub hv_d: usize,
    /// Not written by any method in this file beyond its empty default.
    pub hv: Vec<i16>,
    /// Not written by any method in this file beyond its default of 0.
    pub hv_l2_norm_sq: i32,
}
148
149impl Default for Sketch {
150    fn default() -> Self {
151        Sketch {
152            file_name: String::from(""),
153            sketch_method: String::from("xxh3"),
154            canonical: (true),
155            ksize: (21),
156            seed: (123),
157            scaled: (2000),
158            threshold: (u64::MAX / 2000),
159            hash_set: HashSet::default(),
160            hv_quant_scale: (1.0),
161            hv_quant_bits: (0),
162            hv_d: (4096),
163            hv: (vec![]),
164            hv_l2_norm_sq: (0),
165        }
166    }
167}
168
169impl Sketch {
170    pub fn new(file: String, params: &SketchParams) -> Sketch {
171        let mut new_sketch = Sketch::default();
172        new_sketch.file_name = file;
173        new_sketch.sketch_method = params.sketch_method.clone();
174        new_sketch.canonical = params.canonical;
175        new_sketch.ksize = params.ksize;
176        new_sketch.seed = params.seed;
177        new_sketch.scaled = params.scaled;
178        new_sketch.hv_d = params.hv_d;
179        new_sketch.hv_quant_scale = params.hv_quant_scale;
180        new_sketch.threshold = u64::MAX / params.scaled;
181        new_sketch
182    }
183
184    pub fn insert_kmer(&mut self, kmer: &[u8]) {
185        let h = match self.sketch_method.as_str() {
186            "t1ha2" => t1ha::t1ha2_atonce(kmer, self.seed),
187            "mmhash" => mm_hash(kmer) as u64,
188            _ => t1ha::t1ha2_atonce(kmer, self.seed),
189        };
190
191        if h < self.threshold {
192            self.hash_set.insert(h);
193        }
194    }
195
196    pub fn insert_kmer_u64(&mut self, kmer: u64) {
197        let h = match self.sketch_method.as_str() {
198            "t1ha2_64" => t1ha::t1ha2_atonce(&kmer.to_be_bytes(), 123),
199            "mmhash64" => mm_hash64(kmer),
200            _ => t1ha::t1ha2_atonce(&kmer.to_be_bytes(), 123),
201        };
202
203        if h < self.threshold {
204            self.hash_set.insert(h);
205        }
206    }
207
208    pub unsafe fn insert_kmer_u64_avx2(&mut self, kmer: __m256i) {
209        let hash_256 = mm_hash64_avx2(kmer);
210
211        let h1 = _mm256_extract_epi64(hash_256, 0) as u64;
212        let h2 = _mm256_extract_epi64(hash_256, 1) as u64;
213        let h3 = _mm256_extract_epi64(hash_256, 2) as u64;
214        let h4 = _mm256_extract_epi64(hash_256, 3) as u64;
215
216        for h in [h1, h2, h3, h4] {
217            if h > 0 && h < self.threshold {
218                self.hash_set.insert(h);
219            }
220        }
221    }
222}
223
224#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
225pub struct FileSketch {
226    pub ksize: u8,
227    pub scaled: u64,
228    pub canonical: bool,
229    pub seed: u64,
230    pub hv_d: usize,
231    pub hv_quant_bits: u8,
232    pub hv_norm_2: i32,
233    pub file_str: String,
234    pub hv: Vec<i16>,
235}
236
/// Configuration and result storage for comparing sketches.
///
/// Filled from a `CliParams` by `SketchDist::new`.
pub struct SketchDist {
    /// Copied from `CliParams::path_ref_sketch`.
    pub path_ref_sketch: PathBuf,
    /// Copied from `CliParams::path_query_sketch`.
    pub path_query_sketch: PathBuf,
    /// Copied from `CliParams::out_file`.
    pub out_file: PathBuf,
    /// Copied from `CliParams::ksize`.
    pub ksize: u8,
    /// Copied from `CliParams::hv_d`.
    pub hv_d: usize,
    /// Copied from `CliParams::ani_threshold`.
    pub ani_threshold: f32,
    /// Pairs of names with an `f32` value; starts empty. Presumably
    /// (reference, query) file names and their ANI (TODO confirm against
    /// the code that fills it).
    pub file_ani: Vec<((String, String), f32)>,
}
246
247impl Default for SketchDist {
248    fn default() -> Self {
249        SketchDist {
250            path_ref_sketch: (PathBuf::new()),
251            path_query_sketch: (PathBuf::new()),
252            out_file: (PathBuf::new()),
253            ksize: (21),
254            hv_d: (1024),
255            ani_threshold: (85.0),
256            file_ani: (Vec::<((String, String), f32)>::new()),
257        }
258    }
259}
260
261impl SketchDist {
262    pub fn new(params: &CliParams) -> SketchDist {
263        let mut new_dist = SketchDist::default();
264        new_dist.path_ref_sketch = params.path_ref_sketch.clone();
265        new_dist.path_query_sketch = params.path_query_sketch.clone();
266        new_dist.out_file = params.out_file.clone();
267        new_dist.ksize = params.ksize;
268        new_dist.hv_d = params.hv_d;
269        new_dist.ani_threshold = params.ani_threshold;
270        new_dist
271    }
272}