1pub mod dist;
2pub mod fastx_reader;
3pub mod hd;
4pub mod params;
5pub mod sketch;
6pub mod sketch_cuda;
7pub mod types;
8pub mod utils;
9
10#[cfg(test)]
11mod tests {
12 use std::collections::HashSet;
13 use std::default;
14 use std::hash::Hasher;
15
16 use std::process::exit;
17 use std::ptr::eq;
18 use std::time::Instant;
19
20 use needletail::{FastxReader, Sequence};
21 use rand_xoshiro::Xoroshiro64Star;
22 use rayon::iter::Enumerate;
23 use wyhash::{wyrng, WyRng};
24 use xxhash_rust::xxh3::xxh3_64_with_seed;
25
26 use crate::hd;
27 use crate::{
28 hd::{compress_hd_sketch, decompress_hd_sketch},
29 types::{FileSketch, Sketch},
30 };
31
32 #[test]
33 fn test_ahash() {
34 use ahash::AHasher;
35 use rand::{RngCore, SeedableRng};
36 use t1ha;
37
38 let test_kmer = "AAAAAAAAAAGGGGGGGGGG";
39
40 let start = Instant::now();
41 let mut hasher = AHasher::default();
42 for _ in 0..100000 {
43 hasher.write(test_kmer.as_bytes());
44 hasher.finish();
45 }
46 let duration = start.elapsed();
47 println!("Time elapsed in ahash is: {:?}", duration);
48
49 let start = Instant::now();
50 let mut h = wyhash::wyhash(test_kmer.as_bytes(), 42);
51 for _ in 0..100000 {
52 h = wyhash::wyhash(&h.to_ne_bytes(), 42);
53 }
54 let duration = start.elapsed();
55 println!("Time elapsed in wyhash2 is: {:?}", duration);
56
57 let start = Instant::now();
58 let mut h = t1ha::t1ha0(test_kmer.as_bytes(), 42);
59 for _ in 0..100000 {
60 h = t1ha::t1ha0(&h.to_ne_bytes(), 42);
61 }
62 let duration = start.elapsed();
63 println!("Time elapsed in t1ha0 is: {:?}", duration);
64
65 let start = Instant::now();
66 let mut h = t1ha::t1ha1(test_kmer.as_bytes(), 42);
67 for _ in 0..100000 {
68 h = t1ha::t1ha1(&h.to_ne_bytes(), 42);
69 }
70 let duration = start.elapsed();
71 println!("Time elapsed in t1ha1 is: {:?}", duration);
72
73 let start = Instant::now();
74 let mut h = t1ha::t1ha2_atonce(test_kmer.as_bytes(), 42);
75 for _ in 0..100000 {
76 h = t1ha::t1ha2_atonce(&h.to_ne_bytes(), 42);
77 }
78 let duration = start.elapsed();
79 println!("Time elapsed in t1ha2 is: {:?}", duration);
80
81 let start = Instant::now();
82 for i in 0..10000 {
83 let mut rng = Xoroshiro64Star::seed_from_u64(i);
84 let mut hv = vec![0; 1024];
85 for i in 0..(1024 / 64) {
86 let rnd_btis = rng.next_u64();
87
88 for j in 0..64 {
89 hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
90 }
91 }
92 }
93 let duration = start.elapsed();
94 println!("Time elapsed in Xoroshiro64Star is: {:?}", duration);
95
96 let start = Instant::now();
97 for i in 0..10000 {
98 let mut rng = WyRng::seed_from_u64(i);
99 let mut hv = vec![0; 1024];
100 for i in 0..(1024 / 64) {
101 let rnd_btis = rng.next_u64();
102
103 for j in 0..64 {
104 hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
105 }
106 }
107 }
108 let duration = start.elapsed();
109 println!("Time elapsed in WyRng is: {:?}", duration);
110
111 let start = Instant::now();
112 for i in 0..10000 as u64 {
113 let rng = t1ha::t1ha1(&i.to_ne_bytes(), 42);
114 let mut hv = vec![0; 1024];
115 for i in 0..(1024 / 64) {
116 let rnd_btis = t1ha::t1ha1(&rng.to_ne_bytes(), 42);
117
118 for j in 0..64 {
119 hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
120 }
121 }
122 }
123 let duration = start.elapsed();
124 println!("Time elapsed in t1ha1 is: {:?}", duration);
125 }
126
127 #[test]
128 fn test_kmer_bit_pack() {
129 use needletail::{bitkmer, parse_fastx_file, Sequence};
130
131 let file = "./test/test.fna";
132 let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
133
134 while let Some(record) = fastx_reader.next() {
135 let seq = record.expect("invalid record");
136
137 println!("{:?}", String::from_utf8(seq.seq().to_vec()));
138 for i in seq.bit_kmers(5, true) {
139 println!("{:?}\t{:?}", i, 0);
140 }
141 }
142 }
143
144 #[test]
145 fn test_fast_xxhash() {
146 use needletail::parse_fastx_file;
147 use xxhash_rust::xxh3::xxh3_64;
148
149 let file = "./test/test.fna";
150 let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
151
152 while let Some(record) = fastx_reader.next() {
153 let seq = record.expect("invalid record");
154 let norm_seq = seq.normalize(false);
155
156 let rc = norm_seq.reverse_complement();
157
158 println!("{:?}", String::from_utf8(norm_seq.to_vec()));
159 for (_, kmer, _) in norm_seq.canonical_kmers(5, &rc) {
160 let h = xxh3_64(kmer);
161 println!("{:?}\t{:?}", String::from_utf8(kmer.to_vec()), h);
162 }
163 }
164 }
165
166 #[test]
167 fn test_vec_norm() {
168 let a = vec![1, 2, 3, -4];
169 let l2_norm_sq = a.iter().fold(0, |sum: u64, &num| sum + (num * num) as u64);
170
171 println!("L2 norm square = {}", l2_norm_sq);
172 }
173
174 fn dot_product(x: &Vec<i16>, y: &Vec<i16>) -> i32 {
175 assert_eq!(x.len(), y.len());
176
177 let mut ip: i32 = 0;
178 for i in 0..x.len() {
179 ip += x[i] as i32 * y[i] as i32;
180 }
181 ip
182 }
183
184 #[test]
185 fn test_simd_hd_enc() {
186 use crate::hd;
187
188 let hv_dim = 8192;
189 let mut sketch_A = Sketch::default();
190 sketch_A.hv_d = hv_dim;
191 for i in 0..123 {
192 sketch_A.hash_set.insert(i);
193 }
194
195 let mut sketch_B = Sketch::default();
196 sketch_B.hv_d = hv_dim;
197 for i in 10..456 {
198 sketch_B.hash_set.insert(i);
199 }
200
201 unsafe {
202 hd::encode_hash_hd(&mut sketch_A);
203 hd::encode_hash_hd(&mut sketch_B);
204 let norm_A = dot_product(&sketch_A.hv, &sketch_A.hv);
205 let norm_B = dot_product(&sketch_B.hv, &sketch_B.hv);
206 println!(
207 "HV norm using scalar encoding: A={:?}, B={:?}",
208 norm_A, norm_B
209 );
210 let ip_scalar = dot_product(&sketch_A.hv, &sketch_B.hv);
211 println!("HV IP using scalar encoding: {:?}", ip_scalar);
212
213 hd::encode_hash_hd_avx2(&mut sketch_A);
214 hd::encode_hash_hd_avx2(&mut sketch_B);
215 let norm_A = dot_product(&sketch_A.hv, &sketch_A.hv);
216 let norm_B = dot_product(&sketch_B.hv, &sketch_B.hv);
217 println!(
218 "\nHV norm using SIMD encoding: A={:?}, B={:?}",
219 norm_A, norm_B
220 );
221 let ip_simd = dot_product(&sketch_A.hv, &sketch_B.hv);
222 println!("HV IP using SIMD encoding: {:?}", ip_simd);
223
224 assert_eq!(ip_scalar, ip_simd);
225 }
226 }
227
228 #[test]
229 fn test_bit_vec() {
230 use rand::Rng;
231 extern crate bitpacking;
232 use bitpacking::{BitPacker, BitPacker8x};
233
234 let mut rng = rand::thread_rng();
235
236 let hv_dim: usize = 4096 * 1024;
237 let hv: Vec<i16> = (0..hv_dim).map(|_| rng.gen_range(-600..600)).collect();
238
239 let mut sketch = Sketch::default();
241 sketch.hv_d = hv_dim;
242 sketch.hv = hv.clone();
243 let tstart = Instant::now();
244 unsafe {
245 hd::compress_hd_sketch(&mut sketch);
246 }
247 println!(
248 "Scalar HV compression took {:.3}s",
249 tstart.elapsed().as_secs_f32()
250 );
251 let hv_compressed = sketch.hv.clone();
252
253 let tstart = Instant::now();
255 let hv_decompressed: Vec<i16> =
256 unsafe { hd::decompress_hd_sketch(&hv_compressed, hv_dim, sketch.hv_quant_bits) };
257 println!(
258 "Naive HV decompression took {:.3}s",
259 tstart.elapsed().as_secs_f32()
260 );
261 assert_eq!(&hv_decompressed, &hv);
262
263 let quant_bit: u8 = sketch.hv_quant_bits;
264 println!("num_bits = {}", quant_bit);
265
266 let hv: Vec<u32> = vec![77; hv_dim];
268 let bitpacker = BitPacker8x::new();
269 let tstart = Instant::now();
270 let mut hv_compressed = vec![0u8; (quant_bit as usize) * (hv_dim >> 3)];
271 let bits_per_block = quant_bit as usize * 32;
272 for i in 0..(hv_dim / 256) {
273 bitpacker.compress(
274 &hv[i * 256..(i + 1) * 256],
275 &mut hv_compressed[(bits_per_block * i)..(bits_per_block * (i + 1))],
276 quant_bit,
277 );
278 }
279 println!(
280 "SIMD HV compression took {:.3}s",
281 tstart.elapsed().as_secs_f32()
282 );
283
284 let mut hv_decompressed: Vec<u32> = vec![0; hv_dim];
286 let tstart = Instant::now();
287 for i in 0..(hv_dim / BitPacker8x::BLOCK_LEN) {
288 bitpacker.decompress(
289 &hv_compressed[(bits_per_block * i)..(bits_per_block * (i + 1))],
290 &mut hv_decompressed[i * BitPacker8x::BLOCK_LEN..(i + 1) * BitPacker8x::BLOCK_LEN],
291 quant_bit,
292 );
293 }
294 println!(
295 "SIMD HV decompression took {:.3}s",
296 tstart.elapsed().as_secs_f32()
297 );
298 assert_eq!(&hv, &hv_decompressed);
299 }
300
301 #[test]
302 fn test_xxhash_rng() {
303 let mut rng = xxhash_rust::xxh3::Xxh3::with_seed(1);
304
305 for i in 0..4 {
306 let h = rng.digest();
307 rng.write_u64(h);
308 println!("{}", h);
309 }
310 }
311
312 #[test]
313 fn test_minmer() {
314 use needletail::sequence::minimizer;
315 use rust_seq2kminmers::KminmersIterator;
316
317 let seq = b"AACTGCACTGCACTGCACTGCACACTGCACTGCACTGCACTGCACACTGCACTGCACTGACTGCACTGCACTGCACTGCACTGCCTGC";
318
319 let iter = KminmersIterator::new(seq, 4, 7, 0.1, false).unwrap();
320 for kminmer in iter {
321 println!("kminmer: {:?}", kminmer);
322 println!("{:?}", kminmer.mers()[0]);
323 }
324
325 }
328
329 #[test]
330 fn test_minimizer_strobemer() {
331 use needletail::{bitkmer, parse_fastx_file, Sequence};
332 use rust_seq2kminmers::KminmersIterator;
333 use std::collections::HashSet;
334 use std::time;
335
336 fn Jaccard_to_ANI(J: f32, k: usize) -> f32 {
337 let ani: f32 = 1.0 + (2.0 / (1.0 / J + 1.0)).ln() / (k as f32);
338 ani
339 }
340
341 let file_query = "./test/Escherichia_coli_0_1288_GCA_000303255.LargeContigs.fna";
343 let file_ref = "./test/Escherichia_coli_HVH_217__4_1022806__GCA_000459375.LargeContigs.fna";
345 let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
346
347 let w = 20;
348 let k = 21;
349 let density = 0.01;
350
351 for (i, file) in [file_query, file_ref].iter().enumerate() {
352 let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
353
354 while let Some(record) = fastx_reader.next() {
355 let seq = record.expect("invalid record");
356 let seq = seq.normalize(false);
357
358 let iter = KminmersIterator::new(seq.sequence(), k, w, density, false).unwrap();
359 for kminmer in iter {
360 sketch_vecs[i].insert(kminmer.mers()[0]);
361 }
364 }
370 println!("{:?}", sketch_vecs[i].len());
371 }
372
373 let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
374 let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
375 let J = (i as f32) / u as f32;
376 let ANI = Jaccard_to_ANI(J, k);
377
378 println!("Minmer: J = {}\t ANI = {}", J, ANI);
379
380 let scaled = 2000;
382 let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
383
384 let now = time::Instant::now();
385 for (i, file) in [file_query, file_ref].iter().enumerate() {
386 let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
387
388 while let Some(record) = fastx_reader.next() {
389 let seq = record.expect("invalid record");
390 let norm_seq = seq.normalize(false);
391
392 for kmer in norm_seq.kmers(k as u8) {
401 let h = xxh3_64_with_seed(kmer, 1);
402 if h < u64::MAX / scaled {
403 sketch_vecs[i].insert(h);
404 }
405 }
406
407 }
414 println!("{:?}", sketch_vecs[i].len());
415 }
416
417 let elapsed = now.elapsed();
418
419 let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
420 let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
421 let J = (i as f32) / u as f32;
422 let ANI = Jaccard_to_ANI(J, k);
423
424 println!(
425 "kmer: J = {}\t ANI = {}, elapsed time = {:?}",
426 J, ANI, elapsed
427 );
428 }
430
431 #[test]
432 fn test_bitkmer_cuda() {
433 use glob::glob;
434 use std::path::{Path, PathBuf};
435
436 use crate::{fastx_reader, sketch_cuda, types};
437 use needletail::{bitkmer, parse_fastx_file, Sequence};
438
439 const SEQ_NT4_TABLE: [u8; 256] = [
440 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
441 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
442 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4,
443 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
444 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
445 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
446 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
447 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
448 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
449 ];
450
451 let fna_file = "./test2/test.fna";
453
454 let k: u8 = 21;
456 let scaled = 1000;
457 let mut fastx_reader = parse_fastx_file(&fna_file).expect("Opening .fna files failed");
458
459 while let Some(record) = fastx_reader.next() {
460 let seqrec = record.expect("invalid sequence");
461
462 let norm_seq = seqrec.normalize(false);
463
464 for (_, (kmer_u64, _), _) in norm_seq.bit_kmers(k, true) {
465 }
469 }
470
471 let fna_path = "../genome-HD/dna-dataset/D3";
473 let hashset_cpu =
474 sketch_cuda::sketch_cpu_parallel(&fna_path.to_string(), k as usize, scaled);
475
476 let hashset_gpu = sketch_cuda::cuda_mmhash_bitpack_parallel(
477 &fna_path.to_string(),
478 k as usize,
479 true,
480 scaled,
481 );
482
483 let files: Vec<_> = glob(Path::new(&fna_path).join("*.fna").to_str().unwrap())
485 .expect("Failed to read glob pattern")
486 .collect();
487
488 for i in 0..hashset_cpu.len() {
489 let mut vec_cpu: Vec<_> = hashset_cpu[i].clone().into_iter().collect();
490 let mut vec_gpu: Vec<_> = hashset_gpu[i].clone().into_iter().collect();
491
492 vec_cpu.sort();
493 vec_gpu.sort();
494
495 let inter: Vec<_> = hashset_cpu[i]
496 .intersection(&hashset_gpu[i])
497 .into_iter()
498 .collect();
499 let overlap_ratio = inter.len() as f32 / hashset_cpu[i].len() as f32;
500
501 if overlap_ratio < 1.0 {
502 println!("File {:?} Ratio = {:.4}", files[i], overlap_ratio);
503 }
504
505 }
515 }
516
517 #[test]
518 fn test_t1ha2_cuda() {
519 use crate::sketch_cuda;
520 use needletail::parse_fastx_file;
521 use t1ha;
522
523 let fna_file = "./test2/test.fna";
524 let mut fastx_reader = parse_fastx_file(&fna_file).expect("Opening .fna files failed");
526
527 let k: u8 = 21;
528 let scaled = 1000;
529 let seed: u64 = 123;
531 let canonical = true;
532
533 let mut hash_cpu = Vec::<u64>::new();
534 while let Some(record) = fastx_reader.next() {
535 let seqrec = record.expect("invalid sequence");
536
537 let norm_seq = seqrec.normalize(false);
538
539 let rc = norm_seq.reverse_complement();
540
541 for (_, kmer, can) in norm_seq.canonical_kmers(k, &rc) {
563 let hash = t1ha::t1ha2_atonce(kmer, seed);
564
565 if hash < u64::MAX / scaled {
566 hash_cpu.push(hash);
567 }
574 }
575 }
576
577 hash_cpu.sort();
578 hash_cpu.dedup();
579 let p = "./test2".to_string();
582 let hash_gpu =
583 sketch_cuda::cuda_t1ha2_hash_parallel(&p, k as usize, canonical, scaled, seed);
584 let mut hash_gpu: Vec<u64> = hash_gpu[0].clone().into_iter().collect();
587 hash_gpu.sort();
588 assert_eq!(hash_cpu, hash_gpu);
589 }
590
591 #[test]
592 fn test_set_vec_dedup() {
593 use rand::Rng;
594
595 let n = 100000;
596 let mut rng = rand::thread_rng();
597 let mut vals: Vec<u64> = (0..n).map(|_| rng.gen_range(0..5000)).collect();
598
599 let start = Instant::now();
600 vals.sort();
601 vals.dedup();
602 println!("dedup time = {:?}", start.elapsed());
603
604 let vals: Vec<u64> = (0..n).map(|_| rng.gen_range(0..5000)).collect();
605 let start = Instant::now();
606 let mut sketch_kmer_set = HashSet::<u64>::default();
607 for h in vals {
608 if h != 0 {
609 sketch_kmer_set.insert(h);
610 }
611 }
612 println!("set time = {:?}", start.elapsed());
613 }
614}