hyper_gen/
lib.rs

1pub mod dist;
2pub mod fastx_reader;
3pub mod hd;
4pub mod params;
5pub mod sketch;
6pub mod sketch_cuda;
7pub mod types;
8pub mod utils;
9
10#[cfg(test)]
11mod tests {
12    use std::collections::HashSet;
13    use std::default;
14    use std::hash::Hasher;
15
16    use std::process::exit;
17    use std::ptr::eq;
18    use std::time::Instant;
19
20    use needletail::{FastxReader, Sequence};
21    use rand_xoshiro::Xoroshiro64Star;
22    use rayon::iter::Enumerate;
23    use wyhash::{wyrng, WyRng};
24    use xxhash_rust::xxh3::xxh3_64_with_seed;
25
26    use crate::hd;
27    use crate::{
28        hd::{compress_hd_sketch, decompress_hd_sketch},
29        types::{FileSketch, Sketch},
30    };
31
32    #[test]
33    fn test_ahash() {
34        use ahash::AHasher;
35        use rand::{RngCore, SeedableRng};
36        use t1ha;
37
38        let test_kmer = "AAAAAAAAAAGGGGGGGGGG";
39
40        let start = Instant::now();
41        let mut hasher = AHasher::default();
42        for _ in 0..100000 {
43            hasher.write(test_kmer.as_bytes());
44            hasher.finish();
45        }
46        let duration = start.elapsed();
47        println!("Time elapsed in ahash is: {:?}", duration);
48
49        let start = Instant::now();
50        let mut h = wyhash::wyhash(test_kmer.as_bytes(), 42);
51        for _ in 0..100000 {
52            h = wyhash::wyhash(&h.to_ne_bytes(), 42);
53        }
54        let duration = start.elapsed();
55        println!("Time elapsed in wyhash2 is: {:?}", duration);
56
57        let start = Instant::now();
58        let mut h = t1ha::t1ha0(test_kmer.as_bytes(), 42);
59        for _ in 0..100000 {
60            h = t1ha::t1ha0(&h.to_ne_bytes(), 42);
61        }
62        let duration = start.elapsed();
63        println!("Time elapsed in t1ha0 is: {:?}", duration);
64
65        let start = Instant::now();
66        let mut h = t1ha::t1ha1(test_kmer.as_bytes(), 42);
67        for _ in 0..100000 {
68            h = t1ha::t1ha1(&h.to_ne_bytes(), 42);
69        }
70        let duration = start.elapsed();
71        println!("Time elapsed in t1ha1 is: {:?}", duration);
72
73        let start = Instant::now();
74        let mut h = t1ha::t1ha2_atonce(test_kmer.as_bytes(), 42);
75        for _ in 0..100000 {
76            h = t1ha::t1ha2_atonce(&h.to_ne_bytes(), 42);
77        }
78        let duration = start.elapsed();
79        println!("Time elapsed in t1ha2 is: {:?}", duration);
80
81        let start = Instant::now();
82        for i in 0..10000 {
83            let mut rng = Xoroshiro64Star::seed_from_u64(i);
84            let mut hv = vec![0; 1024];
85            for i in 0..(1024 / 64) {
86                let rnd_btis = rng.next_u64();
87
88                for j in 0..64 {
89                    hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
90                }
91            }
92        }
93        let duration = start.elapsed();
94        println!("Time elapsed in Xoroshiro64Star is: {:?}", duration);
95
96        let start = Instant::now();
97        for i in 0..10000 {
98            let mut rng = WyRng::seed_from_u64(i);
99            let mut hv = vec![0; 1024];
100            for i in 0..(1024 / 64) {
101                let rnd_btis = rng.next_u64();
102
103                for j in 0..64 {
104                    hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
105                }
106            }
107        }
108        let duration = start.elapsed();
109        println!("Time elapsed in WyRng is: {:?}", duration);
110
111        let start = Instant::now();
112        for i in 0..10000 as u64 {
113            let rng = t1ha::t1ha1(&i.to_ne_bytes(), 42);
114            let mut hv = vec![0; 1024];
115            for i in 0..(1024 / 64) {
116                let rnd_btis = t1ha::t1ha1(&rng.to_ne_bytes(), 42);
117
118                for j in 0..64 {
119                    hv[i * 64 + j] += (((rnd_btis >> j) & 1) << 1) as i16;
120                }
121            }
122        }
123        let duration = start.elapsed();
124        println!("Time elapsed in t1ha1 is: {:?}", duration);
125    }
126
127    #[test]
128    fn test_kmer_bit_pack() {
129        use needletail::{bitkmer, parse_fastx_file, Sequence};
130
131        let file = "./test/test.fna";
132        let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
133
134        while let Some(record) = fastx_reader.next() {
135            let seq = record.expect("invalid record");
136
137            println!("{:?}", String::from_utf8(seq.seq().to_vec()));
138            for i in seq.bit_kmers(5, true) {
139                println!("{:?}\t{:?}", i, 0);
140            }
141        }
142    }
143
144    #[test]
145    fn test_fast_xxhash() {
146        use needletail::parse_fastx_file;
147        use xxhash_rust::xxh3::xxh3_64;
148
149        let file = "./test/test.fna";
150        let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
151
152        while let Some(record) = fastx_reader.next() {
153            let seq = record.expect("invalid record");
154            let norm_seq = seq.normalize(false);
155
156            let rc = norm_seq.reverse_complement();
157
158            println!("{:?}", String::from_utf8(norm_seq.to_vec()));
159            for (_, kmer, _) in norm_seq.canonical_kmers(5, &rc) {
160                let h = xxh3_64(kmer);
161                println!("{:?}\t{:?}", String::from_utf8(kmer.to_vec()), h);
162            }
163        }
164    }
165
166    #[test]
167    fn test_vec_norm() {
168        let a = vec![1, 2, 3, -4];
169        let l2_norm_sq = a.iter().fold(0, |sum: u64, &num| sum + (num * num) as u64);
170
171        println!("L2 norm square = {}", l2_norm_sq);
172    }
173
174    fn dot_product(x: &Vec<i16>, y: &Vec<i16>) -> i32 {
175        assert_eq!(x.len(), y.len());
176
177        let mut ip: i32 = 0;
178        for i in 0..x.len() {
179            ip += x[i] as i32 * y[i] as i32;
180        }
181        ip
182    }
183
184    #[test]
185    fn test_simd_hd_enc() {
186        use crate::hd;
187
188        let hv_dim = 8192;
189        let mut sketch_A = Sketch::default();
190        sketch_A.hv_d = hv_dim;
191        for i in 0..123 {
192            sketch_A.hash_set.insert(i);
193        }
194
195        let mut sketch_B = Sketch::default();
196        sketch_B.hv_d = hv_dim;
197        for i in 10..456 {
198            sketch_B.hash_set.insert(i);
199        }
200
201        unsafe {
202            hd::encode_hash_hd(&mut sketch_A);
203            hd::encode_hash_hd(&mut sketch_B);
204            let norm_A = dot_product(&sketch_A.hv, &sketch_A.hv);
205            let norm_B = dot_product(&sketch_B.hv, &sketch_B.hv);
206            println!(
207                "HV norm using scalar encoding: A={:?}, B={:?}",
208                norm_A, norm_B
209            );
210            let ip_scalar = dot_product(&sketch_A.hv, &sketch_B.hv);
211            println!("HV IP using scalar encoding: {:?}", ip_scalar);
212
213            hd::encode_hash_hd_avx2(&mut sketch_A);
214            hd::encode_hash_hd_avx2(&mut sketch_B);
215            let norm_A = dot_product(&sketch_A.hv, &sketch_A.hv);
216            let norm_B = dot_product(&sketch_B.hv, &sketch_B.hv);
217            println!(
218                "\nHV norm using SIMD encoding: A={:?}, B={:?}",
219                norm_A, norm_B
220            );
221            let ip_simd = dot_product(&sketch_A.hv, &sketch_B.hv);
222            println!("HV IP using SIMD encoding: {:?}", ip_simd);
223
224            assert_eq!(ip_scalar, ip_simd);
225        }
226    }
227
228    #[test]
229    fn test_bit_vec() {
230        use rand::Rng;
231        extern crate bitpacking;
232        use bitpacking::{BitPacker, BitPacker8x};
233
234        let mut rng = rand::thread_rng();
235
236        let hv_dim: usize = 4096 * 1024;
237        let hv: Vec<i16> = (0..hv_dim).map(|_| rng.gen_range(-600..600)).collect();
238
239        // 1.1 Scalar Bit Packing
240        let mut sketch = Sketch::default();
241        sketch.hv_d = hv_dim;
242        sketch.hv = hv.clone();
243        let tstart = Instant::now();
244        unsafe {
245            hd::compress_hd_sketch(&mut sketch);
246        }
247        println!(
248            "Scalar HV compression took {:.3}s",
249            tstart.elapsed().as_secs_f32()
250        );
251        let hv_compressed = sketch.hv.clone();
252
253        // 1.2 Scalar Bit Unpacking
254        let tstart = Instant::now();
255        let hv_decompressed: Vec<i16> =
256            unsafe { hd::decompress_hd_sketch(&hv_compressed, hv_dim, sketch.hv_quant_bits) };
257        println!(
258            "Naive HV decompression took {:.3}s",
259            tstart.elapsed().as_secs_f32()
260        );
261        assert_eq!(&hv_decompressed, &hv);
262
263        let quant_bit: u8 = sketch.hv_quant_bits;
264        println!("num_bits = {}", quant_bit);
265
266        // 2.1 SIMD Bit Packing
267        let hv: Vec<u32> = vec![77; hv_dim];
268        let bitpacker = BitPacker8x::new();
269        let tstart = Instant::now();
270        let mut hv_compressed = vec![0u8; (quant_bit as usize) * (hv_dim >> 3)];
271        let bits_per_block = quant_bit as usize * 32;
272        for i in 0..(hv_dim / 256) {
273            bitpacker.compress(
274                &hv[i * 256..(i + 1) * 256],
275                &mut hv_compressed[(bits_per_block * i)..(bits_per_block * (i + 1))],
276                quant_bit,
277            );
278        }
279        println!(
280            "SIMD HV compression took {:.3}s",
281            tstart.elapsed().as_secs_f32()
282        );
283
284        // 2.2 SIMD Bit Unpacking
285        let mut hv_decompressed: Vec<u32> = vec![0; hv_dim];
286        let tstart = Instant::now();
287        for i in 0..(hv_dim / BitPacker8x::BLOCK_LEN) {
288            bitpacker.decompress(
289                &hv_compressed[(bits_per_block * i)..(bits_per_block * (i + 1))],
290                &mut hv_decompressed[i * BitPacker8x::BLOCK_LEN..(i + 1) * BitPacker8x::BLOCK_LEN],
291                quant_bit,
292            );
293        }
294        println!(
295            "SIMD HV decompression took {:.3}s",
296            tstart.elapsed().as_secs_f32()
297        );
298        assert_eq!(&hv, &hv_decompressed);
299    }
300
301    #[test]
302    fn test_xxhash_rng() {
303        let mut rng = xxhash_rust::xxh3::Xxh3::with_seed(1);
304
305        for i in 0..4 {
306            let h = rng.digest();
307            rng.write_u64(h);
308            println!("{}", h);
309        }
310    }
311
312    #[test]
313    fn test_minmer() {
314        use needletail::sequence::minimizer;
315        use rust_seq2kminmers::KminmersIterator;
316
317        let seq = b"AACTGCACTGCACTGCACTGCACACTGCACTGCACTGCACTGCACACTGCACTGCACTGACTGCACTGCACTGCACTGCACTGCCTGC";
318
319        let iter = KminmersIterator::new(seq, 4, 7, 0.1, false).unwrap();
320        for kminmer in iter {
321            println!("kminmer: {:?}", kminmer);
322            println!("{:?}", kminmer.mers()[0]);
323        }
324
325        // let mz = minimizer(seq, 5);
326        // println!("{:?}", mz);
327    }
328
329    #[test]
330    fn test_minimizer_strobemer() {
331        use needletail::{bitkmer, parse_fastx_file, Sequence};
332        use rust_seq2kminmers::KminmersIterator;
333        use std::collections::HashSet;
334        use std::time;
335
336        fn Jaccard_to_ANI(J: f32, k: usize) -> f32 {
337            let ani: f32 = 1.0 + (2.0 / (1.0 / J + 1.0)).ln() / (k as f32);
338            ani
339        }
340
341        // use minimizer for sampling
342        let file_query = "./test/Escherichia_coli_0_1288_GCA_000303255.LargeContigs.fna";
343        // let file_ref = "./test/Escherichia_coli_2871950_GCA_000355455.LargeContigs.fna";
344        let file_ref = "./test/Escherichia_coli_HVH_217__4_1022806__GCA_000459375.LargeContigs.fna";
345        let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
346
347        let w = 20;
348        let k = 21;
349        let density = 0.01;
350
351        for (i, file) in [file_query, file_ref].iter().enumerate() {
352            let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
353
354            while let Some(record) = fastx_reader.next() {
355                let seq = record.expect("invalid record");
356                let seq = seq.normalize(false);
357
358                let iter = KminmersIterator::new(seq.sequence(), k, w, density, false).unwrap();
359                for kminmer in iter {
360                    sketch_vecs[i].insert(kminmer.mers()[0]);
361                    // println!("kminmer: {:?}", kminmer);
362                    // println!("{:?}", kminmer.mers()[0]);
363                }
364                // println!("{:?}", String::from_utf8(seq.seq().to_vec()));
365                // for i in seq.bit_kmers(5, true) {
366                //     println!("{:?}\t{:?}", i, 0);
367                // }
368                // return;
369            }
370            println!("{:?}", sketch_vecs[i].len());
371        }
372
373        let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
374        let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
375        let J = (i as f32) / u as f32;
376        let ANI = Jaccard_to_ANI(J, k);
377
378        println!("Minmer: J = {}\t ANI = {}", J, ANI);
379
380        //
381        let scaled = 2000;
382        let mut sketch_vecs: Vec<HashSet<u64>> = vec![HashSet::<u64>::default(); 2];
383
384        let now = time::Instant::now();
385        for (i, file) in [file_query, file_ref].iter().enumerate() {
386            let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
387
388            while let Some(record) = fastx_reader.next() {
389                let seq = record.expect("invalid record");
390                let norm_seq = seq.normalize(false);
391
392                // let rc = norm_seq.reverse_complement();
393                // for (_, kmer, _) in norm_seq.canonical_kmers(k as u8, &rc) {
394                //     let h = xxh3_64_with_seed(kmer, 1);
395                //     if h < u64::MAX / scaled {
396                //         sketch_vecs[i].insert(h);
397                //     }
398                // }
399
400                for kmer in norm_seq.kmers(k as u8) {
401                    let h = xxh3_64_with_seed(kmer, 1);
402                    if h < u64::MAX / scaled {
403                        sketch_vecs[i].insert(h);
404                    }
405                }
406
407                // for (_, (kmer_u64, _), _) in norm_seq.bit_kmers(k as u8, true) {
408                // let h = xxh3_64_with_seed(&kmer_u64.to_be_bytes(), 1);
409                // if h < u64::MAX / scaled {
410                //     sketch_vecs[i].insert(h);
411                // }
412                // }
413            }
414            println!("{:?}", sketch_vecs[i].len());
415        }
416
417        let elapsed = now.elapsed();
418
419        let i = sketch_vecs[0].intersection(&sketch_vecs[1]).count();
420        let u = sketch_vecs[0].union(&sketch_vecs[1]).count();
421        let J = (i as f32) / u as f32;
422        let ANI = Jaccard_to_ANI(J, k);
423
424        println!(
425            "kmer: J = {}\t ANI = {}, elapsed time = {:?}",
426            J, ANI, elapsed
427        );
428        // use strobemer for sampling
429    }
430
431    #[test]
432    fn test_bitkmer_cuda() {
433        use glob::glob;
434        use std::path::{Path, PathBuf};
435
436        use crate::{fastx_reader, sketch_cuda, types};
437        use needletail::{bitkmer, parse_fastx_file, Sequence};
438
439        const SEQ_NT4_TABLE: [u8; 256] = [
440            0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
441            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
442            4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4,
443            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
444            3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
445            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
446            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
447            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
448            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
449        ];
450
451        // let fna_file = "./test/Escherichia_coli_HVH_217__4_1022806__GCA_000459375.LargeContigs.fna";
452        let fna_file = "./test2/test.fna";
453
454        // Ground-truth Bit Kmer
455        let k: u8 = 21;
456        let scaled = 1000;
457        let mut fastx_reader = parse_fastx_file(&fna_file).expect("Opening .fna files failed");
458
459        while let Some(record) = fastx_reader.next() {
460            let seqrec = record.expect("invalid sequence");
461
462            let norm_seq = seqrec.normalize(false);
463
464            for (_, (kmer_u64, _), _) in norm_seq.bit_kmers(k, true) {
465                // sketch.insert_kmer_u64(kmer_u64);
466                // let h = types::mm_hash64(kmer_u64);
467                // println!("{} : {}", kmer_u64, h);
468            }
469        }
470
471        // let fna_path = "./test2";
472        let fna_path = "../genome-HD/dna-dataset/D3";
473        let hashset_cpu =
474            sketch_cuda::sketch_cpu_parallel(&fna_path.to_string(), k as usize, scaled);
475
476        let hashset_gpu = sketch_cuda::cuda_mmhash_bitpack_parallel(
477            &fna_path.to_string(),
478            k as usize,
479            true,
480            scaled,
481        );
482
483        // get files
484        let files: Vec<_> = glob(Path::new(&fna_path).join("*.fna").to_str().unwrap())
485            .expect("Failed to read glob pattern")
486            .collect();
487
488        for i in 0..hashset_cpu.len() {
489            let mut vec_cpu: Vec<_> = hashset_cpu[i].clone().into_iter().collect();
490            let mut vec_gpu: Vec<_> = hashset_gpu[i].clone().into_iter().collect();
491
492            vec_cpu.sort();
493            vec_gpu.sort();
494
495            let inter: Vec<_> = hashset_cpu[i]
496                .intersection(&hashset_gpu[i])
497                .into_iter()
498                .collect();
499            let overlap_ratio = inter.len() as f32 / hashset_cpu[i].len() as f32;
500
501            if overlap_ratio < 1.0 {
502                println!("File {:?} Ratio = {:.4}", files[i], overlap_ratio);
503            }
504
505            // if !eq(&vec_cpu, &vec_gpu) {
506            // println!("{:?}", vec_cpu);
507            // println!("{:?}", vec_gpu);
508            // assert!(false, "File {:?} unmatched!", files[i].as_ref().unwrap());
509
510            // }
511
512            // println!("{}: {} {}", i, hashset_cpu[i].len(), hashset_gpu[i].len());
513            // assert_eq!(hashset_cpu[i], hashset_gpu[i], "{i} is wrong!");
514        }
515    }
516
517    #[test]
518    fn test_t1ha2_cuda() {
519        use crate::sketch_cuda;
520        use needletail::parse_fastx_file;
521        use t1ha;
522
523        let fna_file = "./test2/test.fna";
524        // let fna_file = "./test2/Acaryochloris_marina_MBIC11017.LargeContigs.fna";
525        let mut fastx_reader = parse_fastx_file(&fna_file).expect("Opening .fna files failed");
526
527        let k: u8 = 21;
528        let scaled = 1000;
529        // let scaled = 100000;
530        let seed: u64 = 123;
531        let canonical = true;
532
533        let mut hash_cpu = Vec::<u64>::new();
534        while let Some(record) = fastx_reader.next() {
535            let seqrec = record.expect("invalid sequence");
536
537            let norm_seq = seqrec.normalize(false);
538
539            let rc = norm_seq.reverse_complement();
540
541            // for i in norm_seq.kmers(k) {
542            //     let hash = t1ha::t1ha2_atonce(i, seed);
543            //     // hash_cpu.push(hash);
544
545            //     println!(
546            //         "fwd: {:?} -> {}",
547            //         String::from_utf8(i.to_vec()).unwrap(),
548            //         hash
549            //     );
550            // }
551
552            // for i in rc.kmers(k) {
553            //     let hash = t1ha::t1ha2_atonce(i, seed);
554            //     // hash_cpu.push(hash);
555            //     println!(
556            //         "rev: {:?} -> {}",
557            //         String::from_utf8(i.to_vec()).unwrap(),
558            //         hash
559            //     );
560            // }
561
562            for (_, kmer, can) in norm_seq.canonical_kmers(k, &rc) {
563                let hash = t1ha::t1ha2_atonce(kmer, seed);
564
565                if hash < u64::MAX / scaled {
566                    hash_cpu.push(hash);
567                    // println!(
568                    //     "can: {:?}, {} -> {}",
569                    //     String::from_utf8(kmer.to_vec()).unwrap(),
570                    //     can,
571                    //     hash
572                    // );
573                }
574            }
575        }
576
577        hash_cpu.sort();
578        hash_cpu.dedup();
579        // println!("{:?}", hash_cpu);
580
581        let p = "./test2".to_string();
582        let hash_gpu =
583            sketch_cuda::cuda_t1ha2_hash_parallel(&p, k as usize, canonical, scaled, seed);
584        // println!("{:?}", hash_gpu);
585
586        let mut hash_gpu: Vec<u64> = hash_gpu[0].clone().into_iter().collect();
587        hash_gpu.sort();
588        assert_eq!(hash_cpu, hash_gpu);
589    }
590
591    #[test]
592    fn test_set_vec_dedup() {
593        use rand::Rng;
594
595        let n = 100000;
596        let mut rng = rand::thread_rng();
597        let mut vals: Vec<u64> = (0..n).map(|_| rng.gen_range(0..5000)).collect();
598
599        let start = Instant::now();
600        vals.sort();
601        vals.dedup();
602        println!("dedup time = {:?}", start.elapsed());
603
604        let vals: Vec<u64> = (0..n).map(|_| rng.gen_range(0..5000)).collect();
605        let start = Instant::now();
606        let mut sketch_kmer_set = HashSet::<u64>::default();
607        for h in vals {
608            if h != 0 {
609                sketch_kmer_set.insert(h);
610            }
611        }
612        println!("set time = {:?}", start.elapsed());
613    }
614}