//! hyper_gen/sketch.rs
1use log::info;
2use std::collections::HashSet;
3use std::path::PathBuf;
4
5use needletail::{parse_fastx_file, Sequence};
6use rayon::prelude::*;
7
8use crate::types::*;
9use crate::{dist, hd, utils};
10
11#[cfg(target_arch = "x86_64")]
12pub fn sketch(params: SketchParams) {
13 let files = utils::get_fasta_files(¶ms.path);
14 let n_file = files.len();
15
16 info!("Start sketching...");
17 let pb = utils::get_progress_bar(n_file);
18
19 let mut all_filesketch: Vec<FileSketch> = (0..n_file)
20 .into_iter()
21 .map(|i| FileSketch {
22 ksize: params.ksize,
23 scaled: params.scaled,
24 seed: params.seed,
25 canonical: params.canonical,
26 hv_d: params.hv_d,
27 hv_quant_bits: 16 as u8,
28 hv_norm_2: 0 as i32,
29 file_str: files[i].display().to_string(),
30 hv: Vec::<i16>::new(),
31 })
32 .collect();
33
34 // Parallel sketching
35 all_filesketch.par_iter_mut().for_each(|sketch| {
36 // Extract kmer set from genome sequence
37 let kmer_hash_set = extract_kmer_hash(&sketch);
38
39 // Encode extracted kmer hash into sketch HV
40 let hv = if is_x86_feature_detected!("avx2") {
41 unsafe { hd::encode_hash_hd_avx2(&kmer_hash_set, &sketch) }
42 } else {
43 hd::encode_hash_hd(&kmer_hash_set, &sketch)
44 };
45
46 // Pre-compute HV's norm
47 sketch.hv_norm_2 = dist::compute_hv_l2_norm(&hv);
48
49 // Sketch HV compression
50 if params.if_compressed {
51 sketch.hv_quant_bits = unsafe { hd::compress_hd_sketch(sketch, &hv) };
52 }
53
54 pb.inc(1);
55 pb.eta();
56 });
57
58 pb.finish_and_clear();
59
60 info!(
61 "Sketching {} files took {:.2}s - Speed: {:.1} files/s",
62 n_file,
63 pb.elapsed().as_secs_f32(),
64 pb.per_sec()
65 );
66
67 // Dump sketch file
68 utils::dump_sketch(&all_filesketch, ¶ms.out_file);
69}
70
71fn extract_kmer_hash(sketch: &FileSketch) -> HashSet<u64> {
72 let ksize = sketch.ksize;
73 let threshold = u64::MAX / sketch.scaled;
74 let seed = sketch.seed;
75
76 let mut fastx_reader = parse_fastx_file(PathBuf::from(sketch.file_str.clone()))
77 .expect("Opening .fna files failed");
78
79 let mut hash_set = HashSet::<u64>::new();
80 while let Some(record) = fastx_reader.next() {
81 let seqrec: needletail::parser::SequenceRecord<'_> = record.expect("invalid record");
82
83 // normalize to make sure all the bases are consistently capitalized
84 let norm_seq = seqrec.normalize(false);
85
86 // we make a reverse complemented copy of the sequence
87 let rc = norm_seq.reverse_complement();
88
89 for (_, kmer, _) in norm_seq.canonical_kmers(ksize, &rc) {
90 let h = t1ha::t1ha2_atonce(kmer, seed);
91
92 if h < threshold {
93 hash_set.insert(h);
94 }
95 }
96 }
97 hash_set
98}
99
// NOTE: experimental bit-kmer AVX2 extraction below — disabled, kept for reference.
// #[cfg(target_arch = "x86_64")]
101// unsafe fn extract_kmer_hash_avx2(file: PathBuf, sketch: &mut Sketch) {
102// let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
103
104// while let Some(record) = fastx_reader.next() {
105// let seqrec: needletail::parser::SequenceRecord<'_> = record.expect("invalid record");
106
107// // normalize to make sure all the bases are consistently capitalized
108// let norm_seq = seqrec.normalize(false);
109
110// let mut bitkmer_array: [i64; 4] = [0, 0, 0, 0];
111// let mut bitkmer_m256: __m256i;
112// let mut cnt: usize = 0;
113// for (_, (bit_kmer_u64, _), _) in norm_seq.bit_kmers(sketch.ksize, true) {
114// if cnt < 4 {
115// bitkmer_array[cnt] = bit_kmer_u64 as i64;
116// cnt += 1;
117// } else {
118// bitkmer_m256 = _mm256_set_epi64x(
119// bitkmer_array[0],
120// bitkmer_array[1],
121// bitkmer_array[2],
122// bitkmer_array[3],
123// );
124// sketch.insert_kmer_u64_avx2(bitkmer_m256);
125// cnt = 0;
126// }
127// }
128
129// if cnt > 0 {
130// bitkmer_m256 = match cnt {
131// 1 => _mm256_set_epi64x(bitkmer_array[0], 0, 0, 0),
132// 2 => _mm256_set_epi64x(bitkmer_array[0], bitkmer_array[1], 0, 0),
133// 3 => _mm256_set_epi64x(bitkmer_array[0], bitkmer_array[1], bitkmer_array[2], 0),
134// _ => _mm256_set_epi64x(
135// bitkmer_array[0],
136// bitkmer_array[1],
137// bitkmer_array[2],
138// bitkmer_array[3],
139// ),
140// };
141// sketch.insert_kmer_u64_avx2(bitkmer_m256);
142// }
143// }
144// }