hyper_gen/
sketch.rs

1use log::info;
2use std::collections::HashSet;
3use std::path::PathBuf;
4
5use needletail::{parse_fastx_file, Sequence};
6use rayon::prelude::*;
7
8use crate::types::*;
9use crate::{dist, hd, utils};
10
11#[cfg(target_arch = "x86_64")]
12pub fn sketch(params: SketchParams) {
13    let files = utils::get_fasta_files(&params.path);
14    let n_file = files.len();
15
16    info!("Start sketching...");
17    let pb = utils::get_progress_bar(n_file);
18
19    let mut all_filesketch: Vec<FileSketch> = (0..n_file)
20        .into_iter()
21        .map(|i| FileSketch {
22            ksize: params.ksize,
23            scaled: params.scaled,
24            seed: params.seed,
25            canonical: params.canonical,
26            hv_d: params.hv_d,
27            hv_quant_bits: 16 as u8,
28            hv_norm_2: 0 as i32,
29            file_str: files[i].display().to_string(),
30            hv: Vec::<i16>::new(),
31        })
32        .collect();
33
34    // Parallel sketching
35    all_filesketch.par_iter_mut().for_each(|sketch| {
36        // Extract kmer set from genome sequence
37        let kmer_hash_set = extract_kmer_hash(&sketch);
38
39        // Encode extracted kmer hash into sketch HV
40        let hv = if is_x86_feature_detected!("avx2") {
41            unsafe { hd::encode_hash_hd_avx2(&kmer_hash_set, &sketch) }
42        } else {
43            hd::encode_hash_hd(&kmer_hash_set, &sketch)
44        };
45
46        // Pre-compute HV's norm
47        sketch.hv_norm_2 = dist::compute_hv_l2_norm(&hv);
48
49        // Sketch HV compression
50        if params.if_compressed {
51            sketch.hv_quant_bits = unsafe { hd::compress_hd_sketch(sketch, &hv) };
52        }
53
54        pb.inc(1);
55        pb.eta();
56    });
57
58    pb.finish_and_clear();
59
60    info!(
61        "Sketching {} files took {:.2}s - Speed: {:.1} files/s",
62        n_file,
63        pb.elapsed().as_secs_f32(),
64        pb.per_sec()
65    );
66
67    // Dump sketch file
68    utils::dump_sketch(&all_filesketch, &params.out_file);
69}
70
71fn extract_kmer_hash(sketch: &FileSketch) -> HashSet<u64> {
72    let ksize = sketch.ksize;
73    let threshold = u64::MAX / sketch.scaled;
74    let seed = sketch.seed;
75
76    let mut fastx_reader = parse_fastx_file(PathBuf::from(sketch.file_str.clone()))
77        .expect("Opening .fna files failed");
78
79    let mut hash_set = HashSet::<u64>::new();
80    while let Some(record) = fastx_reader.next() {
81        let seqrec: needletail::parser::SequenceRecord<'_> = record.expect("invalid record");
82
83        // normalize to make sure all the bases are consistently capitalized
84        let norm_seq = seqrec.normalize(false);
85
86        // we make a reverse complemented copy of the sequence
87        let rc = norm_seq.reverse_complement();
88
89        for (_, kmer, _) in norm_seq.canonical_kmers(ksize, &rc) {
90            let h = t1ha::t1ha2_atonce(kmer, seed);
91
92            if h < threshold {
93                hash_set.insert(h);
94            }
95        }
96    }
97    hash_set
98}
99
100// #[cfg(target_arch = "x86_64")]
101// unsafe fn extract_kmer_hash_avx2(file: PathBuf, sketch: &mut Sketch) {
102//     let mut fastx_reader = parse_fastx_file(&file).expect("Opening .fna files failed");
103
104//     while let Some(record) = fastx_reader.next() {
105//         let seqrec: needletail::parser::SequenceRecord<'_> = record.expect("invalid record");
106
107//         // normalize to make sure all the bases are consistently capitalized
108//         let norm_seq = seqrec.normalize(false);
109
110//         let mut bitkmer_array: [i64; 4] = [0, 0, 0, 0];
111//         let mut bitkmer_m256: __m256i;
112//         let mut cnt: usize = 0;
113//         for (_, (bit_kmer_u64, _), _) in norm_seq.bit_kmers(sketch.ksize, true) {
114//             if cnt < 4 {
115//                 bitkmer_array[cnt] = bit_kmer_u64 as i64;
116//                 cnt += 1;
117//             } else {
118//                 bitkmer_m256 = _mm256_set_epi64x(
119//                     bitkmer_array[0],
120//                     bitkmer_array[1],
121//                     bitkmer_array[2],
122//                     bitkmer_array[3],
123//                 );
124//                 sketch.insert_kmer_u64_avx2(bitkmer_m256);
125//                 cnt = 0;
126//             }
127//         }
128
129//         if cnt > 0 {
130//             bitkmer_m256 = match cnt {
131//                 1 => _mm256_set_epi64x(bitkmer_array[0], 0, 0, 0),
132//                 2 => _mm256_set_epi64x(bitkmer_array[0], bitkmer_array[1], 0, 0),
133//                 3 => _mm256_set_epi64x(bitkmer_array[0], bitkmer_array[1], bitkmer_array[2], 0),
134//                 _ => _mm256_set_epi64x(
135//                     bitkmer_array[0],
136//                     bitkmer_array[1],
137//                     bitkmer_array[2],
138//                     bitkmer_array[3],
139//                 ),
140//             };
141//             sketch.insert_kmer_u64_avx2(bitkmer_m256);
142//         }
143//     }
144// }