hyper_gen/
sketch_cuda.rs

1use crate::types::*;
2
3#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
4use {
5    crate::{dist, fastx_reader, hd, utils},
6    cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig},
7    cudarc::nvrtc::Ptx,
8    glob::glob,
9    log::info,
10    rayon::prelude::*,
11    std::cmp::max,
12    std::collections::HashSet,
13    std::path::Path,
14    std::path::PathBuf,
15    std::sync::Arc,
16    std::time::Instant,
17};
18
19#[cfg(any(feature = "cuda-sketch-volta",feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
20const CUDA_KERNEL_MY_STRUCT: &str = include_str!(concat!(env!("OUT_DIR"), "/cuda_kmer_hash.ptx"));
21
22#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
23const SEQ_NT4_TABLE: [u8; 256] = [
24    0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
25    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
26    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
27    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
28    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
29    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
30    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
31    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
32];
33
34#[allow(unused_variables)]
35#[cfg(not(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper")))]
36pub fn sketch_cuda(params: SketchParams) {
37    use log::error;
38
39    error!("Cuda sketching is not supported. Please add `--features cuda-sketch-ada-lovelace/cuda-sketch-ampere/cuda-sketch-hopper/cuda-sketch-volta` for installation to enable it.");
40}
41
42//  Sketch function to sketch all .fna files in folder path
43#[cfg(all(target_arch = "x86_64", any(feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper", feature = "cuda-sketch-volta")))]
44pub fn sketch_cuda(params: SketchParams) {
45    let files = utils::get_fasta_files(&params.path);
46    let n_file = files.len();
47
48    info!("Start GPU sketching...");
49    let pb = utils::get_progress_bar(n_file);
50
51    // setup GPU device
52    let gpu = CudaDevice::new(0).unwrap();
53    // compile ptx
54    let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
55    gpu.load_ptx(
56        ptx,
57        "cuda_kernel",
58        &["cuda_kmer_bit_pack_mmhash", "cuda_kmer_t1ha2"],
59    )
60    .unwrap();
61    let gpu = &gpu;
62
63    let mut all_filesketch: Vec<FileSketch> = (0..n_file)
64        .into_iter()
65        .map(|i| FileSketch {
66            ksize: params.ksize,
67            scaled: params.scaled,
68            seed: params.seed,
69            canonical: params.canonical,
70            hv_d: params.hv_d,
71            hv_quant_bits: 16 as u8,
72            hv_norm_2: 0 as i32,
73            file_str: files[i].display().to_string(),
74            hv: Vec::<i16>::new(),
75        })
76        .collect();
77
78    // start cuda sketching
79    all_filesketch.par_iter_mut().for_each(|sketch| {
80        // NOTE: this is the important call to have
81        // without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
82        gpu.bind_to_thread().unwrap();
83
84        // Extract kmer set from genome sequence
85        let kmer_hash_set = extract_kmer_t1ha2_cuda(&sketch, gpu);
86
87        // Encode extracted kmer hash into sketch HV
88        let hv = if is_x86_feature_detected!("avx2") {
89            unsafe { hd::encode_hash_hd_avx2(&kmer_hash_set, &sketch) }
90        } else {
91            hd::encode_hash_hd(&kmer_hash_set, &sketch)
92        };
93
94        // Pre-compute HV's norm
95        sketch.hv_norm_2 = dist::compute_hv_l2_norm(&hv);
96
97        // Sketch HV compression
98        if params.if_compressed {
99            sketch.hv_quant_bits = unsafe { hd::compress_hd_sketch(sketch, &hv) };
100        }
101
102        pb.inc(1);
103        pb.eta();
104    });
105
106    pb.finish_and_clear();
107
108    info!(
109        "Sketching {} files took {:.2}s - Speed: {:.1} files/s",
110        files.len(),
111        pb.elapsed().as_secs_f32(),
112        pb.per_sec()
113    );
114
115    // Dump sketch file
116    utils::dump_sketch(&all_filesketch, &params.out_file);
117}
118
119#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
120fn extract_kmer_t1ha2_cuda(sketch: &FileSketch, gpu: &Arc<CudaDevice>) -> HashSet<u64> {
121    let fna_file = PathBuf::from(sketch.file_str.clone());
122    let fna_seqs = fastx_reader::read_merge_seq(&fna_file);
123
124    let n_bps = fna_seqs.len();
125    let ksize = sketch.ksize as usize;
126    let canonical = sketch.canonical;
127    let scaled = sketch.scaled;
128    let seed = sketch.seed;
129    let n_kmers = n_bps - ksize + 1;
130    let kmer_per_thread = 512;
131    let n_threads = (n_kmers + kmer_per_thread - 1) / kmer_per_thread;
132
133    // copy to GPU
134    let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
135    // allocate 4x more space that expected
136    let n_hash_per_thread = max(kmer_per_thread / sketch.scaled as usize * 4, 8);
137    let n_hash_array = n_hash_per_thread * n_threads;
138    let gpu_kmer_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
139
140    // execute kernel
141    let config = LaunchConfig::for_num_elems(n_threads as u32);
142    let params = (
143        &gpu_seq,
144        n_bps,
145        kmer_per_thread,
146        n_hash_per_thread,
147        ksize,
148        u64::MAX / scaled,
149        seed,
150        canonical,
151        &gpu_kmer_hash,
152    );
153    let f = gpu.get_func("cuda_kernel", "cuda_kmer_t1ha2").unwrap();
154    unsafe { f.launch(config, params) }.unwrap();
155
156    let host_kmer_hash = gpu.sync_reclaim(gpu_kmer_hash).unwrap();
157
158    let mut kmer_hash_set = HashSet::<u64>::new();
159    for h in host_kmer_hash {
160        if h != 0 {
161            kmer_hash_set.insert(h);
162        }
163    }
164
165    kmer_hash_set
166}
167
168#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
169pub fn cuda_mmhash_bitpack_parallel(
170    path_fna: &String,
171    ksize: usize,
172    canonical: bool,
173    scaled: u64,
174) -> Vec<HashSet<u64>> {
175    // get files
176    let files = utils::get_fasta_files(&PathBuf::from(path_fna));
177    let n_file = files.len();
178    let pb = utils::get_progress_bar(n_file);
179
180    // setup GPU device
181    let gpu = CudaDevice::new(0).unwrap();
182
183    // compile ptx
184    let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
185    gpu.load_ptx(ptx, "cuda_kernel", &["cuda_kmer_bit_pack_mmhash"])
186        .unwrap();
187    let gpu = &gpu;
188
189    // start sketching
190    let index_vec: Vec<usize> = (0..files.len()).collect();
191    let sketch_kmer_sets: Vec<HashSet<u64>> = index_vec
192        .par_iter()
193        .map(|&i| {
194            // NOTE: this is the important call to have
195            // without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
196            gpu.bind_to_thread().unwrap();
197
198            // let now = Instant::now();
199
200            let fna_seqs = fastx_reader::read_merge_seq(&files[i]);
201
202            // println!("Time taken to extract seq: {:.2?}", now.elapsed());
203
204            let n_bps = fna_seqs.len();
205            let n_kmers = n_bps - ksize + 1;
206            let bp_per_thread = 512;
207            let n_threads = (n_kmers + bp_per_thread - 1) / bp_per_thread;
208
209            let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
210            let gpu_seq_nt4_table = gpu.htod_copy(SEQ_NT4_TABLE.to_vec()).unwrap();
211            // allocate 4x more space that expected
212            let n_hash_per_thread = max(bp_per_thread / scaled as usize * 3, 8);
213            let n_hash_array = n_hash_per_thread * n_threads;
214            let gpu_kmer_bit_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
215
216            let config = LaunchConfig::for_num_elems(n_threads as u32);
217            let params = (
218                &gpu_seq,
219                n_bps,
220                bp_per_thread,
221                n_hash_per_thread,
222                ksize,
223                u64::MAX / scaled,
224                canonical,
225                &gpu_seq_nt4_table,
226                &gpu_kmer_bit_hash,
227            );
228            let f = gpu
229                .get_func("cuda_kernel", "cuda_kmer_bit_pack_mmhash")
230                .unwrap();
231            unsafe { f.clone().launch(config, params) }.unwrap();
232
233            let host_kmer_bit_hash = gpu.sync_reclaim(gpu_kmer_bit_hash).unwrap();
234
235            let mut sketch_kmer_set = HashSet::<u64>::default();
236            for h in host_kmer_bit_hash {
237                if h != 0 {
238                    sketch_kmer_set.insert(h);
239                }
240            }
241
242            pb.inc(1);
243            pb.eta();
244            sketch_kmer_set
245        })
246        .collect();
247
248    pb.finish_and_clear();
249
250    sketch_kmer_sets
251}
252
253#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
254pub fn cuda_t1ha2_hash_parallel(
255    path_fna: &String,
256    ksize: usize,
257    canonical: bool,
258    scaled: u64,
259    seed: u64,
260) -> Vec<HashSet<u64>> {
261    // get files
262    let files: Vec<_> = glob(Path::new(&path_fna).join("*.fna").to_str().unwrap())
263        .expect("Failed to read glob pattern")
264        .collect();
265
266    let n_file = files.len();
267    let pb = utils::get_progress_bar(n_file);
268
269    // setup GPU device
270    let gpu = CudaDevice::new(0).unwrap();
271
272    // compile ptx
273    let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
274    gpu.load_ptx(ptx, "cuda_kernel", &["cuda_kmer_t1ha2"])
275        .unwrap();
276    let gpu = &gpu;
277
278    // start sketching
279    let index_vec: Vec<usize> = (0..files.len()).collect();
280    let sketch_kmer_sets: Vec<HashSet<u64>> = index_vec
281        .par_iter()
282        .map(|i| {
283            // NOTE: this is the important call to have
284            // without this, you'll get a CUDA_ERROR_INVALID_CONTEXT
285            gpu.bind_to_thread().unwrap();
286
287            let now = Instant::now();
288
289            let fna_seqs = fastx_reader::read_merge_seq(files[*i].as_ref().unwrap());
290
291            println!("Time taken to extract seq: {:.2?}", now.elapsed());
292
293            let n_bps = fna_seqs.len();
294            let n_kmers = n_bps - ksize + 1;
295            let kmer_per_thread = 512;
296            let n_threads = (n_kmers + kmer_per_thread - 1) / kmer_per_thread;
297
298            // copy to GPU
299            let now = Instant::now();
300            let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
301            // allocate 4x more space that expected
302            let n_hash_per_thread = max(kmer_per_thread / scaled as usize * 3, 8);
303            let n_hash_array = n_hash_per_thread * n_threads;
304            let gpu_kmer_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
305
306            println!("Time taken to copy to gpu: {:.2?}", now.elapsed());
307
308            // execute kernel
309            let now = Instant::now();
310
311            let config = LaunchConfig::for_num_elems(n_threads as u32);
312            let params = (
313                &gpu_seq,
314                n_bps,
315                kmer_per_thread,
316                n_hash_per_thread,
317                ksize,
318                u64::MAX / scaled,
319                seed,
320                canonical,
321                &gpu_kmer_hash,
322            );
323            let f = gpu.get_func("cuda_kernel", "cuda_kmer_t1ha2").unwrap();
324            unsafe { f.launch(config, params) }.unwrap();
325
326            let host_kmer_hash = gpu.sync_reclaim(gpu_kmer_hash).unwrap();
327
328            println!("Time taken to run kernel: {:.2?}", now.elapsed());
329            let now = Instant::now();
330
331            let mut sketch_kmer_set = HashSet::<u64>::default();
332            for h in host_kmer_hash {
333                if h != 0 {
334                    sketch_kmer_set.insert(h);
335                }
336            }
337
338            println!("Time taken to postprocess: {:.2?}", now.elapsed());
339            pb.inc(1);
340            pb.eta();
341            sketch_kmer_set
342        })
343        .collect();
344
345    pb.finish_and_clear();
346
347    sketch_kmer_sets
348}