1use crate::types::*;
2
3#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
4use {
5 crate::{dist, fastx_reader, hd, utils},
6 cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig},
7 cudarc::nvrtc::Ptx,
8 glob::glob,
9 log::info,
10 rayon::prelude::*,
11 std::cmp::max,
12 std::collections::HashSet,
13 std::path::Path,
14 std::path::PathBuf,
15 std::sync::Arc,
16 std::time::Instant,
17};
18
19#[cfg(any(feature = "cuda-sketch-volta",feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
20const CUDA_KERNEL_MY_STRUCT: &str = include_str!(concat!(env!("OUT_DIR"), "/cuda_kmer_hash.ptx"));
21
22#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
23const SEQ_NT4_TABLE: [u8; 256] = [
24 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
25 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
26 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
27 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
28 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
29 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
30 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
31 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
32];
33
34#[allow(unused_variables)]
35#[cfg(not(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper")))]
36pub fn sketch_cuda(params: SketchParams) {
37 use log::error;
38
39 error!("Cuda sketching is not supported. Please add `--features cuda-sketch-ada-lovelace/cuda-sketch-ampere/cuda-sketch-hopper/cuda-sketch-volta` for installation to enable it.");
40}
41
42#[cfg(all(target_arch = "x86_64", any(feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper", feature = "cuda-sketch-volta")))]
44pub fn sketch_cuda(params: SketchParams) {
45 let files = utils::get_fasta_files(¶ms.path);
46 let n_file = files.len();
47
48 info!("Start GPU sketching...");
49 let pb = utils::get_progress_bar(n_file);
50
51 let gpu = CudaDevice::new(0).unwrap();
53 let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
55 gpu.load_ptx(
56 ptx,
57 "cuda_kernel",
58 &["cuda_kmer_bit_pack_mmhash", "cuda_kmer_t1ha2"],
59 )
60 .unwrap();
61 let gpu = &gpu;
62
63 let mut all_filesketch: Vec<FileSketch> = (0..n_file)
64 .into_iter()
65 .map(|i| FileSketch {
66 ksize: params.ksize,
67 scaled: params.scaled,
68 seed: params.seed,
69 canonical: params.canonical,
70 hv_d: params.hv_d,
71 hv_quant_bits: 16 as u8,
72 hv_norm_2: 0 as i32,
73 file_str: files[i].display().to_string(),
74 hv: Vec::<i16>::new(),
75 })
76 .collect();
77
78 all_filesketch.par_iter_mut().for_each(|sketch| {
80 gpu.bind_to_thread().unwrap();
83
84 let kmer_hash_set = extract_kmer_t1ha2_cuda(&sketch, gpu);
86
87 let hv = if is_x86_feature_detected!("avx2") {
89 unsafe { hd::encode_hash_hd_avx2(&kmer_hash_set, &sketch) }
90 } else {
91 hd::encode_hash_hd(&kmer_hash_set, &sketch)
92 };
93
94 sketch.hv_norm_2 = dist::compute_hv_l2_norm(&hv);
96
97 if params.if_compressed {
99 sketch.hv_quant_bits = unsafe { hd::compress_hd_sketch(sketch, &hv) };
100 }
101
102 pb.inc(1);
103 pb.eta();
104 });
105
106 pb.finish_and_clear();
107
108 info!(
109 "Sketching {} files took {:.2}s - Speed: {:.1} files/s",
110 files.len(),
111 pb.elapsed().as_secs_f32(),
112 pb.per_sec()
113 );
114
115 utils::dump_sketch(&all_filesketch, ¶ms.out_file);
117}
118
119#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
120fn extract_kmer_t1ha2_cuda(sketch: &FileSketch, gpu: &Arc<CudaDevice>) -> HashSet<u64> {
121 let fna_file = PathBuf::from(sketch.file_str.clone());
122 let fna_seqs = fastx_reader::read_merge_seq(&fna_file);
123
124 let n_bps = fna_seqs.len();
125 let ksize = sketch.ksize as usize;
126 let canonical = sketch.canonical;
127 let scaled = sketch.scaled;
128 let seed = sketch.seed;
129 let n_kmers = n_bps - ksize + 1;
130 let kmer_per_thread = 512;
131 let n_threads = (n_kmers + kmer_per_thread - 1) / kmer_per_thread;
132
133 let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
135 let n_hash_per_thread = max(kmer_per_thread / sketch.scaled as usize * 4, 8);
137 let n_hash_array = n_hash_per_thread * n_threads;
138 let gpu_kmer_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
139
140 let config = LaunchConfig::for_num_elems(n_threads as u32);
142 let params = (
143 &gpu_seq,
144 n_bps,
145 kmer_per_thread,
146 n_hash_per_thread,
147 ksize,
148 u64::MAX / scaled,
149 seed,
150 canonical,
151 &gpu_kmer_hash,
152 );
153 let f = gpu.get_func("cuda_kernel", "cuda_kmer_t1ha2").unwrap();
154 unsafe { f.launch(config, params) }.unwrap();
155
156 let host_kmer_hash = gpu.sync_reclaim(gpu_kmer_hash).unwrap();
157
158 let mut kmer_hash_set = HashSet::<u64>::new();
159 for h in host_kmer_hash {
160 if h != 0 {
161 kmer_hash_set.insert(h);
162 }
163 }
164
165 kmer_hash_set
166}
167
168#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
169pub fn cuda_mmhash_bitpack_parallel(
170 path_fna: &String,
171 ksize: usize,
172 canonical: bool,
173 scaled: u64,
174) -> Vec<HashSet<u64>> {
175 let files = utils::get_fasta_files(&PathBuf::from(path_fna));
177 let n_file = files.len();
178 let pb = utils::get_progress_bar(n_file);
179
180 let gpu = CudaDevice::new(0).unwrap();
182
183 let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
185 gpu.load_ptx(ptx, "cuda_kernel", &["cuda_kmer_bit_pack_mmhash"])
186 .unwrap();
187 let gpu = &gpu;
188
189 let index_vec: Vec<usize> = (0..files.len()).collect();
191 let sketch_kmer_sets: Vec<HashSet<u64>> = index_vec
192 .par_iter()
193 .map(|&i| {
194 gpu.bind_to_thread().unwrap();
197
198 let fna_seqs = fastx_reader::read_merge_seq(&files[i]);
201
202 let n_bps = fna_seqs.len();
205 let n_kmers = n_bps - ksize + 1;
206 let bp_per_thread = 512;
207 let n_threads = (n_kmers + bp_per_thread - 1) / bp_per_thread;
208
209 let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
210 let gpu_seq_nt4_table = gpu.htod_copy(SEQ_NT4_TABLE.to_vec()).unwrap();
211 let n_hash_per_thread = max(bp_per_thread / scaled as usize * 3, 8);
213 let n_hash_array = n_hash_per_thread * n_threads;
214 let gpu_kmer_bit_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
215
216 let config = LaunchConfig::for_num_elems(n_threads as u32);
217 let params = (
218 &gpu_seq,
219 n_bps,
220 bp_per_thread,
221 n_hash_per_thread,
222 ksize,
223 u64::MAX / scaled,
224 canonical,
225 &gpu_seq_nt4_table,
226 &gpu_kmer_bit_hash,
227 );
228 let f = gpu
229 .get_func("cuda_kernel", "cuda_kmer_bit_pack_mmhash")
230 .unwrap();
231 unsafe { f.clone().launch(config, params) }.unwrap();
232
233 let host_kmer_bit_hash = gpu.sync_reclaim(gpu_kmer_bit_hash).unwrap();
234
235 let mut sketch_kmer_set = HashSet::<u64>::default();
236 for h in host_kmer_bit_hash {
237 if h != 0 {
238 sketch_kmer_set.insert(h);
239 }
240 }
241
242 pb.inc(1);
243 pb.eta();
244 sketch_kmer_set
245 })
246 .collect();
247
248 pb.finish_and_clear();
249
250 sketch_kmer_sets
251}
252
253#[cfg(any(feature = "cuda-sketch-volta", feature = "cuda-sketch-ada-lovelace", feature = "cuda-sketch-ampere", feature = "cuda-sketch-hopper"))]
254pub fn cuda_t1ha2_hash_parallel(
255 path_fna: &String,
256 ksize: usize,
257 canonical: bool,
258 scaled: u64,
259 seed: u64,
260) -> Vec<HashSet<u64>> {
261 let files: Vec<_> = glob(Path::new(&path_fna).join("*.fna").to_str().unwrap())
263 .expect("Failed to read glob pattern")
264 .collect();
265
266 let n_file = files.len();
267 let pb = utils::get_progress_bar(n_file);
268
269 let gpu = CudaDevice::new(0).unwrap();
271
272 let ptx = Ptx::from_src(CUDA_KERNEL_MY_STRUCT);
274 gpu.load_ptx(ptx, "cuda_kernel", &["cuda_kmer_t1ha2"])
275 .unwrap();
276 let gpu = &gpu;
277
278 let index_vec: Vec<usize> = (0..files.len()).collect();
280 let sketch_kmer_sets: Vec<HashSet<u64>> = index_vec
281 .par_iter()
282 .map(|i| {
283 gpu.bind_to_thread().unwrap();
286
287 let now = Instant::now();
288
289 let fna_seqs = fastx_reader::read_merge_seq(files[*i].as_ref().unwrap());
290
291 println!("Time taken to extract seq: {:.2?}", now.elapsed());
292
293 let n_bps = fna_seqs.len();
294 let n_kmers = n_bps - ksize + 1;
295 let kmer_per_thread = 512;
296 let n_threads = (n_kmers + kmer_per_thread - 1) / kmer_per_thread;
297
298 let now = Instant::now();
300 let gpu_seq = gpu.htod_copy(fna_seqs).unwrap();
301 let n_hash_per_thread = max(kmer_per_thread / scaled as usize * 3, 8);
303 let n_hash_array = n_hash_per_thread * n_threads;
304 let gpu_kmer_hash = gpu.alloc_zeros::<u64>(n_hash_array).unwrap();
305
306 println!("Time taken to copy to gpu: {:.2?}", now.elapsed());
307
308 let now = Instant::now();
310
311 let config = LaunchConfig::for_num_elems(n_threads as u32);
312 let params = (
313 &gpu_seq,
314 n_bps,
315 kmer_per_thread,
316 n_hash_per_thread,
317 ksize,
318 u64::MAX / scaled,
319 seed,
320 canonical,
321 &gpu_kmer_hash,
322 );
323 let f = gpu.get_func("cuda_kernel", "cuda_kmer_t1ha2").unwrap();
324 unsafe { f.launch(config, params) }.unwrap();
325
326 let host_kmer_hash = gpu.sync_reclaim(gpu_kmer_hash).unwrap();
327
328 println!("Time taken to run kernel: {:.2?}", now.elapsed());
329 let now = Instant::now();
330
331 let mut sketch_kmer_set = HashSet::<u64>::default();
332 for h in host_kmer_hash {
333 if h != 0 {
334 sketch_kmer_set.insert(h);
335 }
336 }
337
338 println!("Time taken to postprocess: {:.2?}", now.elapsed());
339 pb.inc(1);
340 pb.eta();
341 sketch_kmer_set
342 })
343 .collect();
344
345 pb.finish_and_clear();
346
347 sketch_kmer_sets
348}