1use crate::error::{FFTError, FFTResult};
9use crate::sparse_fft::{SparseFFTAlgorithm, WindowFunction};
10use scirs2_core::numeric::Complex64;
11use scirs2_core::numeric::NumCast;
12use scirs2_core::simd_ops::PlatformCapabilities;
13use std::fmt::Debug;
14
/// Launch parameters for a single kernel invocation.
///
/// This is plain data: nothing in the type validates the values against a
/// particular device. `PartialEq`/`Eq` are derived so that configurations can
/// be compared directly (for example in `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelConfig {
    /// Number of threads per block.
    pub block_size: usize,
    /// Number of blocks in the launch grid.
    pub grid_size: usize,
    /// Shared memory requested per block (the values used in this file are
    /// multiples of 1024, so presumably bytes).
    pub shared_memory_size: usize,
    /// Whether mixed-precision arithmetic is enabled for the kernel.
    pub use_mixed_precision: bool,
    /// Register budget per thread.
    pub registers_per_thread: usize,
    /// Whether tensor cores are enabled for the kernel.
    pub use_tensor_cores: bool,
}
31
32impl Default for KernelConfig {
33 fn default() -> Self {
34 Self {
35 block_size: 256,
36 grid_size: 0, shared_memory_size: 16 * 1024, use_mixed_precision: false,
39 registers_per_thread: 32,
40 use_tensor_cores: false,
41 }
42 }
43}
44
/// Implementation flavour a kernel can be tuned for.
///
/// Within this file the value is only produced by
/// `SparseFFTKernel::get_algorithm_implementation`, which maps each sparse FFT
/// algorithm to one of these variants.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum KernelImplementation {
    /// Favour throughput.
    Throughput,
    /// Favour latency.
    Latency,
    /// Favour a small memory footprint.
    MemoryEfficient,
    /// Favour numerical accuracy.
    HighAccuracy,
    /// Favour low power draw.
    PowerEfficient,
}
59
/// Figures reported after a kernel run.
///
/// Every producer of this type visible in this file fills it with fixed or
/// size-proportional placeholder numbers rather than measured values.
#[derive(Clone, Debug)]
pub struct KernelStats {
    /// Execution time in milliseconds.
    pub execution_time_ms: f64,
    /// Memory bandwidth in GB/s.
    pub memory_bandwidth_gb_s: f64,
    /// Compute throughput in GFLOP/s.
    pub compute_throughput_gflops: f64,
    /// Bytes copied to the device.
    pub bytes_transferred_to_device: usize,
    /// Bytes copied back from the device.
    pub bytes_transferred_from_device: usize,
    /// Occupancy as a percentage.
    pub occupancy_percent: f64,
}
76
77pub trait GPUKernel {
79 fn name(&self) -> &str;
81
82 fn config(&self) -> &KernelConfig;
84
85 fn set_config(&mut self, config: KernelConfig);
87
88 fn execute(&self) -> FFTResult<KernelStats>;
90}
91
92#[derive(Debug)]
94pub struct FFTKernel {
95 config: KernelConfig,
97 input_size: usize,
99 #[allow(dead_code)]
101 input_address: usize,
102 #[allow(dead_code)]
104 output_address: usize,
105}
106
107impl FFTKernel {
108 pub fn new(input_size: usize, input_address: usize, outputaddress: usize) -> Self {
110 let mut config = KernelConfig::default();
111 config.grid_size = input_size.div_ceil(config.block_size);
113
114 Self {
115 config,
116 input_size,
117 input_address,
118 output_address: outputaddress,
119 }
120 }
121}
122
123impl GPUKernel for FFTKernel {
124 fn name(&self) -> &str {
125 "FFT_Kernel"
126 }
127
128 fn config(&self) -> &KernelConfig {
129 &self.config
130 }
131
132 fn set_config(&mut self, config: KernelConfig) {
133 self.config = config;
134 }
135
136 fn execute(&self) -> FFTResult<KernelStats> {
137 let execution_time_ms = self.input_size as f64 * 0.001;
142
143 let stats = KernelStats {
145 execution_time_ms,
146 memory_bandwidth_gb_s: 500.0,
147 compute_throughput_gflops: 10000.0,
148 bytes_transferred_to_device: self.input_size * std::mem::size_of::<Complex64>(),
149 bytes_transferred_from_device: self.input_size * std::mem::size_of::<Complex64>(),
150 occupancy_percent: 80.0,
151 };
152
153 Ok(stats)
154 }
155}
156
157#[derive(Debug)]
159pub struct SparseFFTKernel {
160 config: KernelConfig,
162 input_size: usize,
164 sparsity: usize,
166 #[allow(dead_code)]
168 input_address: usize,
169 #[allow(dead_code)]
171 output_values_address: usize,
172 #[allow(dead_code)]
174 output_indices_address: usize,
175 algorithm: SparseFFTAlgorithm,
177 window_function: WindowFunction,
179}
180
181impl SparseFFTKernel {
182 #[allow(clippy::too_many_arguments)]
184 pub fn new(
185 input_size: usize,
186 sparsity: usize,
187 input_address: usize,
188 output_values_address: usize,
189 output_indices_address: usize,
190 algorithm: SparseFFTAlgorithm,
191 window_function: WindowFunction,
192 ) -> Self {
193 let mut config = KernelConfig::default();
194 config.grid_size = input_size.div_ceil(config.block_size);
196
197 Self {
198 config,
199 input_size,
200 sparsity,
201 input_address,
202 output_values_address,
203 output_indices_address,
204 algorithm,
205 window_function,
206 }
207 }
208
209 pub fn apply_window(&self) -> FFTResult<KernelStats> {
211 let execution_time_ms = self.input_size as f64 * 0.0001;
214
215 let stats = KernelStats {
216 execution_time_ms,
217 memory_bandwidth_gb_s: 400.0,
218 compute_throughput_gflops: 1000.0,
219 bytes_transferred_to_device: 0,
220 bytes_transferred_from_device: 0,
221 occupancy_percent: 70.0,
222 };
223
224 Ok(stats)
225 }
226
227 pub fn get_algorithm_implementation(&self) -> FFTResult<KernelImplementation> {
229 match self.algorithm {
231 SparseFFTAlgorithm::Sublinear => Ok(KernelImplementation::Throughput),
232 SparseFFTAlgorithm::CompressedSensing => Ok(KernelImplementation::HighAccuracy),
233 SparseFFTAlgorithm::Iterative => Ok(KernelImplementation::Latency),
234 SparseFFTAlgorithm::Deterministic => Ok(KernelImplementation::Throughput),
235 SparseFFTAlgorithm::FrequencyPruning => Ok(KernelImplementation::MemoryEfficient),
236 SparseFFTAlgorithm::SpectralFlatness => Ok(KernelImplementation::HighAccuracy),
237 }
238 }
239}
240
241impl GPUKernel for SparseFFTKernel {
242 fn name(&self) -> &str {
243 "SparseFFT_Kernel"
244 }
245
246 fn config(&self) -> &KernelConfig {
247 &self.config
248 }
249
250 fn set_config(&mut self, config: KernelConfig) {
251 self.config = config;
252 }
253
254 fn execute(&self) -> FFTResult<KernelStats> {
255 let algorithm_factor = match self.algorithm {
260 SparseFFTAlgorithm::Sublinear => 0.8,
261 SparseFFTAlgorithm::CompressedSensing => 1.5,
262 SparseFFTAlgorithm::Iterative => 1.2,
263 SparseFFTAlgorithm::Deterministic => 1.0,
264 SparseFFTAlgorithm::FrequencyPruning => 0.9,
265 SparseFFTAlgorithm::SpectralFlatness => 1.3,
266 };
267
268 let window_factor = match self.window_function {
270 WindowFunction::None => 1.0,
271 WindowFunction::Hann => 1.1,
272 WindowFunction::Hamming => 1.1,
273 WindowFunction::Blackman => 1.2,
274 WindowFunction::FlatTop => 1.3,
275 WindowFunction::Kaiser => 1.4,
276 };
277
278 let execution_time_ms = self.input_size as f64 * algorithm_factor * window_factor * 0.001;
280
281 let stats = KernelStats {
283 execution_time_ms,
284 memory_bandwidth_gb_s: 450.0,
285 compute_throughput_gflops: 9000.0,
286 bytes_transferred_to_device: self.input_size * std::mem::size_of::<Complex64>(),
287 bytes_transferred_from_device: (self.sparsity * 2) * std::mem::size_of::<Complex64>(),
288 occupancy_percent: 75.0,
289 };
290
291 Ok(stats)
292 }
293}
294
/// Builds kernel descriptors whose launch configuration is adapted to one
/// device, described by the limits stored here.
#[derive(Debug)]
pub struct KernelFactory {
    /// Device / architecture name.
    // Only read by the unit tests in this file, hence the allow.
    #[allow(dead_code)]
    arch: String,
    /// `(major, minor)` compute capability pairs; the code in this file only
    /// consults the first entry.
    compute_capabilities: Vec<(i32, i32)>,
    /// Upper bound used by `check_memory_requirements`.
    available_memory: usize,
    /// Cap applied to each kernel's shared memory size.
    shared_memory_per_block: usize,
    /// Thread-per-block limit consulted when choosing a block size.
    max_threads_per_block: usize,
}
310
311impl KernelFactory {
312 pub fn new(
314 arch: String,
315 compute_capabilities: Vec<(i32, i32)>,
316 available_memory: usize,
317 shared_memory_per_block: usize,
318 max_threads_per_block: usize,
319 ) -> Self {
320 Self {
321 arch,
322 compute_capabilities,
323 available_memory,
324 shared_memory_per_block,
325 max_threads_per_block,
326 }
327 }
328
329 pub fn create_fft_kernel(
331 &self,
332 input_size: usize,
333 input_address: usize,
334 output_address: usize,
335 ) -> FFTResult<FFTKernel> {
336 let mut kernel = FFTKernel::new(input_size, input_address, output_address);
337
338 let mut config = KernelConfig::default();
340
341 config.block_size = if self.max_threads_per_block >= 1024 {
343 1024
344 } else if self.max_threads_per_block >= 512 {
345 512
346 } else {
347 256
348 };
349
350 config.grid_size = input_size.div_ceil(config.block_size);
352
353 config.shared_memory_size = std::cmp::min(
355 self.shared_memory_per_block,
356 16 * 1024, );
358
359 if !self.compute_capabilities.is_empty()
361 && (self.compute_capabilities[0].0 >= 7
362 || (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
363 {
364 config.use_mixed_precision = true;
365 }
366
367 if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
369 config.use_tensor_cores = true;
370 }
371
372 kernel.set_config(config);
373 Ok(kernel)
374 }
375
376 #[allow(clippy::too_many_arguments)]
378 pub fn create_sparse_fft_kernel(
379 &self,
380 input_size: usize,
381 sparsity: usize,
382 input_address: usize,
383 output_values_address: usize,
384 output_indices_address: usize,
385 algorithm: SparseFFTAlgorithm,
386 window_function: WindowFunction,
387 ) -> FFTResult<SparseFFTKernel> {
388 let mut kernel = SparseFFTKernel::new(
389 input_size,
390 sparsity,
391 input_address,
392 output_values_address,
393 output_indices_address,
394 algorithm,
395 window_function,
396 );
397
398 let mut config = KernelConfig::default();
400
401 config.block_size = match algorithm {
403 SparseFFTAlgorithm::Sublinear => 256,
404 SparseFFTAlgorithm::CompressedSensing => 512,
405 SparseFFTAlgorithm::Iterative => 128,
406 SparseFFTAlgorithm::Deterministic => 256,
407 SparseFFTAlgorithm::FrequencyPruning => 256,
408 SparseFFTAlgorithm::SpectralFlatness => 512,
409 };
410
411 config.block_size = std::cmp::min(config.block_size, self.max_threads_per_block);
413
414 config.grid_size = input_size.div_ceil(config.block_size);
416
417 config.shared_memory_size = match algorithm {
419 SparseFFTAlgorithm::Sublinear => 16 * 1024,
420 SparseFFTAlgorithm::CompressedSensing => 32 * 1024,
421 SparseFFTAlgorithm::Iterative => 8 * 1024,
422 SparseFFTAlgorithm::Deterministic => 16 * 1024,
423 SparseFFTAlgorithm::FrequencyPruning => 16 * 1024,
424 SparseFFTAlgorithm::SpectralFlatness => 32 * 1024,
425 };
426
427 config.shared_memory_size =
429 std::cmp::min(config.shared_memory_size, self.shared_memory_per_block);
430
431 if !self.compute_capabilities.is_empty()
433 && (self.compute_capabilities[0].0 >= 7
434 || (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
435 {
436 match algorithm {
438 SparseFFTAlgorithm::Sublinear
439 | SparseFFTAlgorithm::Deterministic
440 | SparseFFTAlgorithm::FrequencyPruning => {
441 config.use_mixed_precision = true;
442 }
443 _ => {
444 config.use_mixed_precision = false;
445 }
446 }
447 }
448
449 if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
451 match algorithm {
453 SparseFFTAlgorithm::CompressedSensing | SparseFFTAlgorithm::SpectralFlatness => {
454 config.use_tensor_cores = true;
455 }
456 _ => {
457 config.use_tensor_cores = false;
458 }
459 }
460 }
461
462 kernel.set_config(config);
463 Ok(kernel)
464 }
465
466 pub fn check_memory_requirements(&self, total_bytesneeded: usize) -> FFTResult<()> {
468 if total_bytesneeded > self.available_memory {
469 return Err(FFTError::MemoryError(format!(
470 "Not enough GPU memory: need {} bytes, available {} bytes",
471 total_bytesneeded, self.available_memory
472 )));
473 }
474
475 Ok(())
476 }
477}
478
479pub struct KernelLauncher {
481 factory: KernelFactory,
483 active_kernels: Vec<Box<dyn GPUKernel>>,
485 total_memory_allocated: usize,
487}
488
489impl KernelLauncher {
490 pub fn new(factory: KernelFactory) -> Self {
492 Self {
493 factory,
494 active_kernels: Vec::new(),
495 total_memory_allocated: 0,
496 }
497 }
498
499 pub fn allocate_fft_memory(&mut self, inputsize: usize) -> FFTResult<(usize, usize)> {
501 let element_size = std::mem::size_of::<Complex64>();
502 let input_bytes = inputsize * element_size;
503 let output_bytes = inputsize * element_size;
504
505 let total_bytes = input_bytes + output_bytes;
506 self.factory.check_memory_requirements(total_bytes)?;
507
508 let input_address = 0x10000;
511 let output_address = 0x20000;
512
513 self.total_memory_allocated += total_bytes;
514
515 Ok((input_address, output_address))
516 }
517
518 pub fn allocate_sparse_fft_memory(
520 &mut self,
521 input_size: usize,
522 sparsity: usize,
523 ) -> FFTResult<(usize, usize, usize)> {
524 let element_size = std::mem::size_of::<Complex64>();
525 let index_size = std::mem::size_of::<usize>();
526
527 let input_bytes = input_size * element_size;
528 let output_values_bytes = sparsity * element_size;
529 let output_indices_bytes = sparsity * index_size;
530
531 let total_bytes = input_bytes + output_values_bytes + output_indices_bytes;
532 self.factory.check_memory_requirements(total_bytes)?;
533
534 let input_address = 0x10000;
537 let output_values_address = 0x20000;
538 let output_indices_address = 0x30000;
539
540 self.total_memory_allocated += total_bytes;
541
542 Ok((input_address, output_values_address, output_indices_address))
543 }
544
545 pub fn launch_fft_kernel(
547 &mut self,
548 input_size: usize,
549 input_address: usize,
550 output_address: usize,
551 ) -> FFTResult<KernelStats> {
552 let kernel = self
553 .factory
554 .create_fft_kernel(input_size, input_address, output_address)?;
555
556 let stats = kernel.execute()?;
557
558 Ok(stats)
562 }
563
564 #[allow(clippy::too_many_arguments)]
566 pub fn launch_sparse_fft_kernel(
567 &mut self,
568 input_size: usize,
569 sparsity: usize,
570 input_address: usize,
571 output_values_address: usize,
572 output_indices_address: usize,
573 algorithm: SparseFFTAlgorithm,
574 window_function: WindowFunction,
575 ) -> FFTResult<KernelStats> {
576 let kernel = self.factory.create_sparse_fft_kernel(
577 input_size,
578 sparsity,
579 input_address,
580 output_values_address,
581 output_indices_address,
582 algorithm,
583 window_function,
584 )?;
585
586 if window_function != WindowFunction::None {
588 kernel.apply_window()?;
590 }
591
592 let stats = kernel.execute()?;
593
594 Ok(stats)
598 }
599
600 pub fn get_total_memory_allocated(&self) -> usize {
602 self.total_memory_allocated
603 }
604
605 pub fn free_all_memory(&mut self) {
607 self.active_kernels.clear();
609 self.total_memory_allocated = 0;
610 }
611}
612
613#[allow(clippy::too_many_arguments)]
632#[allow(dead_code)]
633pub fn execute_sparse_fft_kernel<T>(
634 signal: &[T],
635 sparsity: usize,
636 algorithm: SparseFFTAlgorithm,
637 window_function: WindowFunction,
638 gpu_arch: &str,
639 compute_capability: (i32, i32),
640 available_memory: usize,
641) -> FFTResult<(Vec<Complex64>, Vec<usize>, KernelStats)>
642where
643 T: NumCast + Copy + Debug + 'static,
644{
645 let factory = KernelFactory::new(
647 gpu_arch.to_string(),
648 vec![compute_capability],
649 available_memory,
650 48 * 1024, 1024, );
653
654 let mut launcher = KernelLauncher::new(factory);
656
657 let (input_address, output_values_address, output_indices_address) =
659 launcher.allocate_sparse_fft_memory(signal.len(), sparsity)?;
660
661 let stats = launcher.launch_sparse_fft_kernel(
665 signal.len(),
666 sparsity,
667 input_address,
668 output_values_address,
669 output_indices_address,
670 algorithm,
671 window_function,
672 )?;
673
674 let mut values = Vec::with_capacity(sparsity);
679 let mut indices = Vec::with_capacity(sparsity);
680
681 for i in 0..sparsity {
682 let idx = i * (signal.len() / sparsity);
683 let val = Complex64::new(1.0 / (i + 1) as f64, 0.0);
684
685 values.push(val);
686 indices.push(idx);
687 }
688
689 launcher.free_all_memory();
691
692 Ok((values, indices, stats))
693}
694
#[cfg(test)]
mod tests {
    use super::*;
    use std::f64::consts::PI;

    /// Builds `n` samples of a sum of sine waves, one per `(frequency,
    /// amplitude)` pair, over a single period.
    fn create_sparse_signal(n: usize, frequencies: &[(usize, f64)]) -> Vec<f64> {
        (0..n)
            .map(|i| {
                let t = 2.0 * PI * (i as f64) / (n as f64);
                frequencies
                    .iter()
                    .fold(0.0_f64, |sample, &(freq, amp)| {
                        sample + amp * (freq as f64 * t).sin()
                    })
            })
            .collect()
    }

    /// True when the platform reports neither CUDA nor any other GPU.
    fn gpu_missing() -> bool {
        let caps = PlatformCapabilities::detect();
        !(caps.cuda_available || caps.gpu_available)
    }

    /// Small stand-in device used when no GPU is present.
    fn mock_factory() -> KernelFactory {
        KernelFactory::new(
            "Mock Device".to_string(),
            vec![(1, 1)],
            1024 * 1024,
            16 * 1024,
            32,
        )
    }

    /// Device description used on the GPU path of the tests.
    fn rtx_3080_factory() -> KernelFactory {
        KernelFactory::new(
            "NVIDIA GeForce RTX 3080".to_string(),
            vec![(8, 6)],
            10 * 1024 * 1024 * 1024,
            48 * 1024,
            1024,
        )
    }

    #[test]
    fn test_kernel_factory() {
        if gpu_missing() {
            eprintln!("GPU not available, using mock kernel factory test");
            let factory = mock_factory();
            assert!(factory.arch.contains("Mock"));
            return;
        }

        let factory = rtx_3080_factory();

        let dense = factory.create_fft_kernel(1024, 0x10000, 0x20000).unwrap();
        let dense_config = dense.config();
        assert_eq!(dense_config.block_size, 1024);
        assert!(dense_config.use_mixed_precision);
        assert!(dense_config.use_tensor_cores);

        let sparse = factory
            .create_sparse_fft_kernel(
                1024,
                10,
                0x10000,
                0x20000,
                0x30000,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
            )
            .unwrap();
        let sparse_config = sparse.config();
        assert_eq!(sparse_config.block_size, 256);
        assert!(sparse_config.use_mixed_precision);
    }

    #[test]
    fn test_kernel_launcher() {
        if gpu_missing() {
            eprintln!("GPU not available, using mock kernel launcher test");
            let launcher = KernelLauncher::new(mock_factory());
            assert_eq!(launcher.get_total_memory_allocated(), 0);
            return;
        }

        let mut launcher = KernelLauncher::new(rtx_3080_factory());

        let (input_address, output_address) = launcher.allocate_fft_memory(1024).unwrap();
        assert_ne!(input_address, 0);
        assert_ne!(output_address, 0);

        let stats = launcher
            .launch_fft_kernel(1024, input_address, output_address)
            .unwrap();
        assert!(stats.execution_time_ms > 0.0);
        assert!(stats.memory_bandwidth_gb_s > 0.0);
        assert!(stats.compute_throughput_gflops > 0.0);

        launcher.free_all_memory();
        assert_eq!(launcher.get_total_memory_allocated(), 0);
    }

    #[test]
    fn test_execute_sparse_fft_kernel() {
        let n = 1024;
        let frequencies = vec![(3, 1.0), (7, 0.5), (15, 0.25)];
        let signal = create_sparse_signal(n, &frequencies);

        if gpu_missing() {
            eprintln!("GPU not available, using mock sparse FFT kernel test");
            let (values, indices, stats) = execute_sparse_fft_kernel(
                &signal,
                6,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
                "Mock Device",
                (1, 1),
                1024 * 1024,
            )
            .unwrap();
            assert_eq!(values.len(), 6);
            assert_eq!(indices.len(), 6);
            assert!(stats.execution_time_ms >= 0.0);
            return;
        }

        let (values, indices, stats) = execute_sparse_fft_kernel(
            &signal,
            6,
            SparseFFTAlgorithm::Sublinear,
            WindowFunction::Hann,
            "NVIDIA GeForce RTX 3080",
            (8, 6),
            10 * 1024 * 1024 * 1024,
        )
        .unwrap();

        assert_eq!(values.len(), 6);
        assert_eq!(indices.len(), 6);
        assert!(stats.execution_time_ms > 0.0);
    }
}