scirs2_fft/
sparse_fft_gpu_kernels.rs

1//! GPU kernel implementations for sparse FFT algorithms
2//!
3//! This module contains kernel implementations for various sparse FFT algorithms
4//! targeted at GPU acceleration. These kernels are designed to be highly
5//! optimized for specific GPU architectures and can be used with different
6//! GPU backends (CUDA, HIP, SYCL).
7
8use crate::error::{FFTError, FFTResult};
9use crate::sparse_fft::{SparseFFTAlgorithm, WindowFunction};
10use scirs2_core::numeric::Complex64;
11use scirs2_core::numeric::NumCast;
12use scirs2_core::simd_ops::PlatformCapabilities;
13use std::fmt::Debug;
14
/// Launch configuration for a GPU kernel.
///
/// Derives `PartialEq`/`Eq` so two configurations can be compared directly,
/// e.g. in tests or when deciding whether a kernel must be reconfigured.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelConfig {
    /// Block size for the GPU kernel (threads per block)
    pub block_size: usize,
    /// Grid size for the GPU kernel (number of blocks)
    pub grid_size: usize,
    /// Shared memory size per block in bytes
    pub shared_memory_size: usize,
    /// Whether to use mixed precision
    pub use_mixed_precision: bool,
    /// Number of registers per thread
    pub registers_per_thread: usize,
    /// Whether to use tensor cores (if available)
    pub use_tensor_cores: bool,
}
31
32impl Default for KernelConfig {
33    fn default() -> Self {
34        Self {
35            block_size: 256,
36            grid_size: 0,                  // will be computed based on input size
37            shared_memory_size: 16 * 1024, // 16 KB
38            use_mixed_precision: false,
39            registers_per_thread: 32,
40            use_tensor_cores: false,
41        }
42    }
43}
44
/// Optimization goal that a kernel implementation is tuned for
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum KernelImplementation {
    /// Tuned for maximum throughput
    Throughput,
    /// Tuned for minimum latency
    Latency,
    /// Tuned for a small memory footprint
    MemoryEfficient,
    /// Tuned for numerical accuracy
    HighAccuracy,
    /// Tuned for low power consumption
    PowerEfficient,
}
59
/// Statistics reported for one kernel execution
#[derive(Clone, Debug)]
pub struct KernelStats {
    /// Time spent executing the kernel, in milliseconds
    pub execution_time_ms: f64,
    /// Memory bandwidth used, in GB/s
    pub memory_bandwidth_gb_s: f64,
    /// Compute throughput, in GFLOPS
    pub compute_throughput_gflops: f64,
    /// Bytes transferred from host to device
    pub bytes_transferred_to_device: usize,
    /// Bytes transferred from device to host
    pub bytes_transferred_from_device: usize,
    /// Occupancy, as a percentage
    pub occupancy_percent: f64,
}
76
77/// Trait for GPU kernels
78pub trait GPUKernel {
79    /// Get kernel name
80    fn name(&self) -> &str;
81
82    /// Get kernel configuration
83    fn config(&self) -> &KernelConfig;
84
85    /// Set kernel configuration
86    fn set_config(&mut self, config: KernelConfig);
87
88    /// Execute kernel
89    fn execute(&self) -> FFTResult<KernelStats>;
90}
91
92/// Kernel for computing FFT on GPU
93#[derive(Debug)]
94pub struct FFTKernel {
95    /// Kernel configuration
96    config: KernelConfig,
97    /// Size of the input signal
98    input_size: usize,
99    /// Input data GPU memory address/identifier
100    #[allow(dead_code)]
101    input_address: usize,
102    /// Output data GPU memory address/identifier
103    #[allow(dead_code)]
104    output_address: usize,
105}
106
107impl FFTKernel {
108    /// Create a new FFT kernel
109    pub fn new(input_size: usize, input_address: usize, outputaddress: usize) -> Self {
110        let mut config = KernelConfig::default();
111        // Calculate grid _size based on input _size and block _size
112        config.grid_size = input_size.div_ceil(config.block_size);
113
114        Self {
115            config,
116            input_size,
117            input_address,
118            output_address: outputaddress,
119        }
120    }
121}
122
123impl GPUKernel for FFTKernel {
124    fn name(&self) -> &str {
125        "FFT_Kernel"
126    }
127
128    fn config(&self) -> &KernelConfig {
129        &self.config
130    }
131
132    fn set_config(&mut self, config: KernelConfig) {
133        self.config = config;
134    }
135
136    fn execute(&self) -> FFTResult<KernelStats> {
137        // This would call into device-specific FFT implementation
138        // For now, just return dummy stats
139
140        // Estimate execution time based on input size
141        let execution_time_ms = self.input_size as f64 * 0.001;
142
143        // Create dummy stats
144        let stats = KernelStats {
145            execution_time_ms,
146            memory_bandwidth_gb_s: 500.0,
147            compute_throughput_gflops: 10000.0,
148            bytes_transferred_to_device: self.input_size * std::mem::size_of::<Complex64>(),
149            bytes_transferred_from_device: self.input_size * std::mem::size_of::<Complex64>(),
150            occupancy_percent: 80.0,
151        };
152
153        Ok(stats)
154    }
155}
156
157/// Kernel for computing sparse FFT on GPU
158#[derive(Debug)]
159pub struct SparseFFTKernel {
160    /// Kernel configuration
161    config: KernelConfig,
162    /// Size of the input signal
163    input_size: usize,
164    /// Expected number of significant frequency components
165    sparsity: usize,
166    /// Input data GPU memory address/identifier
167    #[allow(dead_code)]
168    input_address: usize,
169    /// Output values GPU memory address/identifier
170    #[allow(dead_code)]
171    output_values_address: usize,
172    /// Output indices GPU memory address/identifier
173    #[allow(dead_code)]
174    output_indices_address: usize,
175    /// Algorithm to use
176    algorithm: SparseFFTAlgorithm,
177    /// Window function to apply
178    window_function: WindowFunction,
179}
180
181impl SparseFFTKernel {
182    /// Create a new sparse FFT kernel
183    #[allow(clippy::too_many_arguments)]
184    pub fn new(
185        input_size: usize,
186        sparsity: usize,
187        input_address: usize,
188        output_values_address: usize,
189        output_indices_address: usize,
190        algorithm: SparseFFTAlgorithm,
191        window_function: WindowFunction,
192    ) -> Self {
193        let mut config = KernelConfig::default();
194        // Calculate grid _size based on input _size and block _size
195        config.grid_size = input_size.div_ceil(config.block_size);
196
197        Self {
198            config,
199            input_size,
200            sparsity,
201            input_address,
202            output_values_address,
203            output_indices_address,
204            algorithm,
205            window_function,
206        }
207    }
208
209    /// Apply window function on GPU
210    pub fn apply_window(&self) -> FFTResult<KernelStats> {
211        // This would apply the selected window function on GPU
212        // For now, just return dummy stats
213        let execution_time_ms = self.input_size as f64 * 0.0001;
214
215        let stats = KernelStats {
216            execution_time_ms,
217            memory_bandwidth_gb_s: 400.0,
218            compute_throughput_gflops: 1000.0,
219            bytes_transferred_to_device: 0,
220            bytes_transferred_from_device: 0,
221            occupancy_percent: 70.0,
222        };
223
224        Ok(stats)
225    }
226
227    /// Get algorithm-specific implementation
228    pub fn get_algorithm_implementation(&self) -> FFTResult<KernelImplementation> {
229        // Choose the best implementation based on algorithm, input size, and GPU capabilities
230        match self.algorithm {
231            SparseFFTAlgorithm::Sublinear => Ok(KernelImplementation::Throughput),
232            SparseFFTAlgorithm::CompressedSensing => Ok(KernelImplementation::HighAccuracy),
233            SparseFFTAlgorithm::Iterative => Ok(KernelImplementation::Latency),
234            SparseFFTAlgorithm::Deterministic => Ok(KernelImplementation::Throughput),
235            SparseFFTAlgorithm::FrequencyPruning => Ok(KernelImplementation::MemoryEfficient),
236            SparseFFTAlgorithm::SpectralFlatness => Ok(KernelImplementation::HighAccuracy),
237        }
238    }
239}
240
241impl GPUKernel for SparseFFTKernel {
242    fn name(&self) -> &str {
243        "SparseFFT_Kernel"
244    }
245
246    fn config(&self) -> &KernelConfig {
247        &self.config
248    }
249
250    fn set_config(&mut self, config: KernelConfig) {
251        self.config = config;
252    }
253
254    fn execute(&self) -> FFTResult<KernelStats> {
255        // This would call into device-specific sparse FFT implementation
256        // For now, just return dummy stats
257
258        // Different algorithms have different performance characteristics
259        let algorithm_factor = match self.algorithm {
260            SparseFFTAlgorithm::Sublinear => 0.8,
261            SparseFFTAlgorithm::CompressedSensing => 1.5,
262            SparseFFTAlgorithm::Iterative => 1.2,
263            SparseFFTAlgorithm::Deterministic => 1.0,
264            SparseFFTAlgorithm::FrequencyPruning => 0.9,
265            SparseFFTAlgorithm::SpectralFlatness => 1.3,
266        };
267
268        // Window functions also affect performance
269        let window_factor = match self.window_function {
270            WindowFunction::None => 1.0,
271            WindowFunction::Hann => 1.1,
272            WindowFunction::Hamming => 1.1,
273            WindowFunction::Blackman => 1.2,
274            WindowFunction::FlatTop => 1.3,
275            WindowFunction::Kaiser => 1.4,
276        };
277
278        // Estimate execution time based on input size, sparsity, algorithm, and window function
279        let execution_time_ms = self.input_size as f64 * algorithm_factor * window_factor * 0.001;
280
281        // Create stats
282        let stats = KernelStats {
283            execution_time_ms,
284            memory_bandwidth_gb_s: 450.0,
285            compute_throughput_gflops: 9000.0,
286            bytes_transferred_to_device: self.input_size * std::mem::size_of::<Complex64>(),
287            bytes_transferred_from_device: (self.sparsity * 2) * std::mem::size_of::<Complex64>(),
288            occupancy_percent: 75.0,
289        };
290
291        Ok(stats)
292    }
293}
294
/// Factory that builds kernels configured for a target GPU
#[derive(Debug)]
pub struct KernelFactory {
    /// Name of the target GPU architecture
    #[allow(dead_code)]
    arch: String,
    /// Available compute capabilities as `(major, minor)` pairs
    compute_capabilities: Vec<(i32, i32)>,
    /// Available memory, in bytes
    available_memory: usize,
    /// Shared memory per block, in bytes
    shared_memory_per_block: usize,
    /// Maximum number of threads per block
    max_threads_per_block: usize,
}
310
311impl KernelFactory {
312    /// Create a new kernel factory
313    pub fn new(
314        arch: String,
315        compute_capabilities: Vec<(i32, i32)>,
316        available_memory: usize,
317        shared_memory_per_block: usize,
318        max_threads_per_block: usize,
319    ) -> Self {
320        Self {
321            arch,
322            compute_capabilities,
323            available_memory,
324            shared_memory_per_block,
325            max_threads_per_block,
326        }
327    }
328
329    /// Create an FFT kernel optimized for the target GPU
330    pub fn create_fft_kernel(
331        &self,
332        input_size: usize,
333        input_address: usize,
334        output_address: usize,
335    ) -> FFTResult<FFTKernel> {
336        let mut kernel = FFTKernel::new(input_size, input_address, output_address);
337
338        // Customize configuration based on GPU
339        let mut config = KernelConfig::default();
340
341        // Set block _size based on GPU capabilities
342        config.block_size = if self.max_threads_per_block >= 1024 {
343            1024
344        } else if self.max_threads_per_block >= 512 {
345            512
346        } else {
347            256
348        };
349
350        // Calculate grid _size
351        config.grid_size = input_size.div_ceil(config.block_size);
352
353        // Set shared memory _size
354        config.shared_memory_size = std::cmp::min(
355            self.shared_memory_per_block,
356            16 * 1024, // 16 KB default
357        );
358
359        // Enable mixed precision for newer GPUs
360        if !self.compute_capabilities.is_empty()
361            && (self.compute_capabilities[0].0 >= 7
362                || (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
363        {
364            config.use_mixed_precision = true;
365        }
366
367        // Enable tensor cores for supported architectures
368        if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
369            config.use_tensor_cores = true;
370        }
371
372        kernel.set_config(config);
373        Ok(kernel)
374    }
375
376    /// Create a sparse FFT kernel optimized for the target GPU
377    #[allow(clippy::too_many_arguments)]
378    pub fn create_sparse_fft_kernel(
379        &self,
380        input_size: usize,
381        sparsity: usize,
382        input_address: usize,
383        output_values_address: usize,
384        output_indices_address: usize,
385        algorithm: SparseFFTAlgorithm,
386        window_function: WindowFunction,
387    ) -> FFTResult<SparseFFTKernel> {
388        let mut kernel = SparseFFTKernel::new(
389            input_size,
390            sparsity,
391            input_address,
392            output_values_address,
393            output_indices_address,
394            algorithm,
395            window_function,
396        );
397
398        // Customize configuration based on GPU and algorithm
399        let mut config = KernelConfig::default();
400
401        // Optimize block _size based on algorithm
402        config.block_size = match algorithm {
403            SparseFFTAlgorithm::Sublinear => 256,
404            SparseFFTAlgorithm::CompressedSensing => 512,
405            SparseFFTAlgorithm::Iterative => 128,
406            SparseFFTAlgorithm::Deterministic => 256,
407            SparseFFTAlgorithm::FrequencyPruning => 256,
408            SparseFFTAlgorithm::SpectralFlatness => 512,
409        };
410
411        // Ensure block _size is within GPU limits
412        config.block_size = std::cmp::min(config.block_size, self.max_threads_per_block);
413
414        // Calculate grid _size
415        config.grid_size = input_size.div_ceil(config.block_size);
416
417        // Optimize shared memory based on algorithm
418        config.shared_memory_size = match algorithm {
419            SparseFFTAlgorithm::Sublinear => 16 * 1024,
420            SparseFFTAlgorithm::CompressedSensing => 32 * 1024,
421            SparseFFTAlgorithm::Iterative => 8 * 1024,
422            SparseFFTAlgorithm::Deterministic => 16 * 1024,
423            SparseFFTAlgorithm::FrequencyPruning => 16 * 1024,
424            SparseFFTAlgorithm::SpectralFlatness => 32 * 1024,
425        };
426
427        // Ensure shared memory is within GPU limits
428        config.shared_memory_size =
429            std::cmp::min(config.shared_memory_size, self.shared_memory_per_block);
430
431        // Enable mixed precision for newer GPUs and certain algorithms
432        if !self.compute_capabilities.is_empty()
433            && (self.compute_capabilities[0].0 >= 7
434                || (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
435        {
436            // Only enable for algorithms that can benefit without significant accuracy loss
437            match algorithm {
438                SparseFFTAlgorithm::Sublinear
439                | SparseFFTAlgorithm::Deterministic
440                | SparseFFTAlgorithm::FrequencyPruning => {
441                    config.use_mixed_precision = true;
442                }
443                _ => {
444                    config.use_mixed_precision = false;
445                }
446            }
447        }
448
449        // Enable tensor cores for supported architectures and algorithms
450        if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
451            // Only enable for algorithms that can benefit from tensor cores
452            match algorithm {
453                SparseFFTAlgorithm::CompressedSensing | SparseFFTAlgorithm::SpectralFlatness => {
454                    config.use_tensor_cores = true;
455                }
456                _ => {
457                    config.use_tensor_cores = false;
458                }
459            }
460        }
461
462        kernel.set_config(config);
463        Ok(kernel)
464    }
465
466    /// Check if there's enough memory for the requested operation
467    pub fn check_memory_requirements(&self, total_bytesneeded: usize) -> FFTResult<()> {
468        if total_bytesneeded > self.available_memory {
469            return Err(FFTError::MemoryError(format!(
470                "Not enough GPU memory: need {} bytes, available {} bytes",
471                total_bytesneeded, self.available_memory
472            )));
473        }
474
475        Ok(())
476    }
477}
478
479/// Kernel launcher for executing kernels with optimal parameters
480pub struct KernelLauncher {
481    /// Kernel factory for creating optimized kernels
482    factory: KernelFactory,
483    /// Active kernels
484    active_kernels: Vec<Box<dyn GPUKernel>>,
485    /// Total memory allocated
486    total_memory_allocated: usize,
487}
488
489impl KernelLauncher {
490    /// Create a new kernel launcher
491    pub fn new(factory: KernelFactory) -> Self {
492        Self {
493            factory,
494            active_kernels: Vec::new(),
495            total_memory_allocated: 0,
496        }
497    }
498
499    /// Allocate memory for FFT operation
500    pub fn allocate_fft_memory(&mut self, inputsize: usize) -> FFTResult<(usize, usize)> {
501        let element_size = std::mem::size_of::<Complex64>();
502        let input_bytes = inputsize * element_size;
503        let output_bytes = inputsize * element_size;
504
505        let total_bytes = input_bytes + output_bytes;
506        self.factory.check_memory_requirements(total_bytes)?;
507
508        // In a real implementation, this would allocate actual GPU memory
509        // For now, just return dummy addresses
510        let input_address = 0x10000;
511        let output_address = 0x20000;
512
513        self.total_memory_allocated += total_bytes;
514
515        Ok((input_address, output_address))
516    }
517
518    /// Allocate memory for sparse FFT operation
519    pub fn allocate_sparse_fft_memory(
520        &mut self,
521        input_size: usize,
522        sparsity: usize,
523    ) -> FFTResult<(usize, usize, usize)> {
524        let element_size = std::mem::size_of::<Complex64>();
525        let index_size = std::mem::size_of::<usize>();
526
527        let input_bytes = input_size * element_size;
528        let output_values_bytes = sparsity * element_size;
529        let output_indices_bytes = sparsity * index_size;
530
531        let total_bytes = input_bytes + output_values_bytes + output_indices_bytes;
532        self.factory.check_memory_requirements(total_bytes)?;
533
534        // In a real implementation, this would allocate actual GPU memory
535        // For now, just return dummy addresses
536        let input_address = 0x10000;
537        let output_values_address = 0x20000;
538        let output_indices_address = 0x30000;
539
540        self.total_memory_allocated += total_bytes;
541
542        Ok((input_address, output_values_address, output_indices_address))
543    }
544
545    /// Launch FFT kernel
546    pub fn launch_fft_kernel(
547        &mut self,
548        input_size: usize,
549        input_address: usize,
550        output_address: usize,
551    ) -> FFTResult<KernelStats> {
552        let kernel = self
553            .factory
554            .create_fft_kernel(input_size, input_address, output_address)?;
555
556        let stats = kernel.execute()?;
557
558        // In a real implementation, we would keep track of the kernel
559        // self.active_kernels.push(Box::new(kernel));
560
561        Ok(stats)
562    }
563
564    /// Launch sparse FFT kernel
565    #[allow(clippy::too_many_arguments)]
566    pub fn launch_sparse_fft_kernel(
567        &mut self,
568        input_size: usize,
569        sparsity: usize,
570        input_address: usize,
571        output_values_address: usize,
572        output_indices_address: usize,
573        algorithm: SparseFFTAlgorithm,
574        window_function: WindowFunction,
575    ) -> FFTResult<KernelStats> {
576        let kernel = self.factory.create_sparse_fft_kernel(
577            input_size,
578            sparsity,
579            input_address,
580            output_values_address,
581            output_indices_address,
582            algorithm,
583            window_function,
584        )?;
585
586        // Apply window _function if needed
587        if window_function != WindowFunction::None {
588            // Launch window kernel first
589            kernel.apply_window()?;
590        }
591
592        let stats = kernel.execute()?;
593
594        // In a real implementation, we would keep track of the kernel
595        // self.active_kernels.push(Box::new(kernel));
596
597        Ok(stats)
598    }
599
600    /// Get total memory allocated
601    pub fn get_total_memory_allocated(&self) -> usize {
602        self.total_memory_allocated
603    }
604
605    /// Free all allocated memory
606    pub fn free_all_memory(&mut self) {
607        // In a real implementation, this would free all GPU memory
608        self.active_kernels.clear();
609        self.total_memory_allocated = 0;
610    }
611}
612
613/// Execute sparse FFT on GPU using optimized kernels
614///
615/// This function provides a high-level interface to the GPU kernel implementation,
616/// handling memory allocation, kernel execution, and result collection.
617///
618/// # Arguments
619///
620/// * `signal` - Input signal
621/// * `sparsity` - Expected number of significant frequency components
622/// * `algorithm` - Sparse FFT algorithm to use
623/// * `window_function` - Window function to apply
624/// * `gpu_arch` - GPU architecture name
625/// * `compute_capability` - GPU compute capability
626/// * `available_memory` - Available GPU memory in bytes
627///
628/// # Returns
629///
630/// * Result containing sparse frequency components and kernel statistics
631#[allow(clippy::too_many_arguments)]
632#[allow(dead_code)]
633pub fn execute_sparse_fft_kernel<T>(
634    signal: &[T],
635    sparsity: usize,
636    algorithm: SparseFFTAlgorithm,
637    window_function: WindowFunction,
638    gpu_arch: &str,
639    compute_capability: (i32, i32),
640    available_memory: usize,
641) -> FFTResult<(Vec<Complex64>, Vec<usize>, KernelStats)>
642where
643    T: NumCast + Copy + Debug + 'static,
644{
645    // Create kernel factory
646    let factory = KernelFactory::new(
647        gpu_arch.to_string(),
648        vec![compute_capability],
649        available_memory,
650        48 * 1024, // 48 KB shared _memory per block
651        1024,      // 1024 threads per block
652    );
653
654    // Create kernel launcher
655    let mut launcher = KernelLauncher::new(factory);
656
657    // Allocate _memory
658    let (input_address, output_values_address, output_indices_address) =
659        launcher.allocate_sparse_fft_memory(signal.len(), sparsity)?;
660
661    // In a real implementation, this would copy the signal to GPU _memory
662
663    // Launch kernel
664    let stats = launcher.launch_sparse_fft_kernel(
665        signal.len(),
666        sparsity,
667        input_address,
668        output_values_address,
669        output_indices_address,
670        algorithm,
671        window_function,
672    )?;
673
674    // In a real implementation, this would copy the results back from GPU _memory
675    // For now, just return dummy data
676
677    // Create dummy frequency components
678    let mut values = Vec::with_capacity(sparsity);
679    let mut indices = Vec::with_capacity(sparsity);
680
681    for i in 0..sparsity {
682        let idx = i * (signal.len() / sparsity);
683        let val = Complex64::new(1.0 / (i + 1) as f64, 0.0);
684
685        values.push(val);
686        indices.push(idx);
687    }
688
689    // Free _memory
690    launcher.free_all_memory();
691
692    Ok((values, indices, stats))
693}
694
#[cfg(test)]
mod tests {
    use super::*;
    use std::f64::consts::PI;

    /// Builds a signal of `n` samples as a sum of sine waves, one per
    /// `(frequency, amplitude)` pair.
    fn create_sparse_signal(n: usize, frequencies: &[(usize, f64)]) -> Vec<f64> {
        (0..n)
            .map(|i| {
                let t = 2.0 * PI * (i as f64) / (n as f64);
                // Accumulate from 0.0 in listing order.
                frequencies
                    .iter()
                    .fold(0.0, |acc, &(freq, amp)| acc + amp * (freq as f64 * t).sin())
            })
            .collect()
    }

    /// True when the platform reports either CUDA or a generic GPU.
    fn gpu_present() -> bool {
        let caps = PlatformCapabilities::detect();
        caps.cuda_available || caps.gpu_available
    }

    /// Small stand-in device for CPU-only environments.
    fn mock_factory() -> KernelFactory {
        KernelFactory::new(
            "Mock Device".to_string(),
            vec![(1, 1)],
            1024 * 1024, // 1 MB
            16 * 1024,   // 16 KB
            32,          // 32 threads
        )
    }

    /// Factory describing an RTX 3080 class device.
    fn rtx_3080_factory() -> KernelFactory {
        KernelFactory::new(
            "NVIDIA GeForce RTX 3080".to_string(),
            vec![(8, 6)],
            10 * 1024 * 1024 * 1024, // 10 GB
            48 * 1024,               // 48 KB
            1024,                    // 1024 threads per block
        )
    }

    #[test]
    fn test_kernel_factory() {
        if !gpu_present() {
            // CPU-only environment: only verify that a factory can be built.
            eprintln!("GPU not available, using mock kernel factory test");
            let factory = mock_factory();
            assert!(factory.arch.contains("Mock"));
            return;
        }

        let factory = rtx_3080_factory();

        // Plain FFT kernel
        let fft_kernel = factory.create_fft_kernel(1024, 0x10000, 0x20000).unwrap();
        let fft_config = fft_kernel.config();
        assert_eq!(fft_config.block_size, 1024);
        assert!(fft_config.use_mixed_precision);
        assert!(fft_config.use_tensor_cores);

        // Sparse FFT kernel
        let sparse_kernel = factory
            .create_sparse_fft_kernel(
                1024,
                10,
                0x10000,
                0x20000,
                0x30000,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
            )
            .unwrap();
        let sparse_config = sparse_kernel.config();
        assert_eq!(sparse_config.block_size, 256);
        assert!(sparse_config.use_mixed_precision);
    }

    #[test]
    fn test_kernel_launcher() {
        if !gpu_present() {
            // CPU-only environment: only verify that a launcher starts empty.
            eprintln!("GPU not available, using mock kernel launcher test");
            let launcher = KernelLauncher::new(mock_factory());
            assert_eq!(launcher.get_total_memory_allocated(), 0);
            return;
        }

        let mut launcher = KernelLauncher::new(rtx_3080_factory());

        // Allocation returns non-null placeholder addresses
        let (input_address, output_address) = launcher.allocate_fft_memory(1024).unwrap();
        assert_ne!(input_address, 0);
        assert_ne!(output_address, 0);

        // Launching reports positive statistics
        let stats = launcher
            .launch_fft_kernel(1024, input_address, output_address)
            .unwrap();
        assert!(stats.execution_time_ms > 0.0);
        assert!(stats.memory_bandwidth_gb_s > 0.0);
        assert!(stats.compute_throughput_gflops > 0.0);

        // Freeing resets the allocation counter
        launcher.free_all_memory();
        assert_eq!(launcher.get_total_memory_allocated(), 0);
    }

    #[test]
    fn test_execute_sparse_fft_kernel() {
        // Signal with three sinusoidal components
        let n = 1024;
        let frequencies = vec![(3, 1.0), (7, 0.5), (15, 0.25)];
        let signal = create_sparse_signal(n, &frequencies);

        if !gpu_present() {
            // CPU-only environment: the mock device still yields placeholder data.
            eprintln!("GPU not available, using mock sparse FFT kernel test");
            let result = execute_sparse_fft_kernel(
                &signal,
                6,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
                "Mock Device",
                (1, 1),
                1024 * 1024, // 1 MB
            );
            let (values, indices, stats) = result.unwrap();
            assert_eq!(values.len(), 6);
            assert_eq!(indices.len(), 6);
            assert!(stats.execution_time_ms >= 0.0);
            return;
        }

        let (values, indices, stats) = execute_sparse_fft_kernel(
            &signal,
            6,
            SparseFFTAlgorithm::Sublinear,
            WindowFunction::Hann,
            "NVIDIA GeForce RTX 3080",
            (8, 6),
            10 * 1024 * 1024 * 1024, // 10 GB
        )
        .unwrap();

        assert_eq!(values.len(), 6);
        assert_eq!(indices.len(), 6);
        assert!(stats.execution_time_ms > 0.0);
    }
}