use crate::error::{FFTError, FFTResult};
use crate::sparse_fft::{SparseFFTAlgorithm, WindowFunction};
use scirs2_core::numeric::Complex64;
use scirs2_core::numeric::NumCast;
use scirs2_core::simd_ops::PlatformCapabilities;
use std::fmt::Debug;
/// Launch parameters attached to a GPU kernel.
///
/// `KernelConfig::default()` supplies baseline values; the kernel constructors and
/// `KernelFactory` in this file then override individual fields.
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// Threads per block. The sparse-kernel factory path caps this at the device's
    /// `max_threads_per_block`.
    pub block_size: usize,
    /// Number of blocks. Defaults to `0`; constructors set it to
    /// `input_size.div_ceil(block_size)`.
    pub grid_size: usize,
    /// Shared memory requested per block (presumably in bytes; the factory caps it
    /// at `shared_memory_per_block`).
    pub shared_memory_size: usize,
    /// Whether mixed-precision computation is enabled.
    pub use_mixed_precision: bool,
    /// Registers per thread.
    pub registers_per_thread: usize,
    /// Whether tensor cores are enabled.
    pub use_tensor_cores: bool,
}

impl Default for KernelConfig {
    /// Baseline configuration: 256 threads per block, an unset (`0`) grid size,
    /// 16 KiB of shared memory, 32 registers per thread, and both optional
    /// hardware features switched off.
    fn default() -> Self {
        const DEFAULT_BLOCK_SIZE: usize = 256;
        const DEFAULT_SHARED_MEMORY: usize = 16 * 1024;
        const DEFAULT_REGISTERS_PER_THREAD: usize = 32;

        KernelConfig {
            block_size: DEFAULT_BLOCK_SIZE,
            // Left at zero here; it depends on the input size, which is unknown at this point.
            grid_size: 0,
            shared_memory_size: DEFAULT_SHARED_MEMORY,
            use_mixed_precision: false,
            registers_per_thread: DEFAULT_REGISTERS_PER_THREAD,
            use_tensor_cores: false,
        }
    }
}
/// Implementation strategy associated with a kernel.
///
/// `SparseFFTKernel::get_algorithm_implementation` maps each `SparseFFTAlgorithm`
/// variant to one of these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KernelImplementation {
/// Returned for the `Sublinear` and `Deterministic` algorithms.
Throughput,
/// Returned for the `Iterative` algorithm.
Latency,
/// Returned for the `FrequencyPruning` algorithm.
MemoryEfficient,
/// Returned for the `CompressedSensing` and `SpectralFlatness` algorithms.
HighAccuracy,
/// Not produced by the algorithm mapping in `SparseFFTKernel`.
PowerEfficient,
}
/// Performance figures reported by a kernel's `execute` call.
///
/// The `execute` implementations in this file fill these fields with fixed or
/// size-proportional placeholder numbers rather than measured values.
#[derive(Debug, Clone)]
pub struct KernelStats {
/// Execution time; the `_ms` suffix indicates milliseconds.
pub execution_time_ms: f64,
/// Memory bandwidth; the `_gb_s` suffix indicates GB/s.
pub memory_bandwidth_gb_s: f64,
/// Compute throughput; the `_gflops` suffix indicates GFLOP/s.
pub compute_throughput_gflops: f64,
/// Bytes copied to the device (element count times `size_of::<Complex64>()` in this file).
pub bytes_transferred_to_device: usize,
/// Bytes copied back from the device.
pub bytes_transferred_from_device: usize,
/// Occupancy as a percentage.
pub occupancy_percent: f64,
}
/// Common interface implemented by the kernel types in this file.
pub trait GPUKernel {
/// Returns the kernel's name.
fn name(&self) -> &str;
/// Returns the kernel's current launch configuration.
fn config(&self) -> &KernelConfig;
/// Replaces the kernel's launch configuration.
fn set_config(&mut self, config: KernelConfig);
/// Runs the kernel and returns its statistics, or an error.
fn execute(&self) -> FFTResult<KernelStats>;
}
/// Dense FFT kernel description: launch configuration, input length, and the
/// input/output buffer addresses it was created with.
#[derive(Debug)]
pub struct FFTKernel {
// Launch configuration; replaced wholesale by `set_config`.
config: KernelConfig,
// Number of input elements; drives the simulated timing and transfer sizes in `execute`.
input_size: usize,
// Stored at construction but not read by this type's methods, hence the lint allowance.
#[allow(dead_code)]
input_address: usize,
// Stored at construction but not read by this type's methods, hence the lint allowance.
#[allow(dead_code)]
output_address: usize,
}
impl FFTKernel {
pub fn new(input_size: usize, input_address: usize, outputaddress: usize) -> Self {
let mut config = KernelConfig::default();
config.grid_size = input_size.div_ceil(config.block_size);
Self {
config,
input_size,
input_address,
output_address: outputaddress,
}
}
}
impl GPUKernel for FFTKernel {
    /// Fixed identifier of this kernel type.
    fn name(&self) -> &str {
        "FFT_Kernel"
    }

    /// Borrows the current launch configuration.
    fn config(&self) -> &KernelConfig {
        &self.config
    }

    /// Replaces the launch configuration.
    fn set_config(&mut self, config: KernelConfig) {
        self.config = config;
    }

    /// Produces simulated statistics for a dense FFT.
    ///
    /// Execution time scales linearly with the input size (0.001 ms per
    /// element); bandwidth, throughput and occupancy are fixed constants. The
    /// same number of bytes is reported in both transfer directions.
    fn execute(&self) -> FFTResult<KernelStats> {
        // The input and output buffers each hold `input_size` complex values.
        let buffer_bytes = self.input_size * std::mem::size_of::<Complex64>();

        Ok(KernelStats {
            execution_time_ms: self.input_size as f64 * 0.001,
            memory_bandwidth_gb_s: 500.0,
            compute_throughput_gflops: 10000.0,
            bytes_transferred_to_device: buffer_bytes,
            bytes_transferred_from_device: buffer_bytes,
            occupancy_percent: 80.0,
        })
    }
}
/// Sparse FFT kernel description: launch configuration, problem size, buffer
/// addresses, and the algorithm/window selection that shape its simulated cost.
#[derive(Debug)]
pub struct SparseFFTKernel {
// Launch configuration; replaced wholesale by `set_config`.
config: KernelConfig,
// Number of input elements.
input_size: usize,
// Requested number of components; `execute` reports `sparsity * 2` complex values read back.
sparsity: usize,
// Stored at construction but not read by this type's methods, hence the lint allowance.
#[allow(dead_code)]
input_address: usize,
// Stored at construction but not read by this type's methods, hence the lint allowance.
#[allow(dead_code)]
output_values_address: usize,
// Stored at construction but not read by this type's methods, hence the lint allowance.
#[allow(dead_code)]
output_indices_address: usize,
// Selects the timing factor in `execute` and the result of `get_algorithm_implementation`.
algorithm: SparseFFTAlgorithm,
// Selects the window timing factor in `execute`.
window_function: WindowFunction,
}
impl SparseFFTKernel {
#[allow(clippy::too_many_arguments)]
pub fn new(
input_size: usize,
sparsity: usize,
input_address: usize,
output_values_address: usize,
output_indices_address: usize,
algorithm: SparseFFTAlgorithm,
window_function: WindowFunction,
) -> Self {
let mut config = KernelConfig::default();
config.grid_size = input_size.div_ceil(config.block_size);
Self {
config,
input_size,
sparsity,
input_address,
output_values_address,
output_indices_address,
algorithm,
window_function,
}
}
pub fn apply_window(&self) -> FFTResult<KernelStats> {
let execution_time_ms = self.input_size as f64 * 0.0001;
let stats = KernelStats {
execution_time_ms,
memory_bandwidth_gb_s: 400.0,
compute_throughput_gflops: 1000.0,
bytes_transferred_to_device: 0,
bytes_transferred_from_device: 0,
occupancy_percent: 70.0,
};
Ok(stats)
}
pub fn get_algorithm_implementation(&self) -> FFTResult<KernelImplementation> {
match self.algorithm {
SparseFFTAlgorithm::Sublinear => Ok(KernelImplementation::Throughput),
SparseFFTAlgorithm::CompressedSensing => Ok(KernelImplementation::HighAccuracy),
SparseFFTAlgorithm::Iterative => Ok(KernelImplementation::Latency),
SparseFFTAlgorithm::Deterministic => Ok(KernelImplementation::Throughput),
SparseFFTAlgorithm::FrequencyPruning => Ok(KernelImplementation::MemoryEfficient),
SparseFFTAlgorithm::SpectralFlatness => Ok(KernelImplementation::HighAccuracy),
}
}
}
impl GPUKernel for SparseFFTKernel {
    /// Fixed identifier of this kernel type.
    fn name(&self) -> &str {
        "SparseFFT_Kernel"
    }

    /// Borrows the current launch configuration.
    fn config(&self) -> &KernelConfig {
        &self.config
    }

    /// Replaces the launch configuration.
    fn set_config(&mut self, config: KernelConfig) {
        self.config = config;
    }

    /// Produces simulated statistics for a sparse FFT.
    ///
    /// Execution time is `input_size * algorithm factor * window factor * 0.001`
    /// milliseconds. The whole input is reported as uploaded and `2 * sparsity`
    /// complex values as downloaded; the other figures are fixed constants.
    fn execute(&self) -> FFTResult<KernelStats> {
        // Relative cost of each algorithm (1.0 is the `Deterministic` baseline).
        let algorithm_cost: f64 = match self.algorithm {
            SparseFFTAlgorithm::Sublinear => 0.8,
            SparseFFTAlgorithm::FrequencyPruning => 0.9,
            SparseFFTAlgorithm::Deterministic => 1.0,
            SparseFFTAlgorithm::Iterative => 1.2,
            SparseFFTAlgorithm::SpectralFlatness => 1.3,
            SparseFFTAlgorithm::CompressedSensing => 1.5,
        };
        // Relative cost of each window (1.0 means no windowing).
        let window_cost: f64 = match self.window_function {
            WindowFunction::None => 1.0,
            WindowFunction::Hann | WindowFunction::Hamming => 1.1,
            WindowFunction::Blackman => 1.2,
            WindowFunction::FlatTop => 1.3,
            WindowFunction::Kaiser => 1.4,
        };

        let element_bytes = std::mem::size_of::<Complex64>();

        Ok(KernelStats {
            execution_time_ms: self.input_size as f64 * algorithm_cost * window_cost * 0.001,
            memory_bandwidth_gb_s: 450.0,
            compute_throughput_gflops: 9000.0,
            bytes_transferred_to_device: self.input_size * element_bytes,
            bytes_transferred_from_device: (self.sparsity * 2) * element_bytes,
            occupancy_percent: 75.0,
        })
    }
}
/// Device description used to build kernels with device-appropriate
/// configurations and to validate memory requests.
#[derive(Debug)]
pub struct KernelFactory {
// Device/architecture name; stored but only read by the tests in this file.
#[allow(dead_code)]
arch: String,
// `(major, minor)` compute capabilities; only the first entry is consulted by the factory methods.
compute_capabilities: Vec<(i32, i32)>,
// Upper bound used by `check_memory_requirements` (compared against a byte count).
available_memory: usize,
// Cap applied to each kernel's `shared_memory_size`.
shared_memory_per_block: usize,
// Device limit used when choosing each kernel's `block_size`.
max_threads_per_block: usize,
}
impl KernelFactory {
pub fn new(
arch: String,
compute_capabilities: Vec<(i32, i32)>,
available_memory: usize,
shared_memory_per_block: usize,
max_threads_per_block: usize,
) -> Self {
Self {
arch,
compute_capabilities,
available_memory,
shared_memory_per_block,
max_threads_per_block,
}
}
pub fn create_fft_kernel(
&self,
input_size: usize,
input_address: usize,
output_address: usize,
) -> FFTResult<FFTKernel> {
let mut kernel = FFTKernel::new(input_size, input_address, output_address);
let mut config = KernelConfig::default();
config.block_size = if self.max_threads_per_block >= 1024 {
1024
} else if self.max_threads_per_block >= 512 {
512
} else {
256
};
config.grid_size = input_size.div_ceil(config.block_size);
config.shared_memory_size = std::cmp::min(
self.shared_memory_per_block,
16 * 1024, );
if !self.compute_capabilities.is_empty()
&& (self.compute_capabilities[0].0 >= 7
|| (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
{
config.use_mixed_precision = true;
}
if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
config.use_tensor_cores = true;
}
kernel.set_config(config);
Ok(kernel)
}
#[allow(clippy::too_many_arguments)]
pub fn create_sparse_fft_kernel(
&self,
input_size: usize,
sparsity: usize,
input_address: usize,
output_values_address: usize,
output_indices_address: usize,
algorithm: SparseFFTAlgorithm,
window_function: WindowFunction,
) -> FFTResult<SparseFFTKernel> {
let mut kernel = SparseFFTKernel::new(
input_size,
sparsity,
input_address,
output_values_address,
output_indices_address,
algorithm,
window_function,
);
let mut config = KernelConfig::default();
config.block_size = match algorithm {
SparseFFTAlgorithm::Sublinear => 256,
SparseFFTAlgorithm::CompressedSensing => 512,
SparseFFTAlgorithm::Iterative => 128,
SparseFFTAlgorithm::Deterministic => 256,
SparseFFTAlgorithm::FrequencyPruning => 256,
SparseFFTAlgorithm::SpectralFlatness => 512,
};
config.block_size = std::cmp::min(config.block_size, self.max_threads_per_block);
config.grid_size = input_size.div_ceil(config.block_size);
config.shared_memory_size = match algorithm {
SparseFFTAlgorithm::Sublinear => 16 * 1024,
SparseFFTAlgorithm::CompressedSensing => 32 * 1024,
SparseFFTAlgorithm::Iterative => 8 * 1024,
SparseFFTAlgorithm::Deterministic => 16 * 1024,
SparseFFTAlgorithm::FrequencyPruning => 16 * 1024,
SparseFFTAlgorithm::SpectralFlatness => 32 * 1024,
};
config.shared_memory_size =
std::cmp::min(config.shared_memory_size, self.shared_memory_per_block);
if !self.compute_capabilities.is_empty()
&& (self.compute_capabilities[0].0 >= 7
|| (self.compute_capabilities[0].0 == 6 && self.compute_capabilities[0].1 >= 1))
{
match algorithm {
SparseFFTAlgorithm::Sublinear
| SparseFFTAlgorithm::Deterministic
| SparseFFTAlgorithm::FrequencyPruning => {
config.use_mixed_precision = true;
}
_ => {
config.use_mixed_precision = false;
}
}
}
if !self.compute_capabilities.is_empty() && self.compute_capabilities[0].0 >= 7 {
match algorithm {
SparseFFTAlgorithm::CompressedSensing | SparseFFTAlgorithm::SpectralFlatness => {
config.use_tensor_cores = true;
}
_ => {
config.use_tensor_cores = false;
}
}
}
kernel.set_config(config);
Ok(kernel)
}
pub fn check_memory_requirements(&self, total_bytesneeded: usize) -> FFTResult<()> {
if total_bytesneeded > self.available_memory {
return Err(FFTError::MemoryError(format!(
"Not enough GPU memory: need {} bytes, available {} bytes",
total_bytesneeded, self.available_memory
)));
}
Ok(())
}
}
/// Front end that allocates (simulated) device buffers, builds kernels through
/// a `KernelFactory`, and runs them.
pub struct KernelLauncher {
// Builds kernels and enforces the device memory budget.
factory: KernelFactory,
// Cleared by `free_all_memory`; the launcher's methods never add to it.
active_kernels: Vec<Box<dyn GPUKernel>>,
// Running total of bytes handed out by the `allocate_*` methods.
total_memory_allocated: usize,
}
impl KernelLauncher {
    /// Creates a launcher with no active kernels and nothing allocated.
    pub fn new(factory: KernelFactory) -> Self {
        Self {
            factory,
            active_kernels: Vec::new(),
            total_memory_allocated: 0,
        }
    }

    /// Adds `additional_bytes` to the running allocation total after checking
    /// that the new total still fits the factory's memory budget.
    ///
    /// The check covers memory already handed out, not just this request, so
    /// repeated allocations cannot exceed the device budget. The addition
    /// saturates, so an overflowing total is rejected rather than wrapping to a
    /// small value. On failure the running total is left unchanged.
    fn reserve(&mut self, additional_bytes: usize) -> FFTResult<()> {
        let required = self.total_memory_allocated.saturating_add(additional_bytes);
        self.factory.check_memory_requirements(required)?;
        self.total_memory_allocated = required;
        Ok(())
    }

    /// Reserves input and output buffers for a dense FFT of `inputsize`
    /// complex elements and returns `(input_address, output_address)`.
    ///
    /// The addresses are fixed placeholder values.
    ///
    /// # Errors
    ///
    /// Returns the factory's memory error when the request, together with
    /// memory already allocated through this launcher, exceeds the available
    /// memory. The byte count in the error message is that cumulative total.
    pub fn allocate_fft_memory(&mut self, inputsize: usize) -> FFTResult<(usize, usize)> {
        // Saturating arithmetic: an overflowing size becomes `usize::MAX` and is
        // then rejected by the budget check instead of wrapping around.
        let buffer_bytes = inputsize.saturating_mul(std::mem::size_of::<Complex64>());
        // Input and output buffers have the same size.
        self.reserve(buffer_bytes.saturating_mul(2))?;

        let input_address = 0x10000;
        let output_address = 0x20000;
        Ok((input_address, output_address))
    }

    /// Reserves the input buffer plus value and index output buffers for a
    /// sparse FFT and returns
    /// `(input_address, output_values_address, output_indices_address)`.
    ///
    /// The addresses are fixed placeholder values.
    ///
    /// # Errors
    ///
    /// Returns the factory's memory error when the request, together with
    /// memory already allocated through this launcher, exceeds the available
    /// memory. The byte count in the error message is that cumulative total.
    pub fn allocate_sparse_fft_memory(
        &mut self,
        input_size: usize,
        sparsity: usize,
    ) -> FFTResult<(usize, usize, usize)> {
        let element_size = std::mem::size_of::<Complex64>();
        let index_size = std::mem::size_of::<usize>();

        // Saturating arithmetic keeps oversized requests from wrapping past the check.
        let input_bytes = input_size.saturating_mul(element_size);
        let output_values_bytes = sparsity.saturating_mul(element_size);
        let output_indices_bytes = sparsity.saturating_mul(index_size);
        let total_bytes = input_bytes
            .saturating_add(output_values_bytes)
            .saturating_add(output_indices_bytes);
        self.reserve(total_bytes)?;

        let input_address = 0x10000;
        let output_values_address = 0x20000;
        let output_indices_address = 0x30000;
        Ok((input_address, output_values_address, output_indices_address))
    }

    /// Builds a dense FFT kernel through the factory, executes it, and returns
    /// its statistics.
    ///
    /// # Errors
    ///
    /// Propagates any error from kernel creation or execution.
    pub fn launch_fft_kernel(
        &mut self,
        input_size: usize,
        input_address: usize,
        output_address: usize,
    ) -> FFTResult<KernelStats> {
        self.factory
            .create_fft_kernel(input_size, input_address, output_address)?
            .execute()
    }

    /// Builds a sparse FFT kernel through the factory, runs the windowing pass
    /// when a window is selected, executes the kernel, and returns the
    /// statistics of the main execution.
    ///
    /// # Errors
    ///
    /// Propagates any error from kernel creation, windowing, or execution.
    #[allow(clippy::too_many_arguments)]
    pub fn launch_sparse_fft_kernel(
        &mut self,
        input_size: usize,
        sparsity: usize,
        input_address: usize,
        output_values_address: usize,
        output_indices_address: usize,
        algorithm: SparseFFTAlgorithm,
        window_function: WindowFunction,
    ) -> FFTResult<KernelStats> {
        let kernel = self.factory.create_sparse_fft_kernel(
            input_size,
            sparsity,
            input_address,
            output_values_address,
            output_indices_address,
            algorithm,
            window_function,
        )?;

        if window_function != WindowFunction::None {
            // Only failure of the windowing pass matters here; its statistics
            // are not merged into the returned figures.
            kernel.apply_window()?;
        }
        kernel.execute()
    }

    /// Total bytes currently reserved through the `allocate_*` methods.
    pub fn get_total_memory_allocated(&self) -> usize {
        self.total_memory_allocated
    }

    /// Drops all tracked kernels and resets the allocation total to zero.
    pub fn free_all_memory(&mut self) {
        self.active_kernels.clear();
        self.total_memory_allocated = 0;
    }
}
/// Runs a simulated sparse FFT on `signal` and returns placeholder results.
///
/// A `KernelFactory` is built from the given device description (with fixed
/// limits of 48 KiB shared memory and 1024 threads per block), buffers are
/// reserved, and the sparse kernel is launched to obtain `KernelStats`.
///
/// The returned values and indices are synthetic and do not depend on the
/// contents of `signal`: component `i` has value `1 / (i + 1)` and index
/// `i * (signal.len() / sparsity)`. When `sparsity` exceeds `signal.len()` the
/// integer division yields zero, so every index is `0`.
///
/// # Errors
///
/// Propagates errors from memory reservation and kernel launch.
#[allow(clippy::too_many_arguments)]
#[allow(dead_code)]
pub fn execute_sparse_fft_kernel<T>(
    signal: &[T],
    sparsity: usize,
    algorithm: SparseFFTAlgorithm,
    window_function: WindowFunction,
    gpu_arch: &str,
    compute_capability: (i32, i32),
    available_memory: usize,
) -> FFTResult<(Vec<Complex64>, Vec<usize>, KernelStats)>
where
    T: NumCast + Copy + Debug + 'static,
{
    let signal_len = signal.len();

    let mut launcher = KernelLauncher::new(KernelFactory::new(
        gpu_arch.to_string(),
        vec![compute_capability],
        available_memory,
        48 * 1024,
        1024,
    ));

    let (input_address, output_values_address, output_indices_address) =
        launcher.allocate_sparse_fft_memory(signal_len, sparsity)?;
    let stats = launcher.launch_sparse_fft_kernel(
        signal_len,
        sparsity,
        input_address,
        output_values_address,
        output_indices_address,
        algorithm,
        window_function,
    )?;

    // The division stays inside the closure so it is never evaluated when
    // `sparsity` is zero (the range is empty in that case).
    let (values, indices): (Vec<Complex64>, Vec<usize>) = (0..sparsity)
        .map(|component| {
            let value = Complex64::new(1.0 / (component + 1) as f64, 0.0);
            let index = component * (signal_len / sparsity);
            (value, index)
        })
        .unzip();

    launcher.free_all_memory();
    Ok((values, indices, stats))
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::f64::consts::PI;

    /// Builds `n` samples of a sum of sines, one per `(frequency, amplitude)` pair.
    fn create_sparse_signal(n: usize, frequencies: &[(usize, f64)]) -> Vec<f64> {
        (0..n)
            .map(|sample| {
                let t = 2.0 * PI * (sample as f64) / (n as f64);
                frequencies
                    .iter()
                    .fold(0.0_f64, |acc, &(freq, amp)| acc + amp * (freq as f64 * t).sin())
            })
            .collect()
    }

    /// True when the platform reports CUDA or any GPU.
    fn gpu_detected() -> bool {
        let caps = PlatformCapabilities::detect();
        caps.cuda_available || caps.gpu_available
    }

    /// Small mock device used when no GPU is detected.
    fn mock_factory() -> KernelFactory {
        KernelFactory::new(
            "Mock Device".to_string(),
            vec![(1, 1)],
            1024 * 1024,
            16 * 1024,
            32,
        )
    }

    /// Device description used on the GPU path of the tests.
    fn rtx_3080_factory() -> KernelFactory {
        KernelFactory::new(
            "NVIDIA GeForce RTX 3080".to_string(),
            vec![(8, 6)],
            10 * 1024 * 1024 * 1024,
            48 * 1024,
            1024,
        )
    }

    #[test]
    fn test_kernel_factory() {
        if !gpu_detected() {
            eprintln!("GPU not available, using mock kernel factory test");
            let factory = mock_factory();
            assert!(factory.arch.contains("Mock"));
            return;
        }

        let factory = rtx_3080_factory();

        let fft_kernel = factory
            .create_fft_kernel(1024, 0x10000, 0x20000)
            .expect("Operation failed");
        let fft_config = fft_kernel.config();
        assert_eq!(fft_config.block_size, 1024);
        assert!(fft_config.use_mixed_precision);
        assert!(fft_config.use_tensor_cores);

        let sparse_kernel = factory
            .create_sparse_fft_kernel(
                1024,
                10,
                0x10000,
                0x20000,
                0x30000,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
            )
            .expect("Operation failed");
        let sparse_config = sparse_kernel.config();
        assert_eq!(sparse_config.block_size, 256);
        assert!(sparse_config.use_mixed_precision);
    }

    #[test]
    fn test_kernel_launcher() {
        if !gpu_detected() {
            eprintln!("GPU not available, using mock kernel launcher test");
            let launcher = KernelLauncher::new(mock_factory());
            assert_eq!(launcher.get_total_memory_allocated(), 0);
            return;
        }

        let mut launcher = KernelLauncher::new(rtx_3080_factory());

        let (input_address, output_address) = launcher
            .allocate_fft_memory(1024)
            .expect("Operation failed");
        assert_ne!(input_address, 0);
        assert_ne!(output_address, 0);

        let stats = launcher
            .launch_fft_kernel(1024, input_address, output_address)
            .expect("Operation failed");
        assert!(stats.execution_time_ms > 0.0);
        assert!(stats.memory_bandwidth_gb_s > 0.0);
        assert!(stats.compute_throughput_gflops > 0.0);

        launcher.free_all_memory();
        assert_eq!(launcher.get_total_memory_allocated(), 0);
    }

    #[test]
    fn test_execute_sparse_fft_kernel() {
        let n = 1024;
        let frequencies = vec![(3, 1.0), (7, 0.5), (15, 0.25)];
        let signal = create_sparse_signal(n, &frequencies);

        if !gpu_detected() {
            eprintln!("GPU not available, using mock sparse FFT kernel test");
            let (values, indices, stats) = execute_sparse_fft_kernel(
                &signal,
                6,
                SparseFFTAlgorithm::Sublinear,
                WindowFunction::Hann,
                "Mock Device",
                (1, 1),
                1024 * 1024,
            )
            .expect("Operation failed");
            assert_eq!(values.len(), 6);
            assert_eq!(indices.len(), 6);
            assert!(stats.execution_time_ms >= 0.0);
            return;
        }

        let (values, indices, stats) = execute_sparse_fft_kernel(
            &signal,
            6,
            SparseFFTAlgorithm::Sublinear,
            WindowFunction::Hann,
            "NVIDIA GeForce RTX 3080",
            (8, 6),
            10 * 1024 * 1024 * 1024,
        )
        .expect("Operation failed");
        assert_eq!(values.len(), 6);
        assert_eq!(indices.len(), 6);
        assert!(stats.execution_time_ms > 0.0);
    }
}