#![allow(clippy::needless_range_loop)]
use plotly::common::Title;
use plotly::{common::Mode, layout::Axis, Layout, Plot, Scatter};
use scirs2_core::Complex64;
use scirs2_fft::{
sparse_fft,
sparse_fft::SparseFFTAlgorithm,
sparse_fft_gpu::GPUBackend,
sparse_fft_gpu_cuda::{cuda_sparse_fft, get_cuda_devices},
sparse_fft_gpu_memory::{init_global_memory_manager, is_cuda_available},
};
use std::f64::consts::PI;
#[allow(dead_code)]
fn main() {
println!("CUDA-Accelerated Sparse FFT Example");
println!("==================================\n");
println!("Checking for CUDA availability...");
if !is_cuda_available() {
println!("CUDA is not available. Using CPU fallback implementation.");
} else {
println!("CUDA is available!");
let devices = get_cuda_devices().expect("Operation failed");
println!("Found {} CUDA device(s):", devices.len());
for (idx, device) in devices.iter().enumerate() {
println!(" - Device {} (initialized: {})", idx, device.initialized);
}
}
println!("\nInitializing GPU memory manager...");
init_global_memory_manager(
GPUBackend::CUDA,
0, scirs2_fft::sparse_fft_gpu_memory::AllocationStrategy::CacheBySize,
1024 * 1024 * 1024, )
.expect("Operation failed");
let n = 1024;
println!("\nCreating a signal with n = {n} samples and 3 frequency components");
let frequencies = vec![(3, 1.0), (7, 0.5), (15, 0.25)];
let signal = create_sparse_signal(n, &frequencies);
println!("\nComputing regular CPU sparse FFT for comparison...");
let cpu_start = std::time::Instant::now();
let cpu_result = sparse_fft(&signal, 6, Some(SparseFFTAlgorithm::Sublinear), None)
.expect("Operation failed");
let cpu_elapsed = cpu_start.elapsed();
println!(
"CPU Sparse FFT: Found {} frequency components in {:?}",
cpu_result.values.len(),
cpu_elapsed
);
println!("\nComputing CUDA-accelerated sparse FFT...");
let cuda_start = std::time::Instant::now();
let cuda_result = cuda_sparse_fft(
&signal,
6,
0, Some(SparseFFTAlgorithm::Sublinear),
None,
)
.expect("Operation failed");
let cuda_elapsed = cuda_start.elapsed();
println!(
"CUDA Sparse FFT: Found {} frequency components in {:?}",
cuda_result.values.len(),
cuda_elapsed
);
if cuda_elapsed < cpu_elapsed {
println!(
"CUDA implementation was {:.2}x faster than CPU",
cpu_elapsed.as_secs_f64() / cuda_elapsed.as_secs_f64()
);
} else {
println!(
"CPU implementation was {:.2}x faster than CUDA (this is expected for small signals)",
cuda_elapsed.as_secs_f64() / cpu_elapsed.as_secs_f64()
);
println!("For larger signals or batch processing, CUDA will show better performance");
}
println!("\nComparing CPU and CUDA results:");
println!(" Top frequency components from CPU implementation:");
let mut cpu_components: Vec<(usize, Complex64)> = Vec::new();
for (&idx, &val) in cpu_result.indices.iter().zip(cpu_result.values.iter()) {
if !cpu_components.iter().any(|(i_, _)| *i_ == idx) {
cpu_components.push((idx, val));
}
}
cpu_components.sort_by(|(_, a), (_, b)| {
b.norm()
.partial_cmp(&a.norm())
.unwrap_or(std::cmp::Ordering::Equal)
});
for (i, (idx, val)) in cpu_components.iter().take(3).enumerate() {
println!(
" {}. Index {}: magnitude = {:.3}",
i + 1,
idx,
val.norm()
);
}
println!(" Top frequency components from CUDA implementation:");
let mut cuda_components: Vec<(usize, Complex64)> = Vec::new();
for (&idx, &val) in cuda_result.indices.iter().zip(cuda_result.values.iter()) {
if !cuda_components.iter().any(|(i_, _)| *i_ == idx) {
cuda_components.push((idx, val));
}
}
cuda_components.sort_by(|(_, a), (_, b)| {
b.norm()
.partial_cmp(&a.norm())
.unwrap_or(std::cmp::Ordering::Equal)
});
for (i, (idx, val)) in cuda_components.iter().take(3).enumerate() {
println!(
" {}. Index {}: magnitude = {:.3}",
i + 1,
idx,
val.norm()
);
}
println!("\nCreating visualization...");
create_comparison_plot(&signal, &cpu_result, &cuda_result);
println!("\nTesting with a larger signal (16K samples)...");
let large_n = 16 * 1024;
let large_signal = create_sparse_signal(large_n, &frequencies);
let cpu_start = std::time::Instant::now();
let _large_cpu_result = sparse_fft(&large_signal, 6, Some(SparseFFTAlgorithm::Sublinear), None)
.expect("Operation failed");
let cpu_elapsed = cpu_start.elapsed();
let cuda_start = std::time::Instant::now();
let _large_cuda_result = cuda_sparse_fft(
&large_signal,
6,
0, Some(SparseFFTAlgorithm::Sublinear),
None,
)
.expect("Operation failed");
let cuda_elapsed = cuda_start.elapsed();
println!(" CPU elapsed time: {cpu_elapsed:?}");
println!(" CUDA elapsed time: {cuda_elapsed:?}");
if cuda_elapsed < cpu_elapsed {
println!(
" CUDA implementation was {:.2}x faster than CPU",
cpu_elapsed.as_secs_f64() / cuda_elapsed.as_secs_f64()
);
} else {
println!(
" CPU implementation was {:.2}x faster than CUDA",
cuda_elapsed.as_secs_f64() / cpu_elapsed.as_secs_f64()
);
}
println!("\nCUDA integration benefits:");
println!(" - GPU acceleration for large signals (>16K samples)");
println!(" - Efficient batch processing of multiple signals");
println!(" - Improved performance for compute-intensive algorithms");
println!(" - Better scaling with signal size compared to CPU");
println!("\nExample completed successfully!");
}
#[allow(dead_code)]
fn create_sparse_signal(n: usize, frequencies: &[(usize, f64)]) -> Vec<f64> {
let mut signal = vec![0.0; n];
for i in 0..n {
let t = 2.0 * PI * (i as f64) / (n as f64);
for &(freq, amp) in frequencies {
signal[i] += amp * (freq as f64 * t).sin();
}
}
signal
}
#[allow(dead_code)]
fn create_comparison_plot(
signal: &[f64],
cpu_result: &scirs2_fft::sparse_fft::SparseFFTResult,
cuda_result: &scirs2_fft::sparse_fft::SparseFFTResult,
) {
let mut time_plot = Plot::new();
let time_trace = Scatter::new((0..200).collect::<Vec<_>>(), signal[0..200].to_vec())
.mode(Mode::Lines)
.name("Original Signal (first 200 samples)");
time_plot.add_trace(time_trace);
time_plot.set_layout(
Layout::new()
.title(Title::with_text("Time Domain Signal"))
.x_axis(Axis::new().title(Title::with_text("Time")))
.y_axis(Axis::new().title(Title::with_text("Amplitude"))),
);
time_plot.write_html("cuda_sparse_fft_time_domain.html");
let mut freq_plot = Plot::new();
let signal_complex: Vec<Complex64> = signal.iter().map(|&x| Complex64::new(x, 0.0)).collect();
let full_spectrum = scirs2_fft::fft(&signal_complex, None).expect("Operation failed");
let full_magnitudes: Vec<f64> = full_spectrum.iter().map(|c| c.norm()).collect();
let full_trace = Scatter::new(
(0..full_magnitudes.len().min(100)).collect::<Vec<_>>(),
full_magnitudes[0..full_magnitudes.len().min(100)].to_vec(),
)
.mode(Mode::Lines)
.name("Full FFT (first 100 bins)");
let cpu_indices: Vec<_> = cpu_result
.indices
.iter()
.cloned()
.filter(|&idx| idx < 100)
.collect();
let cpu_values: Vec<_> = cpu_indices
.iter()
.map(|&idx| {
let pos = cpu_result
.indices
.iter()
.position(|&i| i == idx)
.expect("Operation failed");
cpu_result.values[pos].norm()
})
.collect();
let cpu_trace = Scatter::new(cpu_indices, cpu_values)
.mode(Mode::Markers)
.name("CPU Sparse FFT Components");
let cuda_indices: Vec<_> = cuda_result
.indices
.iter()
.cloned()
.filter(|&idx| idx < 100)
.collect();
let cuda_values: Vec<_> = cuda_indices
.iter()
.map(|&idx| {
let pos = cuda_result
.indices
.iter()
.position(|&i| i == idx)
.expect("Operation failed");
cuda_result.values[pos].norm()
})
.collect();
let cuda_trace = Scatter::new(cuda_indices, cuda_values)
.mode(Mode::Markers)
.name("CUDA Sparse FFT Components");
freq_plot.add_trace(full_trace);
freq_plot.add_trace(cpu_trace);
freq_plot.add_trace(cuda_trace);
freq_plot.set_layout(
Layout::new()
.title(Title::with_text("Frequency Domain Comparison"))
.x_axis(Axis::new().title(Title::with_text("Frequency Bin")))
.y_axis(Axis::new().title(Title::with_text("Magnitude"))),
);
freq_plot.write_html("cuda_sparse_fft_frequency_domain.html");
println!("Plots saved as 'cuda_sparse_fft_time_domain.html' and 'cuda_sparse_fft_frequency_domain.html'");
}