1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
//! Pure-Rust CUDA FFT dense path via the `oxicuda` ecosystem.
//!
//! This module provides an optional, **off-by-default** CUDA acceleration path for
//! 1-D complex FFTs, built entirely on the pure-Rust `oxicuda` crates
//! (`oxicuda-driver`, `oxicuda-memory`, `oxicuda-fft`). It is compiled only when the
//! `cuda` feature is enabled and it does **not** route through `scirs2-core`; the
//! existing CPU and `wgpu` FFT paths are untouched.
//!
//! ## Availability (runtime-probed, NVIDIA-only)
//!
//! `oxicuda-driver` loads `libcuda` at runtime. On a machine with no NVIDIA driver
//! (for example this development Mac), initialization fails and the public functions
//! return [`FFTError::BackendError`] rather than panicking. Call
//! [`cuda_is_available`] first to probe for a usable device; it never panics and
//! returns `false` when CUDA is unavailable.
//!
//! ## Normalization semantics (matches scirs2-fft CPU `fft`/`ifft`)
//!
//! `oxicuda`'s C2C transforms are **unnormalized** in both directions. To stay
//! consistent with scirs2-fft's CPU convention (SciPy-style: forward unnormalized,
//! inverse scaled by `1/N`), [`cuda_fft_1d`] applies no scaling while
//! [`cuda_ifft_1d`] divides every output element by `N`.
//!
//! ## Extensibility
//!
//! The flow (build a handle, upload host data to a [`oxicuda_memory::DeviceBuffer`],
//! create an [`oxicuda_fft::FftPlan`], execute, download) is intentionally explicit so
//! that 2-D / 3-D and R2C variants can be added by swapping the plan constructor and
//! the buffer element type without reworking the device-management boilerplate.
use crate::error::{FFTError, FFTResult};
use scirs2_core::numeric::Complex64;
/// Map an `oxicuda-fft` error into an [`FFTError`].
fn fft_err(e: oxicuda_fft::FftError) -> FFTError {
FFTError::ComputationError(format!("oxicuda-fft: {e}"))
}
/// Map an `oxicuda-driver` CUDA error into an [`FFTError`].
fn cuda_err(e: oxicuda_driver::CudaError) -> FFTError {
FFTError::BackendError(format!("oxicuda CUDA driver: {e}"))
}
/// Probe whether a usable NVIDIA CUDA device is available at runtime.
///
/// Never panics. Returns `false` when the CUDA driver cannot be initialized
/// (for example on non-NVIDIA platforms such as macOS) or when no device is present.
pub fn cuda_is_available() -> bool {
oxicuda_driver::init().is_ok()
&& oxicuda_driver::device::Device::count()
.map(|c| c > 0)
.unwrap_or(false)
}
/// Initialize the CUDA driver and build an FFT handle bound to device 0.
///
/// Returns the owning [`oxicuda_driver::Context`] (wrapped in an `Arc`) alongside the
/// [`oxicuda_fft::FftHandle`]; the caller must keep the context alive for the lifetime
/// of any plan execution.
fn build_handle() -> FFTResult<(
std::sync::Arc<oxicuda_driver::Context>,
oxicuda_fft::FftHandle,
)> {
oxicuda_driver::init().map_err(|e| FFTError::BackendError(format!("CUDA unavailable: {e}")))?;
let count = oxicuda_driver::device::Device::count().map_err(cuda_err)?;
if count <= 0 {
return Err(FFTError::BackendError(
"no NVIDIA CUDA device available".into(),
));
}
let dev = oxicuda_driver::device::Device::get(0).map_err(cuda_err)?;
let ctx = std::sync::Arc::new(oxicuda_driver::Context::new(&dev).map_err(cuda_err)?);
let handle = oxicuda_fft::FftHandle::new(&ctx).map_err(fft_err)?;
Ok((ctx, handle))
}
/// Compute a 1-D forward complex FFT on a CUDA device (unnormalized, SciPy convention).
///
/// Mirrors scirs2-fft's CPU `fft`: the forward transform applies no normalization.
/// Returns [`FFTError::BackendError`] if no NVIDIA CUDA device is available at runtime.
pub fn cuda_fft_1d(input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
if input.is_empty() {
return Ok(Vec::new());
}
let n = input.len();
// Keep the context alive (`_ctx`) until the end of the function so device memory
// and the FFT plan remain valid through `execute`/`copy_to_host`.
let (_ctx, handle) = build_handle()?;
let dev_in: Vec<oxicuda_fft::Complex<f64>> = input
.iter()
.map(|c| oxicuda_fft::Complex::<f64>::new(c.re, c.im))
.collect();
let plan = oxicuda_fft::FftPlan::new_1d(n, oxicuda_fft::FftType::C2C, 1)
.map_err(fft_err)?
.with_precision(oxicuda_fft::FftPrecision::Double);
let d_in = oxicuda_memory::DeviceBuffer::from_host(&dev_in).map_err(cuda_err)?;
let d_out =
oxicuda_memory::DeviceBuffer::<oxicuda_fft::Complex<f64>>::alloc(n).map_err(cuda_err)?;
handle
.execute(
&plan,
d_in.as_device_ptr(),
d_out.as_device_ptr(),
oxicuda_fft::FftDirection::Forward,
)
.map_err(fft_err)?;
let mut host_out = vec![oxicuda_fft::Complex::<f64>::new(0.0, 0.0); n];
d_out.copy_to_host(&mut host_out).map_err(cuda_err)?;
// Forward transform is unnormalized (matches scirs2-fft CPU `fft`): no scaling.
let out: Vec<Complex64> = host_out
.iter()
.map(|c| Complex64::new(c.re, c.im))
.collect();
Ok(out)
}
/// Compute a 1-D inverse complex FFT on a CUDA device, normalized by `1/N`.
///
/// `oxicuda`'s inverse C2C transform is unnormalized, so this function divides every
/// output element by `N` to match scirs2-fft's CPU `ifft` (SciPy convention, which
/// applies the `1/N` factor on the inverse). Returns [`FFTError::BackendError`] if no
/// NVIDIA CUDA device is available at runtime.
pub fn cuda_ifft_1d(input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
if input.is_empty() {
return Ok(Vec::new());
}
let n = input.len();
// Keep the context alive (`_ctx`) until the end of the function so device memory
// and the FFT plan remain valid through `execute`/`copy_to_host`.
let (_ctx, handle) = build_handle()?;
let dev_in: Vec<oxicuda_fft::Complex<f64>> = input
.iter()
.map(|c| oxicuda_fft::Complex::<f64>::new(c.re, c.im))
.collect();
let plan = oxicuda_fft::FftPlan::new_1d(n, oxicuda_fft::FftType::C2C, 1)
.map_err(fft_err)?
.with_precision(oxicuda_fft::FftPrecision::Double);
let d_in = oxicuda_memory::DeviceBuffer::from_host(&dev_in).map_err(cuda_err)?;
let d_out =
oxicuda_memory::DeviceBuffer::<oxicuda_fft::Complex<f64>>::alloc(n).map_err(cuda_err)?;
handle
.execute(
&plan,
d_in.as_device_ptr(),
d_out.as_device_ptr(),
oxicuda_fft::FftDirection::Inverse,
)
.map_err(fft_err)?;
let mut host_out = vec![oxicuda_fft::Complex::<f64>::new(0.0, 0.0); n];
d_out.copy_to_host(&mut host_out).map_err(cuda_err)?;
// oxicuda's inverse C2C is UNNORMALIZED; apply 1/N here so the result matches
// scirs2-fft's CPU `ifft`, which follows the SciPy convention of scaling the
// inverse transform by 1/N.
let scale = n as f64;
let out: Vec<Complex64> = host_out
.iter()
.map(|c| Complex64::new(c.re / scale, c.im / scale))
.collect();
Ok(out)
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
// CUDA driver APIs (cuCtxCreate, cuMemAlloc, cufftExec…) are serialised
// internally by the driver, but our `build_handle()` creates a new primary-
// context binding on each call. When two test threads both call
// `oxicuda_driver::Context::new` concurrently for the same device, the
// driver can return the same underlying handle to both threads, leading to
// use-after-free when the first thread drops its Arc<Context>. Serialising
// all tests in this module with a module-level Mutex eliminates the race
// without requiring any changes to the production code paths.
static CUDA_TEST_LOCK: Mutex<()> = Mutex::new(());
#[test]
fn cuda_fft_roundtrip_or_skip() {
let _guard = CUDA_TEST_LOCK.lock().expect("cuda test lock");
if !cuda_is_available() {
eprintln!("skipping: no NVIDIA CUDA device");
assert!(!cuda_is_available());
return;
}
let input: Vec<Complex64> = (0..8).map(|i| Complex64::new(i as f64, 0.0)).collect();
let spec = cuda_fft_1d(&input).expect("fwd");
let back = cuda_ifft_1d(&spec).expect("inv");
for (a, b) in input.iter().zip(back.iter()) {
assert!((a.re - b.re).abs() < 1e-6);
assert!((a.im - b.im).abs() < 1e-6);
}
}
// ---- Non-power-of-two sizes + normalization convention check ----
//
// The existing roundtrip test uses n=8 (a power of two). This test covers:
//
// (a) Forward+inverse roundtrip for n=6 and n=12 (both non-powers-of-two)
// with COMPLEX inputs that have nonzero imaginary parts, verifying that
// the device path handles mixed-radix plan sizes and that the combined
// forward/inverse normalization (forward unnormalized, inverse /N) is
// self-consistent — i.e. `cuda_ifft_1d(cuda_fft_1d(x)) ≈ x` within 1e-6.
//
// (b) Explicit normalization convention check for n=6:
// - FORWARD: `cuda_fft_1d` (unnormalized) output is compared against the
// scirs2-fft CPU `fft` for the SAME real-valued signal. Both follow the
// SciPy convention (forward unnormalized), so outputs must agree to 1e-6.
// - INVERSE: `cuda_ifft_1d` (scales by 1/N) output is compared against the
// scirs2-fft CPU `ifft` for a known real-valued spectrum [6,0,0,0,0,0],
// whose analytically correct result is [1,1,1,1,1,1] (the inverse DFT of
// a single DC bin of amplitude N). Both CPU and CUDA must agree with
// each other and with the analytic result to 1e-6.
#[test]
fn cuda_fft_non_power_of_two_or_skip() {
let _guard = CUDA_TEST_LOCK.lock().expect("cuda test lock");
if !cuda_is_available() {
eprintln!("skipping: no NVIDIA CUDA device");
assert!(!cuda_is_available());
return;
}
// ── Part (a): roundtrip for n=6 with complex inputs (nonzero imaginary parts)
let input6: Vec<Complex64> = vec![
Complex64::new(1.0, 2.0),
Complex64::new(3.0, 0.5),
Complex64::new(0.5, 1.5),
Complex64::new(2.0, 3.0),
Complex64::new(1.5, 2.5),
Complex64::new(0.7, 1.3),
];
let spec6 = cuda_fft_1d(&input6).expect("fwd n=6");
let back6 = cuda_ifft_1d(&spec6).expect("inv n=6");
for (i, (orig, recon)) in input6.iter().zip(back6.iter()).enumerate() {
let diff_re = (orig.re - recon.re).abs();
let diff_im = (orig.im - recon.im).abs();
assert!(
diff_re < 1e-6,
"n=6 roundtrip re[{i}]: orig={:.6} recon={:.6} diff={diff_re:.3e}",
orig.re,
recon.re
);
assert!(
diff_im < 1e-6,
"n=6 roundtrip im[{i}]: orig={:.6} recon={:.6} diff={diff_im:.3e}",
orig.im,
recon.im
);
}
// ── Part (a): roundtrip for n=12 with complex inputs (nonzero imaginary parts)
let input12: Vec<Complex64> = (0..12_usize)
.map(|i| Complex64::new((i as f64 * 0.7_f64).sin(), (i as f64 * 0.3_f64).cos()))
.collect();
let spec12 = cuda_fft_1d(&input12).expect("fwd n=12");
let back12 = cuda_ifft_1d(&spec12).expect("inv n=12");
for (i, (orig, recon)) in input12.iter().zip(back12.iter()).enumerate() {
let diff_re = (orig.re - recon.re).abs();
let diff_im = (orig.im - recon.im).abs();
assert!(
diff_re < 1e-6,
"n=12 roundtrip re[{i}]: orig={:.6} recon={:.6} diff={diff_re:.3e}",
orig.re,
recon.re
);
assert!(
diff_im < 1e-6,
"n=12 roundtrip im[{i}]: orig={:.6} recon={:.6} diff={diff_im:.3e}",
orig.im,
recon.im
);
}
// ── Part (b): normalization convention — forward (unnormalized) ──────────
// Use real-valued f64 inputs so we can compare against the CPU `fft`
// function (which requires T: NumCast, satisfied by f64 but not Complex64).
// Convert the same values to Complex64 (im=0) for the CUDA side.
let signal6 = [1.0_f64, 2.0, 3.0, 2.0, 1.0, 0.5];
let cpu_fwd6 = crate::fft::fft(&signal6, Some(6)).expect("cpu fft n=6");
let cuda_in6: Vec<Complex64> = signal6.iter().map(|&v| Complex64::new(v, 0.0)).collect();
let cuda_fwd6 = cuda_fft_1d(&cuda_in6).expect("cuda fft n=6 norm-check");
assert_eq!(
cpu_fwd6.len(),
cuda_fwd6.len(),
"forward output length mismatch"
);
for (k, (cpu, gpu)) in cpu_fwd6.iter().zip(cuda_fwd6.iter()).enumerate() {
let diff_re = (cpu.re - gpu.re).abs();
let diff_im = (cpu.im - gpu.im).abs();
assert!(
diff_re < 1e-6,
"fwd norm re[{k}]: cpu={:.6} cuda={:.6} diff={diff_re:.3e}",
cpu.re,
gpu.re
);
assert!(
diff_im < 1e-6,
"fwd norm im[{k}]: cpu={:.6} cuda={:.6} diff={diff_im:.3e}",
cpu.im,
gpu.im
);
}
// ── Part (b): normalization convention — inverse (scales by 1/N) ─────────
// Use the known spectrum [6, 0, 0, 0, 0, 0] for n=6. This is the DFT of
// the all-ones signal, so ifft([6,0,…,0]) must return [1,1,1,1,1,1].
// Pass as f64 to CPU ifft and as Complex64 to CUDA ifft; both must agree
// with the analytic result and with each other to 1e-6.
let dc_spectrum_f64 = [6.0_f64, 0.0, 0.0, 0.0, 0.0, 0.0];
let cpu_inv6 = crate::fft::ifft(&dc_spectrum_f64, Some(6)).expect("cpu ifft n=6");
let dc_spectrum_cx: Vec<Complex64> = dc_spectrum_f64
.iter()
.map(|&v| Complex64::new(v, 0.0))
.collect();
let cuda_inv6 = cuda_ifft_1d(&dc_spectrum_cx).expect("cuda ifft n=6 norm-check");
assert_eq!(cpu_inv6.len(), 6, "cpu ifft output length");
assert_eq!(cuda_inv6.len(), 6, "cuda ifft output length");
for k in 0..6 {
// Analytic: ifft([6,0,…,0])[n] = (1/6) * 6 * exp(0) = 1 for all n.
let expected = 1.0_f64;
let diff_cpu = (cpu_inv6[k].re - expected).abs();
let diff_cuda = (cuda_inv6[k].re - expected).abs();
let diff_cpu_im = cpu_inv6[k].im.abs();
let diff_cuda_im = cuda_inv6[k].im.abs();
let cpu_vs_cuda_re = (cpu_inv6[k].re - cuda_inv6[k].re).abs();
let cpu_vs_cuda_im = (cpu_inv6[k].im - cuda_inv6[k].im).abs();
assert!(
diff_cpu < 1e-6,
"inv norm cpu re[{k}]: got={:.6} expected={expected} diff={diff_cpu:.3e}",
cpu_inv6[k].re
);
assert!(
diff_cuda < 1e-6,
"inv norm cuda re[{k}]: got={:.6} expected={expected} diff={diff_cuda:.3e}",
cuda_inv6[k].re
);
assert!(
diff_cpu_im < 1e-6,
"inv norm cpu im[{k}] should be ~0: got={:.3e}",
cpu_inv6[k].im
);
assert!(
diff_cuda_im < 1e-6,
"inv norm cuda im[{k}] should be ~0: got={:.3e}",
cuda_inv6[k].im
);
assert!(
cpu_vs_cuda_re < 1e-6,
"cpu vs cuda inv re[{k}] diff={cpu_vs_cuda_re:.3e}"
);
assert!(
cpu_vs_cuda_im < 1e-6,
"cpu vs cuda inv im[{k}] diff={cpu_vs_cuda_im:.3e}"
);
}
}
}