oxifft/dft/codelets/simd/
mod.rs

1//! SIMD-optimized codelets.
2//!
3//! This module provides SIMD-accelerated versions of the small DFT kernels.
4//! The codelets use the architecture-specific SIMD backends when available.
5
6// Items after statements are intentional for precomputed twiddle tables
7#![allow(clippy::items_after_statements)]
8// Large stack arrays are intentional for performance in fixed-size transforms
9#![allow(clippy::large_stack_arrays)]
10
11use core::any::TypeId;
12
13use crate::kernel::{Complex, Float};
14use crate::simd::{detect_simd_level, SimdLevel};
15
16pub(crate) mod backends;
17mod large_sizes;
18mod small_sizes;
19#[cfg(test)]
20mod tests;
21
22pub use large_sizes::*;
23pub use small_sizes::*;
24
25/// Detect if SIMD acceleration is available and beneficial.
26#[inline]
27pub fn simd_available() -> bool {
28    let level = detect_simd_level();
29    matches!(
30        level,
31        SimdLevel::Sse2 | SimdLevel::Avx | SimdLevel::Avx2 | SimdLevel::Avx512 | SimdLevel::Neon
32    )
33}
34
35/// Size-2 DFT with automatic SIMD dispatch.
36///
37/// This function selects the best implementation based on available CPU features
38/// and the float type. For f64, uses SIMD acceleration when available.
39#[inline]
40pub fn notw_2_dispatch<T: Float>(x: &mut [Complex<T>]) {
41    // Check if T is f64 at runtime
42    if TypeId::of::<T>() == TypeId::of::<f64>() {
43        // Safety: We verified T is f64, so the memory layout is identical
44        let x_f64 = unsafe {
45            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
46        };
47        notw_2_simd_f64(x_f64);
48        return;
49    }
50    // Fallback to scalar for other types
51    super::notw_2(x);
52}
53
54/// Size-4 DFT with automatic SIMD dispatch.
55///
56/// This function selects the best implementation based on available CPU features
57/// and the float type. For f64, uses SIMD acceleration when available.
58#[inline]
59pub fn notw_4_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
60    // Check if T is f64 at runtime
61    if TypeId::of::<T>() == TypeId::of::<f64>() {
62        // Safety: We verified T is f64, so the memory layout is identical
63        let x_f64 = unsafe {
64            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
65        };
66        notw_4_simd_f64(x_f64, sign);
67        return;
68    }
69    // Fallback to scalar for other types
70    super::notw_4(x, sign);
71}
72
73/// Size-8 DFT with automatic SIMD dispatch.
74///
75/// This function selects the best implementation based on available CPU features
76/// and the float type. For f64, uses SIMD acceleration when available.
77#[inline]
78pub fn notw_8_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
79    // Check if T is f64 at runtime
80    if TypeId::of::<T>() == TypeId::of::<f64>() {
81        // Safety: We verified T is f64, so the memory layout is identical
82        let x_f64 = unsafe {
83            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
84        };
85        notw_8_simd_f64(x_f64, sign);
86        return;
87    }
88    // Fallback to scalar for other types
89    super::notw_8(x, sign);
90}
91
92/// Size-16 DFT with automatic SIMD dispatch.
93///
94/// Uses scalar implementation with twiddle recurrence optimization.
95#[inline]
96pub fn notw_16_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
97    // Check if T is f64 at runtime
98    if TypeId::of::<T>() == TypeId::of::<f64>() {
99        // Safety: We verified T is f64, so the memory layout is identical
100        let x_f64 = unsafe {
101            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
102        };
103        notw_16_simd_f64(x_f64, sign);
104        return;
105    }
106    // Fallback to scalar for other types
107    super::notw_16(x, sign);
108}
109
110/// Size-32 DFT with automatic SIMD dispatch.
111///
112/// Uses scalar implementation with twiddle recurrence optimization.
113#[inline]
114pub fn notw_32_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
115    // Check if T is f64 at runtime
116    if TypeId::of::<T>() == TypeId::of::<f64>() {
117        // Safety: We verified T is f64, so the memory layout is identical
118        let x_f64 = unsafe {
119            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
120        };
121        notw_32_simd_f64(x_f64, sign);
122        return;
123    }
124    // Fallback to scalar for other types
125    super::notw_32(x, sign);
126}
127
128/// Size-64 DFT with automatic SIMD dispatch.
129///
130/// Uses scalar implementation with twiddle recurrence optimization.
131#[inline]
132pub fn notw_64_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
133    // Check if T is f64 at runtime
134    if TypeId::of::<T>() == TypeId::of::<f64>() {
135        // Safety: We verified T is f64, so the memory layout is identical
136        let x_f64 = unsafe {
137            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
138        };
139        notw_64_simd_f64(x_f64, sign);
140        return;
141    }
142    // Fallback to scalar for other types
143    super::notw_64(x, sign);
144}
145
146/// Size-128 DFT with automatic SIMD dispatch.
147///
148/// Uses scalar implementation with twiddle recurrence optimization.
149#[inline]
150pub fn notw_128_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
151    // Check if T is f64 at runtime
152    if TypeId::of::<T>() == TypeId::of::<f64>() {
153        // Safety: We verified T is f64, so the memory layout is identical
154        let x_f64 = unsafe {
155            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
156        };
157        notw_128_simd_f64(x_f64, sign);
158        return;
159    }
160    // Fallback to scalar for other types
161    super::notw_128(x, sign);
162}
163
164/// Size-256 DFT with automatic SIMD dispatch.
165///
166/// Uses scalar implementation with twiddle recurrence optimization.
167#[inline]
168pub fn notw_256_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
169    // Check if T is f64 at runtime
170    if TypeId::of::<T>() == TypeId::of::<f64>() {
171        // Safety: We verified T is f64, so the memory layout is identical
172        let x_f64 = unsafe {
173            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
174        };
175        notw_256_simd_f64(x_f64, sign);
176        return;
177    }
178    // Fallback to scalar for other types
179    super::notw_256(x, sign);
180}
181
182/// Size-512 DFT with automatic SIMD dispatch.
183///
184/// Uses iterative DIT with SIMD butterflies for optimal performance.
185#[inline]
186pub fn notw_512_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
187    // Check if T is f64 at runtime
188    if TypeId::of::<T>() == TypeId::of::<f64>() {
189        // Safety: We verified T is f64, so the memory layout is identical
190        let x_f64 = unsafe {
191            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
192        };
193        notw_512_simd_f64(x_f64, sign);
194        return;
195    }
196    // Fallback to iterative DIT for other types.
197    // Note: Must use execute_inplace directly to avoid infinite recursion,
198    // since CooleyTukeySolver::execute dispatches back to this codelet.
199    use crate::dft::problem::Sign;
200    use crate::dft::solvers::CooleyTukeySolver;
201    let sign_enum = if sign < 0 {
202        Sign::Forward
203    } else {
204        Sign::Backward
205    };
206    CooleyTukeySolver::default().execute_dit_inplace(x, sign_enum);
207}
208
209/// Size-1024 DFT with automatic SIMD dispatch.
210///
211/// Uses iterative DIT with SIMD butterflies for optimal performance.
212#[inline]
213pub fn notw_1024_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
214    // Check if T is f64 at runtime
215    if TypeId::of::<T>() == TypeId::of::<f64>() {
216        // Safety: We verified T is f64, so the memory layout is identical
217        let x_f64 = unsafe {
218            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
219        };
220        notw_1024_simd_f64(x_f64, sign);
221        return;
222    }
223    // Fallback to iterative DIT for other types.
224    // Note: Must use execute_inplace directly to avoid infinite recursion,
225    // since CooleyTukeySolver::execute dispatches back to this codelet.
226    use crate::dft::problem::Sign;
227    use crate::dft::solvers::CooleyTukeySolver;
228    let sign_enum = if sign < 0 {
229        Sign::Forward
230    } else {
231        Sign::Backward
232    };
233    CooleyTukeySolver::default().execute_dit_inplace(x, sign_enum);
234}
235
236/// Size-4096 DFT with automatic SIMD dispatch.
237///
238/// Uses iterative DIT with SIMD butterflies for optimal performance.
239#[inline]
240pub fn notw_4096_dispatch<T: Float>(x: &mut [Complex<T>], sign: i32) {
241    // Check if T is f64 at runtime
242    if TypeId::of::<T>() == TypeId::of::<f64>() {
243        // Safety: We verified T is f64, so the memory layout is identical
244        let x_f64 = unsafe {
245            core::slice::from_raw_parts_mut(x.as_mut_ptr().cast::<Complex<f64>>(), x.len())
246        };
247        notw_4096_simd_f64(x_f64, sign);
248        return;
249    }
250    // Fallback to iterative DIT for other types.
251    // Note: Must use execute_inplace directly to avoid infinite recursion,
252    // since CooleyTukeySolver::execute dispatches back to this codelet.
253    use crate::dft::problem::Sign;
254    use crate::dft::solvers::CooleyTukeySolver;
255    let sign_enum = if sign < 0 {
256        Sign::Forward
257    } else {
258        Sign::Backward
259    };
260    CooleyTukeySolver::default().execute_dit_inplace(x, sign_enum);
261}
oxifft/dft/codelets/simd/mod.rs

oxifft/dft/codelets/simd/
mod.rs