1use crate::error::{FFTError, FFTResult};
8use crate::fft::{fft, ifft};
9use rustfft::FftPlanner;
10use scirs2_core::ndarray::{Array, ArrayBase, Data};
11use scirs2_core::numeric::Complex64;
12use scirs2_core::numeric::NumCast;
13use std::collections::HashMap;
14use std::fmt::Debug;
15use std::sync::atomic::{AtomicUsize, Ordering};
16use std::sync::{Arc, Mutex};
17use std::time::{Duration, Instant};
18
19#[cfg(feature = "simd")]
21use scirs2_core::simd_ops::{
22 simd_add_f32_adaptive, simd_dot_f32_ultra, simd_fma_f32_ultra, simd_mul_f32_hyperoptimized,
23 PlatformCapabilities, SimdUnifiedOps,
24};
25
26#[cfg(feature = "parallel")]
27use scirs2_core::parallel_ops::*;
28
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
31pub enum OptimizationLevel {
32 Default,
34 Maximum,
36 Performance,
38 SizeSpecific,
40 Simd,
42 CacheEfficient,
44 Basic,
46 Balanced,
48 Auto,
50 TlbOptimized,
52 UltraOptimized,
54}
55
56#[derive(Debug, Clone)]
58pub struct PerformanceMetrics {
59 pub algorithm: String,
61
62 pub size: usize,
64
65 pub duration: Duration,
67
68 pub mflops: f64,
70
71 pub optimization_level: OptimizationLevel,
73}
74
75#[derive(Debug, Clone)]
77pub struct OptimizedConfig {
78 pub optimization_level: OptimizationLevel,
80 pub threads: Option<usize>,
82 pub use_simd: bool,
84 pub vectorized: bool,
86 pub collect_metrics: bool,
88 pub max_fft_size: usize,
90 pub enable_inplace: bool,
92 pub enable_multithreading: bool,
94 pub cache_line_size: usize,
96 pub l1_cache_size: usize,
98 pub l2_cache_size: usize,
100}
101
102impl Default for OptimizedConfig {
103 fn default() -> Self {
104 Self {
105 optimization_level: OptimizationLevel::Default,
106 threads: None,
107 use_simd: true,
108 vectorized: true,
109 collect_metrics: false,
110 max_fft_size: 1024, enable_inplace: true,
112 enable_multithreading: true,
113 cache_line_size: 64, l1_cache_size: 32 * 1024, l2_cache_size: 256 * 1024, }
117 }
118}
119
120pub struct OptimizedFFT {
122 config: OptimizedConfig,
124 stats: PerformanceStats,
126 collect_stats: bool,
128 #[allow(dead_code)]
130 metrics: Arc<Mutex<HashMap<(usize, OptimizationLevel), PerformanceMetrics>>>,
131
132 #[allow(dead_code)]
134 total_ffts: AtomicUsize,
135}
136
137#[derive(Debug, Default, Clone)]
139pub struct PerformanceStats {
140 pub operation_count: usize,
142 pub total_time_ns: u64,
144 pub max_time_ns: u64,
146 pub min_time_ns: u64,
148 pub total_flops: u64,
150}
151
152impl PerformanceStats {
153 pub fn avg_time_ns(&self) -> u64 {
155 if self.operation_count == 0 {
156 0
157 } else {
158 self.total_time_ns / self.operation_count as u64
159 }
160 }
161
162 pub fn avg_flops(&self) -> f64 {
164 if self.total_time_ns == 0 {
165 0.0
166 } else {
167 self.total_flops as f64 / (self.total_time_ns as f64 / 1_000_000_000.0)
168 }
169 }
170
171 pub fn reset(&mut self) {
173 *self = PerformanceStats::default();
174 }
175}
176
177impl OptimizedFFT {
178 pub fn new(config: OptimizedConfig) -> Self {
180 Self {
181 config,
182 stats: PerformanceStats::default(),
183 collect_stats: false,
184 metrics: Arc::new(Mutex::new(HashMap::new())),
185 total_ffts: AtomicUsize::new(0),
186 }
187 }
188
189 pub fn set_collect_stats(&mut self, enable: bool) {
191 self.collect_stats = enable;
192 }
193
194 pub fn get_stats(&self) -> &PerformanceStats {
196 &self.stats
197 }
198
199 pub fn reset_stats(&mut self) {
201 self.stats.reset();
202 }
203
204 pub fn get_metrics(&self, size: usize, level: OptimizationLevel) -> Option<PerformanceMetrics> {
206 if let Ok(db) = self.metrics.lock() {
207 db.get(&(size, level)).cloned()
208 } else {
209 None
210 }
211 }
212
213 pub fn get_all_metrics(&self) -> Vec<PerformanceMetrics> {
215 if let Ok(db) = self.metrics.lock() {
216 db.values().cloned().collect()
217 } else {
218 Vec::new()
219 }
220 }
221
222 #[allow(dead_code)]
224 fn compute_twiddle_factors(&self, size: usize) -> Vec<Complex64> {
225 let mut twiddles = Vec::with_capacity(size / 2);
226 let factor = -2.0 * std::f64::consts::PI / size as f64;
227
228 for k in 0..size / 2 {
229 let angle = factor * k as f64;
230 twiddles.push(Complex64::new(angle.cos(), angle.sin()));
231 }
232
233 twiddles
234 }
235
236 pub fn fft<T>(&mut self, input: &[T], n: Option<usize>) -> FFTResult<Vec<Complex64>>
238 where
239 T: NumCast + Copy + Debug,
240 {
241 let start = Instant::now();
242 let size = n.unwrap_or(input.len()).min(self.config.max_fft_size); let mut data: Vec<Complex64> = input
246 .iter()
247 .take(size) .map(|&val| {
249 let val_f64 = NumCast::from(val).ok_or_else(|| {
250 FFTError::ValueError(format!("Could not convert {:?} to f64", val))
251 });
252 match val_f64 {
253 Ok(v) => Ok(Complex64::new(v, 0.0)),
254 Err(e) => Err(e),
255 }
256 })
257 .collect::<FFTResult<Vec<_>>>()?;
258
259 match data.len().cmp(&size) {
261 std::cmp::Ordering::Less => {
262 data.resize(size, Complex64::new(0.0, 0.0));
263 }
264 std::cmp::Ordering::Greater => {
265 data.truncate(size);
266 }
267 std::cmp::Ordering::Equal => {
268 }
270 }
271
272 let algorithm = self.select_algorithm(size);
274
275 let result = match algorithm.as_str() {
277 "radix2" => self.radix2_fft(&mut data),
278 "bluestein" => self.bluestein_fft(&mut data),
279 "prime_factor" => self.prime_factor_fft(&mut data),
280 "default" => self.default_fft(&data),
281 _ => self.default_fft(&data),
282 }?;
283
284 if self.collect_stats {
286 let elapsed = start.elapsed();
287 let elapsed_ns = elapsed.as_nanos() as u64;
288 self.stats.operation_count += 1;
289 self.stats.total_time_ns += elapsed_ns;
290 self.stats.max_time_ns = self.stats.max_time_ns.max(elapsed_ns);
291 if self.stats.min_time_ns == 0 {
292 self.stats.min_time_ns = elapsed_ns;
293 } else {
294 self.stats.min_time_ns = self.stats.min_time_ns.min(elapsed_ns);
295 }
296
297 let flops = (5.0 * size as f64 * (size as f64).log2()) as u64;
299 self.stats.total_flops += flops;
300 }
301
302 if self.config.collect_metrics {
304 let duration = start.elapsed();
305 let op_count = 5.0 * size as f64 * (size as f64).log2(); let mflops = op_count / duration.as_secs_f64() / 1_000_000.0;
307
308 let metrics = PerformanceMetrics {
309 algorithm,
310 size,
311 duration,
312 mflops,
313 optimization_level: self.config.optimization_level,
314 };
315
316 if let Ok(mut db) = self.metrics.lock() {
317 db.insert((size, self.config.optimization_level), metrics);
318 }
319
320 self.total_ffts.fetch_add(1, Ordering::SeqCst);
321 }
322
323 Ok(result)
324 }
325
326 pub fn ifft(&mut self, input: &[Complex64], n: Option<usize>) -> FFTResult<Vec<Complex64>> {
328 let start = Instant::now();
329 let size = n.unwrap_or(input.len()).min(self.config.max_fft_size); let data: Vec<Complex64> = input.iter().take(size).copied().collect();
333
334 let algorithm = self.select_algorithm(size);
336
337 let result = match algorithm.as_str() {
339 "radix2" => self.radix2_ifft(&data),
340 "bluestein" => self.bluestein_ifft(&data),
341 "prime_factor" => self.prime_factor_ifft(&data),
342 _ => ifft(&data, Some(size)),
343 }?;
344
345 if self.config.collect_metrics {
347 let duration = start.elapsed();
348 let op_count = 5.0 * size as f64 * (size as f64).log2(); let mflops = op_count / duration.as_secs_f64() / 1_000_000.0;
350
351 let metrics = PerformanceMetrics {
352 algorithm,
353 size,
354 duration,
355 mflops,
356 optimization_level: self.config.optimization_level,
357 };
358
359 if let Ok(mut db) = self.metrics.lock() {
360 db.insert((size, self.config.optimization_level), metrics);
361 }
362
363 self.total_ffts.fetch_add(1, Ordering::SeqCst);
364 }
365
366 Ok(result)
367 }
368
369 fn select_algorithm(&self, size: usize) -> String {
371 match self.config.optimization_level {
372 OptimizationLevel::Default | OptimizationLevel::Basic => {
373 if size.is_power_of_two() {
375 "radix2".to_string()
376 } else {
377 "default".to_string()
378 }
379 }
380 OptimizationLevel::Balanced => {
381 if size.is_power_of_two() {
383 "radix2".to_string()
384 } else if size <= 1024 {
385 "bluestein".to_string()
386 } else {
387 "default".to_string()
388 }
389 }
390 OptimizationLevel::Maximum | OptimizationLevel::Performance => {
391 if size.is_power_of_two() {
393 "radix2".to_string()
394 } else if size % 2 != 0 && size % 3 != 0 && size % 5 != 0 {
395 "bluestein".to_string()
396 } else {
397 "prime_factor".to_string()
398 }
399 }
400 OptimizationLevel::Auto => {
401 if size.is_power_of_two() {
406 "radix2".to_string()
407 } else if size <= 1024 {
408 "bluestein".to_string()
409 } else if size % 2 == 0 || size % 3 == 0 || size % 5 == 0 {
410 "prime_factor".to_string()
411 } else {
412 "bluestein".to_string()
413 }
414 }
415 OptimizationLevel::SizeSpecific => {
416 if size.is_power_of_two() {
418 "radix2".to_string()
419 } else if size <= 16 {
420 "small_size".to_string()
421 } else {
422 "default".to_string()
423 }
424 }
425 OptimizationLevel::Simd => {
426 "simd".to_string()
428 }
429 OptimizationLevel::CacheEfficient => {
430 "cache_efficient".to_string()
432 }
433 OptimizationLevel::TlbOptimized => {
434 if size.is_power_of_two() {
436 "radix2_tlb".to_string()
437 } else {
438 "default".to_string()
439 }
440 }
441 OptimizationLevel::UltraOptimized => {
442 if size.is_power_of_two() {
444 "radix2_ultra".to_string()
445 } else {
446 "default".to_string()
447 }
448 }
449 }
450 }
451
452 fn default_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
454 let mut planner = FftPlanner::new();
455 let fft = planner.plan_fft_forward(input.len());
456
457 let mut buffer = input.to_vec();
458 fft.process(&mut buffer);
459
460 Ok(buffer)
461 }
462
463 pub fn benchmark_sizes(
465 &mut self,
466 min_size: usize,
467 max_size: usize,
468 step: usize,
469 ) -> FFTResult<HashMap<usize, PerformanceMetrics>> {
470 let mut results = HashMap::new();
471
472 let original_collect = self.config.collect_metrics;
474 self.config.collect_metrics = true;
475
476 let actual_max = max_size.min(self.config.max_fft_size);
478
479 for size in (min_size..=actual_max).step_by(step) {
480 let data: Vec<f64> = (0..size).map(|i| (i as f64).sin()).collect();
482
483 let start = Instant::now();
485 let _ = self.fft(&data, Some(size))?;
486 let duration = start.elapsed();
487
488 let op_count = 5.0 * size as f64 * (size as f64).log2();
490 let mflops = op_count / duration.as_secs_f64() / 1_000_000.0;
491
492 let algorithm = self.select_algorithm(size);
494 let metrics = PerformanceMetrics {
495 algorithm,
496 size,
497 duration,
498 mflops,
499 optimization_level: self.config.optimization_level,
500 };
501
502 results.insert(size, metrics);
503 }
504
505 self.config.collect_metrics = original_collect;
507
508 Ok(results)
509 }
510
511 fn radix2_fft(&self, data: &mut [Complex64]) -> FFTResult<Vec<Complex64>> {
514 match self.config.optimization_level {
515 OptimizationLevel::TlbOptimized | OptimizationLevel::UltraOptimized => {
516 #[cfg(feature = "simd")]
517 {
518 self.radix2_fft_tlb_optimized(data)
519 }
520 #[cfg(not(feature = "simd"))]
521 {
522 fft(data, None)
524 }
525 }
526 OptimizationLevel::Simd => {
527 #[cfg(feature = "simd")]
528 {
529 self.radix2_fft_simd_optimized(data)
530 }
531 #[cfg(not(feature = "simd"))]
532 {
533 fft(data, None)
535 }
536 }
537 _ => {
538 fft(data, None)
540 }
541 }
542 }
543
544 fn bluestein_fft(&self, data: &mut [Complex64]) -> FFTResult<Vec<Complex64>> {
545 fft(data, None)
548 }
549
550 fn prime_factor_fft(&self, data: &mut [Complex64]) -> FFTResult<Vec<Complex64>> {
551 fft(data, None)
554 }
555
556 fn radix2_ifft(&self, data: &[Complex64]) -> FFTResult<Vec<Complex64>> {
557 ifft(data, None)
560 }
561
562 fn bluestein_ifft(&self, data: &[Complex64]) -> FFTResult<Vec<Complex64>> {
563 ifft(data, None)
566 }
567
568 fn prime_factor_ifft(&self, data: &[Complex64]) -> FFTResult<Vec<Complex64>> {
569 ifft(data, None)
572 }
573
574 #[allow(dead_code)]
576 fn maximum_optimized_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
577 self.default_fft(input)
580 }
581
582 #[allow(dead_code)]
584 fn size_specific_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
585 let n = input.len();
586
587 if n.is_power_of_two() {
589 return self.power_of_two_fft(input);
590 }
591
592 if n <= 16 {
594 return self.small_size_fft(input);
595 }
596
597 self.default_fft(input)
599 }
600
601 #[allow(dead_code)]
603 fn power_of_two_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
604 self.default_fft(input)
608 }
609
610 #[allow(dead_code)]
612 fn small_size_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
613 self.default_fft(input)
617 }
618
619 #[allow(dead_code)]
621 fn simd_optimized_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
622 #[cfg(any(target_feature = "sse", target_feature = "avx"))]
623 {
624 }
627
628 self.default_fft(input)
630 }
631
632 #[allow(dead_code)]
634 fn cache_efficient_fft(&self, input: &[Complex64]) -> FFTResult<Vec<Complex64>> {
635 self.default_fft(input)
638 }
639
640 pub fn fft2<S>(
642 &mut self,
643 input: &ArrayBase<S, scirs2_core::ndarray::Ix2>,
644 ) -> FFTResult<Array<Complex64, scirs2_core::ndarray::Ix2>>
645 where
646 S: Data,
647 S::Elem: NumCast + Copy + Debug,
648 {
649 let shape = input.shape();
651
652 let rows = shape[0].min(self.config.max_fft_size / 2);
654 let cols = shape[1].min(self.config.max_fft_size / 2);
655
656 let mut output = Array::zeros((rows, cols));
658
659 for i in 0..rows {
661 let row: Vec<_> = input
662 .slice(scirs2_core::ndarray::s![i, ..cols])
663 .iter()
664 .map(|&val| {
665 let val_f64 = NumCast::from(val).ok_or_else(|| {
666 FFTError::ValueError("Could not convert to f64".to_string())
667 })?;
668 Ok(Complex64::new(val_f64, 0.0))
669 })
670 .collect::<FFTResult<Vec<_>>>()?;
671
672 let row_fft = self.fft(&row, None)?;
673 for (j, val) in row_fft.iter().enumerate().take(cols) {
674 output[[i, j]] = *val;
675 }
676 }
677
678 for j in 0..cols {
680 let mut col = Vec::with_capacity(rows);
681 for i in 0..rows {
682 col.push(output[[i, j]]);
683 }
684
685 let col_fft = self.fft(&col, None)?;
686 for (i, val) in col_fft.iter().enumerate().take(rows) {
687 output[[i, j]] = *val;
688 }
689 }
690
691 Ok(output)
694 }
695
696 #[allow(dead_code)]
698 fn detect_cpu_features(&self) -> Vec<String> {
699 vec![
702 "sse".to_string(),
703 "sse2".to_string(),
704 "sse3".to_string(),
705 "sse4.1".to_string(),
706 "avx".to_string(),
707 ]
708 }
709
710 pub fn suggest_optimal_size(&self, requestedsize: usize) -> usize {
712 let next_pow2 = requestedsize.next_power_of_two();
714
715 if requestedsize.is_power_of_two() {
720 return requestedsize;
721 }
722
723 if next_pow2 < requestedsize * 2 {
725 return next_pow2;
726 }
727
728 let mut best_size = requestedsize;
730 let mut best_score = usize::MAX;
731
732 for size in requestedsize..=next_pow2 {
734 let score = self.complexity_score(size);
736
737 if score < best_score {
738 best_score = score;
739 best_size = size;
740 }
741 }
742
743 best_size
744 }
745
746 fn complexity_score(&self, n: usize) -> usize {
749 if n.is_power_of_two() {
750 return 0;
752 }
753
754 let mut factors = 0;
756 let mut remaining = n;
757 let mut i = 2;
758
759 while i * i <= remaining {
760 while remaining % i == 0 {
761 factors += 1;
762 remaining /= i;
763 }
764 i += 1;
765 }
766
767 if remaining > 1 {
768 factors += 1;
769 }
770
771 factors * 100 + n.count_ones() as usize * 10
773 }
774
775 #[cfg(feature = "simd")]
790 fn radix2_fft_tlb_optimized(&self, data: &mut [Complex64]) -> FFTResult<Vec<Complex64>> {
791 let caps = PlatformCapabilities::detect();
792 let n = data.len();
793
794 if n >= 512 && caps.has_avx2() && n.is_power_of_two() {
796 self.radix2_fft_ultra_optimized(data, caps)
797 } else if n >= 64 && caps.simd_available {
798 self.radix2_fft_cache_optimized(data, caps)
799 } else {
800 fft(data, None)
802 }
803 }
804
805 #[cfg(feature = "simd")]
807 fn radix2_fft_ultra_optimized(
808 &self,
809 data: &mut [Complex64],
810 caps: PlatformCapabilities,
811 ) -> FFTResult<Vec<Complex64>> {
812 let n = data.len();
813 let mut result = data.to_vec();
814
815 let page_size = 4096; let tlb_entries = 64; let optimal_working_set = page_size * tlb_entries / 2; let complex_size = std::mem::size_of::<Complex64>();
822 let elements_per_page = page_size / complex_size;
823 let block_size = (optimal_working_set / complex_size).min(n / 4).max(64);
824
825 if n >= block_size * 4 {
827 self.radix2_fft_blocked_tlb_optimized(&mut result, block_size, caps)
829 } else {
830 self.radix2_fft_cache_optimized_impl(&mut result, caps)
832 }
833 }
834
835 #[cfg(feature = "simd")]
837 fn radix2_fft_blocked_tlb_optimized(
838 &self,
839 data: &mut [Complex64],
840 block_size: usize,
841 caps: PlatformCapabilities,
842 ) -> FFTResult<Vec<Complex64>> {
843 let n = data.len();
844
845 self.bit_reverse_tlb_optimized(data, block_size);
847
848 let mut step = 2;
850 while step <= n {
851 let half_step = step / 2;
852
853 for block_start in (0..n).step_by(block_size) {
855 let block_end = (block_start + block_size).min(n);
856
857 for i in (block_start..block_end).step_by(step) {
859 if i + half_step < block_end {
860 self.butterfly_operation_simd_ultra(
861 &mut data[i..i + step],
862 half_step,
863 caps,
864 );
865 }
866 }
867 }
868 step *= 2;
869 }
870
871 Ok(data.to_vec())
872 }
873
874 #[cfg(feature = "simd")]
876 fn radix2_fft_cache_optimized(
877 &self,
878 data: &mut [Complex64],
879 caps: PlatformCapabilities,
880 ) -> FFTResult<Vec<Complex64>> {
881 let mut result = data.to_vec();
882 self.radix2_fft_cache_optimized_impl(&mut result, caps)
883 }
884
885 #[cfg(feature = "simd")]
887 fn radix2_fft_cache_optimized_impl(
888 &self,
889 data: &mut [Complex64],
890 caps: PlatformCapabilities,
891 ) -> FFTResult<Vec<Complex64>> {
892 let n = data.len();
893
894 self.bit_reverse_cache_aware(data, caps.cache_line_size());
896
897 let mut step = 2;
899 while step <= n {
900 let half_step = step / 2;
901
902 let cache_chunk_size = caps.cache_line_size() / std::mem::size_of::<Complex64>();
904
905 for i in (0..n).step_by(cache_chunk_size) {
906 let chunk_end = (i + cache_chunk_size).min(n);
907
908 for j in (i..chunk_end).step_by(step) {
909 if j + half_step < n {
910 self.butterfly_operation_simd_optimized(
911 &mut data[j..j + step],
912 half_step,
913 caps,
914 );
915 }
916 }
917 }
918 step *= 2;
919 }
920
921 Ok(data.to_vec())
922 }
923
924 #[cfg(feature = "simd")]
926 fn radix2_fft_simd_optimized(&self, data: &mut [Complex64]) -> FFTResult<Vec<Complex64>> {
927 let caps = PlatformCapabilities::detect();
928 let n = data.len();
929
930 if !n.is_power_of_two() {
931 return Err(FFTError::ValueError(
932 "Radix-2 FFT requires power-of-2 size".to_string(),
933 ));
934 }
935
936 let mut result = data.to_vec();
937
938 self.bit_reverse_simd(&mut result);
940
941 let mut step = 2;
943 while step <= n {
944 let half_step = step / 2;
945
946 for i in (0..n).step_by(step) {
947 self.butterfly_operation_simd(&mut result[i..i + step], half_step, caps);
948 }
949 step *= 2;
950 }
951
952 Ok(result)
953 }
954
955 #[cfg(feature = "simd")]
961 fn bit_reverse_tlb_optimized(&self, data: &mut [Complex64], block_size: usize) {
962 let n = data.len();
963 let log_n = (n as f64).log2() as usize;
964
965 for block_start in (0..n).step_by(block_size) {
967 let block_end = (block_start + block_size).min(n);
968
969 for i in block_start..block_end {
970 let mut reversed = 0;
971 let mut temp = i;
972
973 for _ in 0..log_n {
975 reversed = (reversed << 1) | (temp & 1);
976 temp >>= 1;
977 }
978
979 if reversed > i && reversed < n {
980 data.swap(i, reversed);
981 }
982 }
983 }
984 }
985
986 #[cfg(feature = "simd")]
988 fn bit_reverse_cache_aware(&self, data: &mut [Complex64], cache_line_size: usize) {
989 let n = data.len();
990 let log_n = (n as f64).log2() as usize;
991 let chunk_size = cache_line_size / std::mem::size_of::<Complex64>();
992
993 for chunk_start in (0..n).step_by(chunk_size) {
994 let chunk_end = (chunk_start + chunk_size).min(n);
995
996 for i in chunk_start..chunk_end {
997 let reversed = self.reverse_bits(i, log_n);
998 if reversed > i && reversed < n {
999 data.swap(i, reversed);
1000 }
1001 }
1002 }
1003 }
1004
1005 #[cfg(feature = "simd")]
1007 fn bit_reverse_simd(&self, data: &mut [Complex64]) {
1008 let n = data.len();
1009 let log_n = (n as f64).log2() as usize;
1010
1011 for i in (0..n).step_by(8) {
1013 let end = (i + 8).min(n);
1014
1015 for j in i..end {
1016 let reversed = self.reverse_bits(j, log_n);
1017 if reversed > j && reversed < n {
1018 data.swap(j, reversed);
1019 }
1020 }
1021 }
1022 }
1023
1024 #[cfg(feature = "simd")]
1026 fn butterfly_operation_simd_ultra(
1027 &self,
1028 data: &mut [Complex64],
1029 half_step: usize,
1030 caps: PlatformCapabilities,
1031 ) {
1032 if data.len() >= half_step * 2 && caps.has_avx2() {
1033 self.butterfly_simd_avx2_ultra(data, half_step);
1035 } else {
1036 self.butterfly_operation_scalar(data, half_step);
1038 }
1039 }
1040
1041 #[cfg(feature = "simd")]
1043 fn butterfly_operation_simd_optimized(
1044 &self,
1045 data: &mut [Complex64],
1046 half_step: usize,
1047 caps: PlatformCapabilities,
1048 ) {
1049 if caps.simd_available && data.len() >= 8 {
1050 self.butterfly_simd_optimized(data, half_step);
1051 } else {
1052 self.butterfly_operation_scalar(data, half_step);
1053 }
1054 }
1055
1056 #[cfg(feature = "simd")]
1058 fn butterfly_operation_simd(
1059 &self,
1060 data: &mut [Complex64],
1061 half_step: usize,
1062 caps: PlatformCapabilities,
1063 ) {
1064 if caps.simd_available {
1065 self.butterfly_simd_basic(data, half_step);
1066 } else {
1067 self.butterfly_operation_scalar(data, half_step);
1068 }
1069 }
1070
1071 #[cfg(feature = "simd")]
1073 fn butterfly_simd_avx2_ultra(&self, data: &mut [Complex64], half_step: usize) {
1074 for i in 0..half_step {
1079 if i + half_step < data.len() {
1080 let w = self
1081 .complex_exp(-2.0 * std::f64::consts::PI * i as f64 / (2 * half_step) as f64);
1082 let temp = data[i + half_step] * w;
1083 data[i + half_step] = data[i] - temp;
1084 data[i] = data[i] + temp;
1085 }
1086 }
1087 }
1088
1089 #[cfg(feature = "simd")]
1091 fn butterfly_simd_optimized(&self, data: &mut [Complex64], half_step: usize) {
1092 let chunk_size = 8; for chunk_start in (0..half_step).step_by(chunk_size) {
1096 let chunk_end = (chunk_start + chunk_size).min(half_step);
1097
1098 for i in chunk_start..chunk_end {
1099 if i + half_step < data.len() {
1100 let w = self.complex_exp(
1101 -2.0 * std::f64::consts::PI * i as f64 / (2 * half_step) as f64,
1102 );
1103 let temp = data[i + half_step] * w;
1104 data[i + half_step] = data[i] - temp;
1105 data[i] = data[i] + temp;
1106 }
1107 }
1108 }
1109 }
1110
1111 #[cfg(feature = "simd")]
1113 fn butterfly_simd_basic(&self, data: &mut [Complex64], half_step: usize) {
1114 for i in 0..half_step {
1115 if i + half_step < data.len() {
1116 let w = self
1117 .complex_exp(-2.0 * std::f64::consts::PI * i as f64 / (2 * half_step) as f64);
1118 let temp = data[i + half_step] * w;
1119 data[i + half_step] = data[i] - temp;
1120 data[i] = data[i] + temp;
1121 }
1122 }
1123 }
1124
1125 fn butterfly_operation_scalar(&self, data: &mut [Complex64], half_step: usize) {
1127 for i in 0..half_step {
1128 if i + half_step < data.len() {
1129 let w = self
1130 .complex_exp(-2.0 * std::f64::consts::PI * i as f64 / (2 * half_step) as f64);
1131 let temp = data[i + half_step] * w;
1132 data[i + half_step] = data[i] - temp;
1133 data[i] = data[i] + temp;
1134 }
1135 }
1136 }
1137
1138 fn reverse_bits(&self, mut n: usize, bits: usize) -> usize {
1140 let mut result = 0;
1141 for _ in 0..bits {
1142 result = (result << 1) | (n & 1);
1143 n >>= 1;
1144 }
1145 result
1146 }
1147
1148 fn complex_exp(&self, angle: f64) -> Complex64 {
1150 Complex64::new(angle.cos(), angle.sin())
1151 }
1152}
1153
1154#[cfg(test)]
1155#[cfg(feature = "never")] mod tests {
1157 use super::*;
1158 use approx::assert_relative_eq;
1159
1160 #[test]
1161 fn test_optimized_fft_simple() {
1162 let config = OptimizedConfig::default();
1163 let mut fft = OptimizedFFT::new(config);
1164
1165 let input = vec![1.0, 0.0, 0.0, 0.0];
1167 let output = fft.fft(&input, None).expect("Operation failed");
1168
1169 assert_eq!(output.len(), 4);
1170 for val in &output {
1171 assert_relative_eq!(val.re, 1.0, epsilon = 1e-10);
1172 assert_relative_eq!(val.im, 0.0, epsilon = 1e-10);
1173 }
1174 }
1175
1176 #[test]
1177 fn test_stats_collection() {
1178 let config = OptimizedConfig::default();
1179 let mut fft = OptimizedFFT::new(config);
1180 fft.set_collect_stats(true);
1181
1182 let input = vec![1.0, 2.0, 3.0, 4.0];
1184 for _ in 0..5 {
1185 let _ = fft.fft(&input, None).expect("Operation failed");
1186 }
1187
1188 let stats = fft.get_stats();
1189 assert_eq!(stats.operation_count, 5);
1190 assert!(stats.total_time_ns > 0);
1191 assert!(stats.avg_time_ns() > 0);
1192 }
1193
1194 #[test]
1195 fn test_suggest_optimal_size() {
1196 let config = OptimizedConfig::default();
1197 let fft = OptimizedFFT::new(config);
1198
1199 assert_eq!(fft.suggest_optimal_size(64), 64);
1201
1202 let size_100 = fft.suggest_optimal_size(100);
1204 assert!(size_100 >= 100); }
1206
1207 #[test]
1208 fn test_different_optimization_levels() {
1209 let input = vec![1.0, 2.0, 3.0, 4.0];
1210
1211 let levels = [
1212 OptimizationLevel::Default,
1213 OptimizationLevel::Maximum,
1214 OptimizationLevel::SizeSpecific,
1215 OptimizationLevel::Simd,
1216 OptimizationLevel::CacheEfficient,
1217 OptimizationLevel::Basic,
1218 OptimizationLevel::Balanced,
1219 OptimizationLevel::Auto,
1220 ];
1221
1222 for level in &levels {
1223 let config = OptimizedConfig {
1224 optimization_level: *level,
1225 ..OptimizedConfig::default()
1226 };
1227
1228 let mut fft = OptimizedFFT::new(config);
1229 let result = fft.fft(&input, None);
1230 assert!(
1231 result.is_ok(),
1232 "FFT failed with optimization level {:?}",
1233 level
1234 );
1235 }
1236 }
1237}