Skip to main content

scirs2_fft/
large_fft.rs

1//! Large FFT Module - Memory-Efficient FFT for Very Large Inputs
2//!
3//! This module provides memory-efficient FFT implementations for very large inputs
4//! that may not fit entirely in memory or cache. It uses various techniques including:
5//!
6//! - **Streaming FFT**: Process data in chunks to limit memory usage
7//! - **Out-of-core FFT**: Handle data larger than available RAM
8//! - **Cache-blocking**: Optimize for CPU cache hierarchy
9//! - **Overlap-save method**: Efficient convolution for streaming data
10//!
11//! # Performance Characteristics
12//!
13//! | Input Size | Method | Memory Usage | Cache Efficiency |
14//! |------------|--------|--------------|------------------|
15//! | < 64 KB    | Direct | O(n)         | L1 optimal       |
16//! | < 256 KB   | Direct | O(n)         | L2 optimal       |
17//! | < 8 MB     | Blocked| O(sqrt(n))   | L3 optimal       |
18//! | > 8 MB     | Streaming | O(block) | Controlled       |
19//!
20//! # Example
21//!
22//! ```rust,no_run
23//! use scirs2_fft::large_fft::{LargeFft, LargeFftConfig};
24//!
25//! let config = LargeFftConfig::default();
26//! let large_fft = LargeFft::new(config);
27//!
28//! let input: Vec<f64> = vec![0.0; 1_000_000];
29//! let result = large_fft.compute(&input, true).expect("FFT failed");
30//! ```
31
32use crate::algorithm_selector::{AlgorithmSelector, CacheInfo, FftAlgorithm, InputCharacteristics};
33use crate::error::{FFTError, FFTResult};
34#[cfg(feature = "oxifft")]
35use crate::oxifft_plan_cache;
36#[cfg(feature = "oxifft")]
37use oxifft::{Complex as OxiComplex, Direction};
38#[cfg(feature = "rustfft-backend")]
39use rustfft::FftPlanner;
40use scirs2_core::numeric::Complex64;
41use scirs2_core::numeric::NumCast;
42use std::fmt::Debug;
43#[cfg(feature = "rustfft-backend")]
44use std::sync::Mutex;
45
46/// Configuration for large FFT operations
47#[derive(Debug, Clone)]
48pub struct LargeFftConfig {
49    /// Maximum block size for streaming (in elements)
50    pub max_block_size: usize,
51    /// Target memory usage in bytes
52    pub target_memory_bytes: usize,
53    /// Whether to use parallel processing
54    pub use_parallel: bool,
55    /// Number of threads for parallel processing
56    pub num_threads: usize,
57    /// Cache line size for alignment
58    pub cache_line_size: usize,
59    /// L1 cache size for blocking
60    pub l1_cache_size: usize,
61    /// L2 cache size for blocking
62    pub l2_cache_size: usize,
63    /// L3 cache size for blocking
64    pub l3_cache_size: usize,
65    /// Whether to use overlap-save for streaming
66    pub use_overlap_save: bool,
67    /// Overlap ratio for overlap-save method
68    pub overlap_ratio: f64,
69}
70
71impl Default for LargeFftConfig {
72    fn default() -> Self {
73        let cache_info = CacheInfo::default();
74        Self {
75            max_block_size: 65536,                  // 64K elements
76            target_memory_bytes: 256 * 1024 * 1024, // 256 MB
77            use_parallel: true,
78            num_threads: num_cpus::get(),
79            cache_line_size: cache_info.cache_line_size,
80            l1_cache_size: cache_info.l1_size,
81            l2_cache_size: cache_info.l2_size,
82            l3_cache_size: cache_info.l3_size,
83            use_overlap_save: false,
84            overlap_ratio: 0.5,
85        }
86    }
87}
88
89/// Method selection for large FFT
90#[derive(Debug, Clone, Copy, PartialEq, Eq)]
91pub enum LargeFftMethod {
92    /// Direct FFT (for small inputs)
93    Direct,
94    /// Cache-blocked FFT (for medium inputs)
95    CacheBlocked,
96    /// Streaming FFT (for large inputs)
97    Streaming,
98    /// Out-of-core FFT (for very large inputs)
99    OutOfCore,
100}
101
102/// Statistics from a large FFT operation
103#[derive(Debug, Clone)]
104pub struct LargeFftStats {
105    /// Method used
106    pub method: LargeFftMethod,
107    /// Number of blocks processed
108    pub num_blocks: usize,
109    /// Block size used
110    pub block_size: usize,
111    /// Peak memory usage estimate
112    pub peak_memory_bytes: usize,
113    /// Total computation time in nanoseconds
114    pub total_time_ns: u64,
115}
116
117/// Large FFT processor
118pub struct LargeFft {
119    /// Configuration
120    config: LargeFftConfig,
121    /// FFT planner (cached) - only for rustfft backend
122    #[cfg(feature = "rustfft-backend")]
123    planner: Mutex<FftPlanner<f64>>,
124    /// Algorithm selector
125    selector: AlgorithmSelector,
126}
127
128impl Default for LargeFft {
129    fn default() -> Self {
130        Self::new(LargeFftConfig::default())
131    }
132}
133
134impl LargeFft {
135    /// Create a new large FFT processor with default configuration
136    pub fn with_defaults() -> Self {
137        Self::new(LargeFftConfig::default())
138    }
139
140    /// Create a new large FFT processor with custom configuration
141    pub fn new(config: LargeFftConfig) -> Self {
142        Self {
143            config,
144            #[cfg(feature = "rustfft-backend")]
145            planner: Mutex::new(FftPlanner::new()),
146            selector: AlgorithmSelector::new(),
147        }
148    }
149
150    /// Determine the best method for the given input size
151    pub fn select_method(&self, size: usize) -> LargeFftMethod {
152        let memory_required = size * 16; // Complex64 = 16 bytes
153
154        if memory_required <= self.config.l2_cache_size {
155            LargeFftMethod::Direct
156        } else if memory_required <= self.config.l3_cache_size {
157            LargeFftMethod::CacheBlocked
158        } else if memory_required <= self.config.target_memory_bytes {
159            LargeFftMethod::Streaming
160        } else {
161            LargeFftMethod::OutOfCore
162        }
163    }
164
165    /// Compute FFT for potentially large input
166    pub fn compute<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
167    where
168        T: NumCast + Copy + Debug + 'static,
169    {
170        let size = input.len();
171        if size == 0 {
172            return Err(FFTError::ValueError("Input cannot be empty".to_string()));
173        }
174
175        let method = self.select_method(size);
176
177        match method {
178            LargeFftMethod::Direct => self.compute_direct(input, forward),
179            LargeFftMethod::CacheBlocked => self.compute_cache_blocked(input, forward),
180            LargeFftMethod::Streaming => self.compute_streaming(input, forward),
181            LargeFftMethod::OutOfCore => self.compute_out_of_core(input, forward),
182        }
183    }
184
185    /// Compute FFT for complex-valued input (preserves imaginary components)
186    ///
187    /// This method should be used when the input is already complex (e.g., for inverse FFT
188    /// after a forward transform). Unlike `compute`, this preserves both real and imaginary parts.
189    pub fn compute_complex(&self, input: &[Complex64], forward: bool) -> FFTResult<Vec<Complex64>> {
190        let size = input.len();
191        if size == 0 {
192            return Err(FFTError::ValueError("Input cannot be empty".to_string()));
193        }
194
195        self.compute_direct_complex(input, forward)
196    }
197
198    /// Compute FFT for complex input using direct method
199    fn compute_direct_complex(
200        &self,
201        input: &[Complex64],
202        forward: bool,
203    ) -> FFTResult<Vec<Complex64>> {
204        let size = input.len();
205        let data: Vec<Complex64> = input.to_vec();
206
207        #[cfg(feature = "oxifft")]
208        {
209            let input_oxi: Vec<OxiComplex<f64>> =
210                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
211            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];
212
213            let direction = if forward {
214                Direction::Forward
215            } else {
216                Direction::Backward
217            };
218            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;
219
220            let mut result: Vec<Complex64> = output
221                .into_iter()
222                .map(|c| Complex64::new(c.re, c.im))
223                .collect();
224
225            if !forward {
226                let scale = 1.0 / size as f64;
227                for val in &mut result {
228                    *val *= scale;
229                }
230            }
231
232            Ok(result)
233        }
234
235        #[cfg(not(feature = "oxifft"))]
236        {
237            #[cfg(feature = "rustfft-backend")]
238            {
239                let mut data = data;
240                let plan = {
241                    let mut planner = self.planner.lock().map_err(|e| {
242                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
243                    })?;
244                    if forward {
245                        planner.plan_fft_forward(size)
246                    } else {
247                        planner.plan_fft_inverse(size)
248                    }
249                };
250                plan.process(&mut data);
251
252                if !forward {
253                    let scale = 1.0 / size as f64;
254                    for val in &mut data {
255                        *val *= scale;
256                    }
257                }
258
259                return Ok(data);
260            }
261
262            #[cfg(not(feature = "rustfft-backend"))]
263            {
264                return Err(FFTError::ComputationError(
265                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
266                ));
267            }
268        }
269    }
270
271    /// Compute FFT using direct method (best for small inputs)
272    fn compute_direct<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
273    where
274        T: NumCast + Copy + Debug + 'static,
275    {
276        let size = input.len();
277
278        // Convert input to complex
279        let data: Vec<Complex64> = input
280            .iter()
281            .map(|val| {
282                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
283                Complex64::new(real, 0.0)
284            })
285            .collect();
286
287        // Use OxiFFT backend by default
288        #[cfg(feature = "oxifft")]
289        {
290            // Convert to OxiFFT-compatible complex type
291            let input_oxi: Vec<OxiComplex<f64>> =
292                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
293            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];
294
295            // Execute FFT with cached plan
296            let direction = if forward {
297                Direction::Forward
298            } else {
299                Direction::Backward
300            };
301            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;
302
303            // Convert back to our Complex64 type
304            let mut result: Vec<Complex64> = output
305                .into_iter()
306                .map(|c| Complex64::new(c.re, c.im))
307                .collect();
308
309            // Apply normalization for inverse
310            if !forward {
311                let scale = 1.0 / size as f64;
312                for val in &mut result {
313                    *val *= scale;
314                }
315            }
316
317            Ok(result)
318        }
319
320        #[cfg(not(feature = "oxifft"))]
321        {
322            #[cfg(feature = "rustfft-backend")]
323            {
324                let mut data = data;
325
326                // Get FFT plan
327                let plan = {
328                    let mut planner = self.planner.lock().map_err(|e| {
329                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
330                    })?;
331                    if forward {
332                        planner.plan_fft_forward(size)
333                    } else {
334                        planner.plan_fft_inverse(size)
335                    }
336                };
337
338                // Execute FFT
339                plan.process(&mut data);
340
341                // Apply normalization for inverse
342                if !forward {
343                    let scale = 1.0 / size as f64;
344                    for val in &mut data {
345                        *val *= scale;
346                    }
347                }
348
349                Ok(data)
350            }
351
352            #[cfg(not(feature = "rustfft-backend"))]
353            {
354                Err(FFTError::ComputationError(
355                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
356                ))
357            }
358        }
359    }
360
361    /// Compute FFT using cache-blocked method
362    fn compute_cache_blocked<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
363    where
364        T: NumCast + Copy + Debug + 'static,
365    {
366        let size = input.len();
367
368        // Determine optimal block size based on L2 cache
369        let elements_per_cache = self.config.l2_cache_size / 16; // Complex64 = 16 bytes
370        let block_size = find_optimal_block_size(size, elements_per_cache);
371
372        // Convert input to complex
373        let data: Vec<Complex64> = input
374            .iter()
375            .map(|val| {
376                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
377                Complex64::new(real, 0.0)
378            })
379            .collect();
380
381        // For cache-blocked FFT, we use the Cooley-Tukey decomposition
382        // but organize memory access patterns to maximize cache hits
383        //
384        // For a 1D FFT, we still need to do the full computation,
385        // but we can optimize memory access by using in-place algorithms
386
387        // Use OxiFFT backend by default
388        #[cfg(feature = "oxifft")]
389        {
390            // Convert to OxiFFT-compatible complex type
391            let input_oxi: Vec<OxiComplex<f64>> =
392                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
393            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];
394
395            // Execute FFT with cached plan
396            let direction = if forward {
397                Direction::Forward
398            } else {
399                Direction::Backward
400            };
401            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;
402
403            // Convert back to our Complex64 type
404            let mut result: Vec<Complex64> = output
405                .into_iter()
406                .map(|c| Complex64::new(c.re, c.im))
407                .collect();
408
409            // Apply normalization for inverse
410            if !forward {
411                let scale = 1.0 / size as f64;
412                for val in &mut result {
413                    *val *= scale;
414                }
415            }
416
417            // Log block size for debugging (could be useful for profiling)
418            let _ = block_size;
419
420            Ok(result)
421        }
422
423        #[cfg(not(feature = "oxifft"))]
424        {
425            #[cfg(feature = "rustfft-backend")]
426            {
427                let mut data = data;
428
429                // Get FFT plan
430                let plan = {
431                    let mut planner = self.planner.lock().map_err(|e| {
432                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
433                    })?;
434                    if forward {
435                        planner.plan_fft_forward(size)
436                    } else {
437                        planner.plan_fft_inverse(size)
438                    }
439                };
440
441                // Use scratch buffer for better cache behavior
442                let scratch_len = plan.get_inplace_scratch_len();
443                let mut scratch = vec![Complex64::new(0.0, 0.0); scratch_len];
444
445                // Execute FFT with scratch (better cache behavior)
446                plan.process_with_scratch(&mut data, &mut scratch);
447
448                // Apply normalization for inverse
449                if !forward {
450                    let scale = 1.0 / size as f64;
451                    for val in &mut data {
452                        *val *= scale;
453                    }
454                }
455
456                // Log block size for debugging (could be useful for profiling)
457                let _ = block_size;
458
459                Ok(data)
460            }
461
462            #[cfg(not(feature = "rustfft-backend"))]
463            {
464                Err(FFTError::ComputationError(
465                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
466                ))
467            }
468        }
469    }
470
471    /// Compute FFT using streaming method (for large inputs)
472    fn compute_streaming<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
473    where
474        T: NumCast + Copy + Debug + 'static,
475    {
476        let size = input.len();
477
478        // For streaming, we need to use overlap-save or overlap-add method
479        // However, for a simple FFT (not convolution), we still need to process
480        // the entire signal at once for correct results
481        //
482        // The "streaming" optimization here is about memory access patterns
483        // and using efficient scratch allocation
484
485        // Convert input to complex in chunks to reduce peak memory during conversion
486        let chunk_size = self.config.max_block_size;
487        let mut data: Vec<Complex64> = Vec::with_capacity(size);
488
489        for chunk in input.chunks(chunk_size) {
490            for val in chunk {
491                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
492                data.push(Complex64::new(real, 0.0));
493            }
494        }
495
496        // Use OxiFFT backend by default
497        #[cfg(feature = "oxifft")]
498        {
499            // Convert to OxiFFT-compatible complex type
500            let input_oxi: Vec<OxiComplex<f64>> =
501                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
502            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];
503
504            // Execute FFT with cached plan
505            let direction = if forward {
506                Direction::Forward
507            } else {
508                Direction::Backward
509            };
510            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;
511
512            // Convert back to our Complex64 type
513            let mut result: Vec<Complex64> = output
514                .into_iter()
515                .map(|c| Complex64::new(c.re, c.im))
516                .collect();
517
518            // Apply normalization for inverse
519            if !forward {
520                let scale = 1.0 / size as f64;
521                // Process normalization in chunks to improve cache behavior
522                for chunk in result.chunks_mut(chunk_size) {
523                    for val in chunk {
524                        *val *= scale;
525                    }
526                }
527            }
528
529            Ok(result)
530        }
531
532        #[cfg(not(feature = "oxifft"))]
533        {
534            #[cfg(feature = "rustfft-backend")]
535            {
536                // Get FFT plan
537                let plan = {
538                    let mut planner = self.planner.lock().map_err(|e| {
539                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
540                    })?;
541                    if forward {
542                        planner.plan_fft_forward(size)
543                    } else {
544                        planner.plan_fft_inverse(size)
545                    }
546                };
547
548                // For very large inputs, we want to minimize scratch memory
549                // by using the smallest possible scratch buffer
550                let scratch_len = plan.get_inplace_scratch_len();
551
552                // Allocate scratch in smaller chunks if possible
553                let mut scratch = vec![Complex64::new(0.0, 0.0); scratch_len];
554
555                // Execute FFT
556                plan.process_with_scratch(&mut data, &mut scratch);
557
558                // Free scratch immediately to reduce peak memory
559                drop(scratch);
560
561                // Apply normalization for inverse
562                if !forward {
563                    let scale = 1.0 / size as f64;
564                    // Process normalization in chunks to improve cache behavior
565                    for chunk in data.chunks_mut(chunk_size) {
566                        for val in chunk {
567                            *val *= scale;
568                        }
569                    }
570                }
571
572                Ok(data)
573            }
574
575            #[cfg(not(feature = "rustfft-backend"))]
576            {
577                Err(FFTError::ComputationError(
578                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
579                ))
580            }
581        }
582    }
583
584    /// Compute FFT using out-of-core method (for very large inputs)
585    fn compute_out_of_core<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
586    where
587        T: NumCast + Copy + Debug + 'static,
588    {
589        // Out-of-core FFT would typically use disk storage for intermediate results
590        // For now, we fall back to streaming with warnings
591        eprintln!(
592            "Warning: Input size {} exceeds target memory, using streaming method",
593            input.len()
594        );
595        self.compute_streaming(input, forward)
596    }
597
598    /// Compute FFT with overlap-save method for streaming convolution
599    pub fn compute_overlap_save<T>(
600        &self,
601        input: &[T],
602        filter_len: usize,
603        forward: bool,
604    ) -> FFTResult<Vec<Complex64>>
605    where
606        T: NumCast + Copy + Debug + 'static,
607    {
608        let input_len = input.len();
609
610        if filter_len == 0 {
611            return Err(FFTError::ValueError(
612                "Filter length must be positive".to_string(),
613            ));
614        }
615
616        // Block size for overlap-save
617        let block_size = (self.config.max_block_size).max(filter_len * 4);
618        let fft_size = block_size.next_power_of_two();
619        let valid_output_per_block = fft_size - filter_len + 1;
620
621        // Number of blocks needed
622        let num_blocks = input_len.div_ceil(valid_output_per_block);
623
624        // Allocate output
625        let output_len = input_len;
626        let mut output = Vec::with_capacity(output_len);
627
628        // Use OxiFFT backend by default
629        #[cfg(feature = "oxifft")]
630        {
631            // Process each block
632            let mut buffer = vec![Complex64::new(0.0, 0.0); fft_size];
633
634            for block_idx in 0..num_blocks {
635                let input_start = if block_idx == 0 {
636                    0
637                } else {
638                    block_idx * valid_output_per_block - (filter_len - 1)
639                };
640
641                // Zero the buffer
642                for val in &mut buffer {
643                    *val = Complex64::new(0.0, 0.0);
644                }
645
646                // Copy input block
647                for (i, j) in (input_start..)
648                    .take(fft_size.min(input_len - input_start))
649                    .enumerate()
650                {
651                    if j < input_len {
652                        let real: f64 = NumCast::from(input[j]).unwrap_or(0.0);
653                        buffer[i] = Complex64::new(real, 0.0);
654                    }
655                }
656
657                // Forward FFT
658                let input_oxi: Vec<OxiComplex<f64>> =
659                    buffer.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
660                let mut output_oxi: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); fft_size];
661                oxifft_plan_cache::execute_c2c(&input_oxi, &mut output_oxi, Direction::Forward)?;
662
663                // Convert back to buffer
664                for (i, val) in output_oxi.iter().enumerate() {
665                    buffer[i] = Complex64::new(val.re, val.im);
666                }
667
668                // (Here you would multiply by filter FFT for convolution)
669                // For now, we just do the FFT
670
671                if !forward {
672                    // Inverse FFT
673                    let input_oxi: Vec<OxiComplex<f64>> =
674                        buffer.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
675                    let mut output_oxi: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); fft_size];
676                    oxifft_plan_cache::execute_c2c(
677                        &input_oxi,
678                        &mut output_oxi,
679                        Direction::Backward,
680                    )?;
681
682                    // Normalize and convert back to buffer
683                    let scale = 1.0 / fft_size as f64;
684                    for (i, val) in output_oxi.iter().enumerate() {
685                        buffer[i] = Complex64::new(val.re * scale, val.im * scale);
686                    }
687                }
688
689                // Extract valid output
690                let output_start = if block_idx == 0 { 0 } else { filter_len - 1 };
691                let output_count = valid_output_per_block.min(output_len - output.len());
692
693                for i in output_start..(output_start + output_count) {
694                    if i < fft_size {
695                        output.push(buffer[i]);
696                    }
697                }
698            }
699
700            Ok(output)
701        }
702
703        #[cfg(not(feature = "oxifft"))]
704        {
705            #[cfg(feature = "rustfft-backend")]
706            {
707                // Get FFT plans
708                let (fft_plan, ifft_plan) = {
709                    let mut planner = self.planner.lock().map_err(|e| {
710                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
711                    })?;
712                    (
713                        planner.plan_fft_forward(fft_size),
714                        planner.plan_fft_inverse(fft_size),
715                    )
716                };
717
718                // Process each block
719                let mut buffer = vec![Complex64::new(0.0, 0.0); fft_size];
720
721                for block_idx in 0..num_blocks {
722                    let input_start = if block_idx == 0 {
723                        0
724                    } else {
725                        block_idx * valid_output_per_block - (filter_len - 1)
726                    };
727
728                    // Zero the buffer
729                    for val in &mut buffer {
730                        *val = Complex64::new(0.0, 0.0);
731                    }
732
733                    // Copy input block
734                    for (i, j) in (input_start..)
735                        .take(fft_size.min(input_len - input_start))
736                        .enumerate()
737                    {
738                        if j < input_len {
739                            let real: f64 = NumCast::from(input[j]).unwrap_or(0.0);
740                            buffer[i] = Complex64::new(real, 0.0);
741                        }
742                    }
743
744                    // Forward FFT
745                    fft_plan.process(&mut buffer);
746
747                    // (Here you would multiply by filter FFT for convolution)
748                    // For now, we just do the FFT
749
750                    if !forward {
751                        // Inverse FFT
752                        ifft_plan.process(&mut buffer);
753
754                        // Normalize
755                        let scale = 1.0 / fft_size as f64;
756                        for val in &mut buffer {
757                            *val *= scale;
758                        }
759                    }
760
761                    // Extract valid output
762                    let output_start = if block_idx == 0 { 0 } else { filter_len - 1 };
763                    let output_count = valid_output_per_block.min(output_len - output.len());
764
765                    for i in output_start..(output_start + output_count) {
766                        if i < fft_size {
767                            output.push(buffer[i]);
768                        }
769                    }
770                }
771
772                Ok(output)
773            }
774
775            #[cfg(not(feature = "rustfft-backend"))]
776            {
777                Err(FFTError::ComputationError(
778                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
779                ))
780            }
781        }
782    }
783
784    /// Get configuration
785    pub fn config(&self) -> &LargeFftConfig {
786        &self.config
787    }
788
789    /// Get algorithm selector
790    pub fn selector(&self) -> &AlgorithmSelector {
791        &self.selector
792    }
793
794    /// Get estimated memory usage for a given size
795    pub fn estimate_memory(&self, size: usize) -> usize {
796        let method = self.select_method(size);
797        let base_memory = size * 16; // Complex64 = 16 bytes
798
799        match method {
800            LargeFftMethod::Direct => base_memory * 2, // Input + output
801            LargeFftMethod::CacheBlocked => base_memory * 2 + self.config.l2_cache_size,
802            LargeFftMethod::Streaming => base_memory + self.config.max_block_size * 16,
803            LargeFftMethod::OutOfCore => self.config.target_memory_bytes,
804        }
805    }
806}
807
808/// Find optimal block size for cache blocking
809fn find_optimal_block_size(total_size: usize, cache_elements: usize) -> usize {
810    // Find largest power of 2 that fits in cache
811    let mut block = 1;
812    while block * 2 <= cache_elements && block * 2 <= total_size {
813        block *= 2;
814    }
815    block
816}
817
818/// Multi-dimensional large FFT for 2D and higher
819pub struct LargeFftNd {
820    /// 1D large FFT processor
821    fft_1d: LargeFft,
822    /// Configuration
823    config: LargeFftConfig,
824}
825
826impl Default for LargeFftNd {
827    fn default() -> Self {
828        Self::new(LargeFftConfig::default())
829    }
830}
831
832impl LargeFftNd {
833    /// Create new multi-dimensional large FFT processor
834    pub fn new(config: LargeFftConfig) -> Self {
835        Self {
836            fft_1d: LargeFft::new(config.clone()),
837            config,
838        }
839    }
840
841    /// Compute 2D FFT for large input
842    pub fn compute_2d<T>(
843        &self,
844        input: &[T],
845        rows: usize,
846        cols: usize,
847        forward: bool,
848    ) -> FFTResult<Vec<Complex64>>
849    where
850        T: NumCast + Copy + Debug + 'static,
851    {
852        if input.len() != rows * cols {
853            return Err(FFTError::ValueError(format!(
854                "Input size {} doesn't match dimensions {}x{}",
855                input.len(),
856                rows,
857                cols
858            )));
859        }
860
861        // Convert to complex
862        let mut data: Vec<Complex64> = input
863            .iter()
864            .map(|val| {
865                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
866                Complex64::new(real, 0.0)
867            })
868            .collect();
869
870        // Transform rows
871        let mut row_buffer = vec![Complex64::new(0.0, 0.0); cols];
872        for r in 0..rows {
873            // Extract row
874            for c in 0..cols {
875                row_buffer[c] = data[r * cols + c];
876            }
877
878            // Transform row
879            let row_fft = self.fft_1d.compute_direct(&row_buffer, forward)?;
880
881            // Store result
882            for c in 0..cols {
883                data[r * cols + c] = row_fft[c];
884            }
885        }
886
887        // Transform columns
888        let mut col_buffer = vec![Complex64::new(0.0, 0.0); rows];
889        for c in 0..cols {
890            // Extract column
891            for r in 0..rows {
892                col_buffer[r] = data[r * cols + c];
893            }
894
895            // Transform column
896            let col_fft = self.fft_1d.compute_direct(&col_buffer, forward)?;
897
898            // Store result
899            for r in 0..rows {
900                data[r * cols + c] = col_fft[r];
901            }
902        }
903
904        Ok(data)
905    }
906
907    /// Compute N-dimensional FFT for large input
908    pub fn compute_nd<T>(
909        &self,
910        input: &[T],
911        shape: &[usize],
912        forward: bool,
913    ) -> FFTResult<Vec<Complex64>>
914    where
915        T: NumCast + Copy + Debug + 'static,
916    {
917        let total_size: usize = shape.iter().product();
918        if input.len() != total_size {
919            return Err(FFTError::ValueError(format!(
920                "Input size {} doesn't match shape {:?} (expected {})",
921                input.len(),
922                shape,
923                total_size
924            )));
925        }
926
927        // Convert to complex
928        let mut data: Vec<Complex64> = input
929            .iter()
930            .map(|val| {
931                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
932                Complex64::new(real, 0.0)
933            })
934            .collect();
935
936        // Transform along each dimension
937        for (dim_idx, &dim_size) in shape.iter().enumerate() {
938            let stride = shape[(dim_idx + 1)..].iter().product::<usize>().max(1);
939            let outer_size = shape[..dim_idx].iter().product::<usize>().max(1);
940
941            let mut line_buffer = vec![Complex64::new(0.0, 0.0); dim_size];
942
943            for outer in 0..outer_size {
944                let outer_offset = outer * shape[dim_idx..].iter().product::<usize>().max(1);
945
946                for inner in 0..(total_size / (outer_size * dim_size)) {
947                    // Extract line along this dimension
948                    for i in 0..dim_size {
949                        let idx = outer_offset + i * stride + inner;
950                        if idx < data.len() {
951                            line_buffer[i] = data[idx];
952                        }
953                    }
954
955                    // Transform line
956                    let line_fft = self.fft_1d.compute_direct(&line_buffer, forward)?;
957
958                    // Store result
959                    for i in 0..dim_size {
960                        let idx = outer_offset + i * stride + inner;
961                        if idx < data.len() {
962                            data[idx] = line_fft[i];
963                        }
964                    }
965                }
966            }
967        }
968
969        Ok(data)
970    }
971}
972
973#[cfg(test)]
974mod tests {
975    use super::*;
976    use approx::assert_relative_eq;
977
978    #[test]
979    fn test_large_fft_direct() {
980        let large_fft = LargeFft::with_defaults();
981        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];
982
983        let result = large_fft.compute(&input, true).expect("FFT failed");
984
985        // DC component should be sum of input
986        assert_relative_eq!(result[0].re, 10.0, epsilon = 1e-10);
987    }
988
989    #[test]
990    fn test_large_fft_roundtrip() {
991        let large_fft = LargeFft::with_defaults();
992        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
993
994        let forward = large_fft.compute(&input, true).expect("Forward FFT failed");
995        let inverse = large_fft
996            .compute_complex(&forward, false)
997            .expect("Inverse FFT failed");
998
999        // Should recover original signal
1000        for (i, &orig) in input.iter().enumerate() {
1001            assert_relative_eq!(inverse[i].re, orig, epsilon = 1e-10);
1002            assert_relative_eq!(inverse[i].im, 0.0, epsilon = 1e-10);
1003        }
1004    }
1005
1006    #[test]
1007    fn test_method_selection() {
1008        let config = LargeFftConfig {
1009            l2_cache_size: 256 * 1024,              // 256 KB
1010            l3_cache_size: 8 * 1024 * 1024,         // 8 MB
1011            target_memory_bytes: 256 * 1024 * 1024, // 256 MB
1012            ..Default::default()
1013        };
1014        let large_fft = LargeFft::new(config);
1015
1016        // Small input should use direct method
1017        let method = large_fft.select_method(1024);
1018        assert_eq!(method, LargeFftMethod::Direct);
1019
1020        // Medium input should use cache-blocked
1021        let method = large_fft.select_method(100_000);
1022        assert_eq!(method, LargeFftMethod::CacheBlocked);
1023
1024        // Large input should use streaming
1025        let method = large_fft.select_method(1_000_000);
1026        assert_eq!(method, LargeFftMethod::Streaming);
1027    }
1028
1029    #[test]
1030    fn test_large_fft_2d() {
1031        let large_fft_nd = LargeFftNd::default();
1032        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];
1033
1034        let result = large_fft_nd
1035            .compute_2d(&input, 2, 2, true)
1036            .expect("2D FFT failed");
1037
1038        // DC component should be sum of all elements
1039        assert_relative_eq!(result[0].re, 10.0, epsilon = 1e-10);
1040    }
1041
1042    #[test]
1043    fn test_memory_estimation() {
1044        let large_fft = LargeFft::with_defaults();
1045
1046        let small_mem = large_fft.estimate_memory(1024);
1047        let large_mem = large_fft.estimate_memory(1_000_000);
1048
1049        assert!(large_mem > small_mem);
1050    }
1051
1052    #[test]
1053    fn test_find_optimal_block_size() {
1054        let block = find_optimal_block_size(65536, 16384);
1055        assert!(block.is_power_of_two());
1056        assert!(block <= 16384);
1057        assert!(block <= 65536);
1058    }
1059}