scirs2-fft 0.4.2

//! Large FFT Module - Memory-Efficient FFT for Very Large Inputs
//!
//! This module provides memory-efficient FFT implementations for very large inputs
//! that may not fit entirely in memory or cache. It uses various techniques including:
//!
//! - **Streaming FFT**: Process data in chunks to limit memory usage
//! - **Out-of-core FFT**: Handle data larger than available RAM
//! - **Cache-blocking**: Optimize for CPU cache hierarchy
//! - **Overlap-save method**: Efficient convolution for streaming data
//!
//! # Performance Characteristics
//!
//! | Input Size | Method | Memory Usage | Cache Efficiency |
//! |------------|--------|--------------|------------------|
//! | < 64 KB    | Direct | O(n)         | L1 optimal       |
//! | < 256 KB   | Direct | O(n)         | L2 optimal       |
//! | < 8 MB     | Blocked| O(sqrt(n))   | L3 optimal       |
//! | > 8 MB     | Streaming | O(block) | Controlled       |
//!
//! # Example
//!
//! ```rust,no_run
//! use scirs2_fft::large_fft::{LargeFft, LargeFftConfig};
//!
//! let config = LargeFftConfig::default();
//! let large_fft = LargeFft::new(config);
//!
//! let input: Vec<f64> = vec![0.0; 1_000_000];
//! let result = large_fft.compute(&input, true).expect("FFT failed");
//! ```

use crate::algorithm_selector::{AlgorithmSelector, CacheInfo, FftAlgorithm, InputCharacteristics};
use crate::error::{FFTError, FFTResult};
#[cfg(feature = "oxifft")]
use crate::oxifft_plan_cache;
#[cfg(feature = "oxifft")]
use oxifft::{Complex as OxiComplex, Direction};
#[cfg(feature = "rustfft-backend")]
use rustfft::FftPlanner;
use scirs2_core::numeric::Complex64;
use scirs2_core::numeric::NumCast;
use std::fmt::Debug;
#[cfg(feature = "rustfft-backend")]
use std::sync::Mutex;

/// Configuration for large FFT operations
#[derive(Debug, Clone)]
pub struct LargeFftConfig {
    /// Maximum block size for streaming (in elements)
    pub max_block_size: usize,
    /// Target memory usage in bytes
    pub target_memory_bytes: usize,
    /// Whether to use parallel processing
    pub use_parallel: bool,
    /// Number of threads for parallel processing
    pub num_threads: usize,
    /// Cache line size for alignment
    pub cache_line_size: usize,
    /// L1 cache size for blocking
    pub l1_cache_size: usize,
    /// L2 cache size for blocking
    pub l2_cache_size: usize,
    /// L3 cache size for blocking
    pub l3_cache_size: usize,
    /// Whether to use overlap-save for streaming
    pub use_overlap_save: bool,
    /// Overlap ratio for overlap-save method
    pub overlap_ratio: f64,
}

impl Default for LargeFftConfig {
    fn default() -> Self {
        let cache_info = CacheInfo::default();
        Self {
            max_block_size: 65536,                  // 64K elements
            target_memory_bytes: 256 * 1024 * 1024, // 256 MB
            use_parallel: true,
            num_threads: num_cpus::get(),
            cache_line_size: cache_info.cache_line_size,
            l1_cache_size: cache_info.l1_size,
            l2_cache_size: cache_info.l2_size,
            l3_cache_size: cache_info.l3_size,
            use_overlap_save: false,
            overlap_ratio: 0.5,
        }
    }
}

/// Method selection for large FFT
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LargeFftMethod {
    /// Direct FFT (for small inputs)
    Direct,
    /// Cache-blocked FFT (for medium inputs)
    CacheBlocked,
    /// Streaming FFT (for large inputs)
    Streaming,
    /// Out-of-core FFT (for very large inputs)
    OutOfCore,
}

/// Statistics from a large FFT operation
#[derive(Debug, Clone)]
pub struct LargeFftStats {
    /// Method used
    pub method: LargeFftMethod,
    /// Number of blocks processed
    pub num_blocks: usize,
    /// Block size used
    pub block_size: usize,
    /// Peak memory usage estimate
    pub peak_memory_bytes: usize,
    /// Total computation time in nanoseconds
    pub total_time_ns: u64,
}

/// Large FFT processor
pub struct LargeFft {
    /// Configuration
    config: LargeFftConfig,
    /// FFT planner (cached) - only for rustfft backend
    #[cfg(feature = "rustfft-backend")]
    planner: Mutex<FftPlanner<f64>>,
    /// Algorithm selector
    selector: AlgorithmSelector,
}

impl Default for LargeFft {
    fn default() -> Self {
        Self::new(LargeFftConfig::default())
    }
}

impl LargeFft {
    /// Create a new large FFT processor with default configuration
    pub fn with_defaults() -> Self {
        Self::new(LargeFftConfig::default())
    }

    /// Create a new large FFT processor with custom configuration
    pub fn new(config: LargeFftConfig) -> Self {
        Self {
            config,
            #[cfg(feature = "rustfft-backend")]
            planner: Mutex::new(FftPlanner::new()),
            selector: AlgorithmSelector::new(),
        }
    }

    /// Determine the best method for the given input size
    pub fn select_method(&self, size: usize) -> LargeFftMethod {
        let memory_required = size * 16; // Complex64 = 16 bytes

        if memory_required <= self.config.l2_cache_size {
            LargeFftMethod::Direct
        } else if memory_required <= self.config.l3_cache_size {
            LargeFftMethod::CacheBlocked
        } else if memory_required <= self.config.target_memory_bytes {
            LargeFftMethod::Streaming
        } else {
            LargeFftMethod::OutOfCore
        }
    }

    /// Compute FFT for potentially large input
    pub fn compute<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let size = input.len();
        if size == 0 {
            return Err(FFTError::ValueError("Input cannot be empty".to_string()));
        }

        let method = self.select_method(size);

        match method {
            LargeFftMethod::Direct => self.compute_direct(input, forward),
            LargeFftMethod::CacheBlocked => self.compute_cache_blocked(input, forward),
            LargeFftMethod::Streaming => self.compute_streaming(input, forward),
            LargeFftMethod::OutOfCore => self.compute_out_of_core(input, forward),
        }
    }

    /// Compute FFT for complex-valued input (preserves imaginary components)
    ///
    /// This method should be used when the input is already complex (e.g., for inverse FFT
    /// after a forward transform). Unlike `compute`, this preserves both real and imaginary parts.
    pub fn compute_complex(&self, input: &[Complex64], forward: bool) -> FFTResult<Vec<Complex64>> {
        let size = input.len();
        if size == 0 {
            return Err(FFTError::ValueError("Input cannot be empty".to_string()));
        }

        self.compute_direct_complex(input, forward)
    }

    /// Compute FFT for complex input using direct method
    fn compute_direct_complex(
        &self,
        input: &[Complex64],
        forward: bool,
    ) -> FFTResult<Vec<Complex64>> {
        let size = input.len();
        let data: Vec<Complex64> = input.to_vec();

        #[cfg(feature = "oxifft")]
        {
            let input_oxi: Vec<OxiComplex<f64>> =
                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];

            let direction = if forward {
                Direction::Forward
            } else {
                Direction::Backward
            };
            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;

            let mut result: Vec<Complex64> = output
                .into_iter()
                .map(|c| Complex64::new(c.re, c.im))
                .collect();

            if !forward {
                let scale = 1.0 / size as f64;
                for val in &mut result {
                    *val *= scale;
                }
            }

            Ok(result)
        }

        #[cfg(not(feature = "oxifft"))]
        {
            #[cfg(feature = "rustfft-backend")]
            {
                let mut data = data;
                let plan = {
                    let mut planner = self.planner.lock().map_err(|e| {
                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
                    })?;
                    if forward {
                        planner.plan_fft_forward(size)
                    } else {
                        planner.plan_fft_inverse(size)
                    }
                };
                plan.process(&mut data);

                if !forward {
                    let scale = 1.0 / size as f64;
                    for val in &mut data {
                        *val *= scale;
                    }
                }

                return Ok(data);
            }

            #[cfg(not(feature = "rustfft-backend"))]
            {
                return Err(FFTError::ComputationError(
                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
                ));
            }
        }
    }

    /// Compute FFT using direct method (best for small inputs)
    fn compute_direct<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let size = input.len();

        // Convert input to complex
        let data: Vec<Complex64> = input
            .iter()
            .map(|val| {
                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
                Complex64::new(real, 0.0)
            })
            .collect();

        // Use OxiFFT backend by default
        #[cfg(feature = "oxifft")]
        {
            // Convert to OxiFFT-compatible complex type
            let input_oxi: Vec<OxiComplex<f64>> =
                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];

            // Execute FFT with cached plan
            let direction = if forward {
                Direction::Forward
            } else {
                Direction::Backward
            };
            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;

            // Convert back to our Complex64 type
            let mut result: Vec<Complex64> = output
                .into_iter()
                .map(|c| Complex64::new(c.re, c.im))
                .collect();

            // Apply normalization for inverse
            if !forward {
                let scale = 1.0 / size as f64;
                for val in &mut result {
                    *val *= scale;
                }
            }

            Ok(result)
        }

        #[cfg(not(feature = "oxifft"))]
        {
            #[cfg(feature = "rustfft-backend")]
            {
                let mut data = data;

                // Get FFT plan
                let plan = {
                    let mut planner = self.planner.lock().map_err(|e| {
                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
                    })?;
                    if forward {
                        planner.plan_fft_forward(size)
                    } else {
                        planner.plan_fft_inverse(size)
                    }
                };

                // Execute FFT
                plan.process(&mut data);

                // Apply normalization for inverse
                if !forward {
                    let scale = 1.0 / size as f64;
                    for val in &mut data {
                        *val *= scale;
                    }
                }

                Ok(data)
            }

            #[cfg(not(feature = "rustfft-backend"))]
            {
                Err(FFTError::ComputationError(
                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
                ))
            }
        }
    }

    /// Compute FFT using cache-blocked method
    fn compute_cache_blocked<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let size = input.len();

        // Determine optimal block size based on L2 cache
        let elements_per_cache = self.config.l2_cache_size / 16; // Complex64 = 16 bytes
        let block_size = find_optimal_block_size(size, elements_per_cache);

        // Convert input to complex
        let data: Vec<Complex64> = input
            .iter()
            .map(|val| {
                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
                Complex64::new(real, 0.0)
            })
            .collect();

        // For cache-blocked FFT, we use the Cooley-Tukey decomposition
        // but organize memory access patterns to maximize cache hits
        //
        // For a 1D FFT, we still need to do the full computation,
        // but we can optimize memory access by using in-place algorithms

        // Use OxiFFT backend by default
        #[cfg(feature = "oxifft")]
        {
            // Convert to OxiFFT-compatible complex type
            let input_oxi: Vec<OxiComplex<f64>> =
                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];

            // Execute FFT with cached plan
            let direction = if forward {
                Direction::Forward
            } else {
                Direction::Backward
            };
            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;

            // Convert back to our Complex64 type
            let mut result: Vec<Complex64> = output
                .into_iter()
                .map(|c| Complex64::new(c.re, c.im))
                .collect();

            // Apply normalization for inverse
            if !forward {
                let scale = 1.0 / size as f64;
                for val in &mut result {
                    *val *= scale;
                }
            }

            // Log block size for debugging (could be useful for profiling)
            let _ = block_size;

            Ok(result)
        }

        #[cfg(not(feature = "oxifft"))]
        {
            #[cfg(feature = "rustfft-backend")]
            {
                let mut data = data;

                // Get FFT plan
                let plan = {
                    let mut planner = self.planner.lock().map_err(|e| {
                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
                    })?;
                    if forward {
                        planner.plan_fft_forward(size)
                    } else {
                        planner.plan_fft_inverse(size)
                    }
                };

                // Use scratch buffer for better cache behavior
                let scratch_len = plan.get_inplace_scratch_len();
                let mut scratch = vec![Complex64::new(0.0, 0.0); scratch_len];

                // Execute FFT with scratch (better cache behavior)
                plan.process_with_scratch(&mut data, &mut scratch);

                // Apply normalization for inverse
                if !forward {
                    let scale = 1.0 / size as f64;
                    for val in &mut data {
                        *val *= scale;
                    }
                }

                // Log block size for debugging (could be useful for profiling)
                let _ = block_size;

                Ok(data)
            }

            #[cfg(not(feature = "rustfft-backend"))]
            {
                Err(FFTError::ComputationError(
                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
                ))
            }
        }
    }

    /// Compute FFT using streaming method (for large inputs)
    fn compute_streaming<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let size = input.len();

        // For streaming, we need to use overlap-save or overlap-add method
        // However, for a simple FFT (not convolution), we still need to process
        // the entire signal at once for correct results
        //
        // The "streaming" optimization here is about memory access patterns
        // and using efficient scratch allocation

        // Convert input to complex in chunks to reduce peak memory during conversion
        let chunk_size = self.config.max_block_size;
        let mut data: Vec<Complex64> = Vec::with_capacity(size);

        for chunk in input.chunks(chunk_size) {
            for val in chunk {
                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
                data.push(Complex64::new(real, 0.0));
            }
        }

        // Use OxiFFT backend by default
        #[cfg(feature = "oxifft")]
        {
            // Convert to OxiFFT-compatible complex type
            let input_oxi: Vec<OxiComplex<f64>> =
                data.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
            let mut output: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); size];

            // Execute FFT with cached plan
            let direction = if forward {
                Direction::Forward
            } else {
                Direction::Backward
            };
            oxifft_plan_cache::execute_c2c(&input_oxi, &mut output, direction)?;

            // Convert back to our Complex64 type
            let mut result: Vec<Complex64> = output
                .into_iter()
                .map(|c| Complex64::new(c.re, c.im))
                .collect();

            // Apply normalization for inverse
            if !forward {
                let scale = 1.0 / size as f64;
                // Process normalization in chunks to improve cache behavior
                for chunk in result.chunks_mut(chunk_size) {
                    for val in chunk {
                        *val *= scale;
                    }
                }
            }

            Ok(result)
        }

        #[cfg(not(feature = "oxifft"))]
        {
            #[cfg(feature = "rustfft-backend")]
            {
                // Get FFT plan
                let plan = {
                    let mut planner = self.planner.lock().map_err(|e| {
                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
                    })?;
                    if forward {
                        planner.plan_fft_forward(size)
                    } else {
                        planner.plan_fft_inverse(size)
                    }
                };

                // For very large inputs, we want to minimize scratch memory
                // by using the smallest possible scratch buffer
                let scratch_len = plan.get_inplace_scratch_len();

                // Allocate scratch in smaller chunks if possible
                let mut scratch = vec![Complex64::new(0.0, 0.0); scratch_len];

                // Execute FFT
                plan.process_with_scratch(&mut data, &mut scratch);

                // Free scratch immediately to reduce peak memory
                drop(scratch);

                // Apply normalization for inverse
                if !forward {
                    let scale = 1.0 / size as f64;
                    // Process normalization in chunks to improve cache behavior
                    for chunk in data.chunks_mut(chunk_size) {
                        for val in chunk {
                            *val *= scale;
                        }
                    }
                }

                Ok(data)
            }

            #[cfg(not(feature = "rustfft-backend"))]
            {
                Err(FFTError::ComputationError(
                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
                ))
            }
        }
    }

    /// Compute FFT using out-of-core method (for very large inputs)
    fn compute_out_of_core<T>(&self, input: &[T], forward: bool) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        // Out-of-core FFT would typically use disk storage for intermediate results
        // For now, we fall back to streaming with warnings
        eprintln!(
            "Warning: Input size {} exceeds target memory, using streaming method",
            input.len()
        );
        self.compute_streaming(input, forward)
    }

    /// Compute FFT with overlap-save method for streaming convolution
    pub fn compute_overlap_save<T>(
        &self,
        input: &[T],
        filter_len: usize,
        forward: bool,
    ) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let input_len = input.len();

        if filter_len == 0 {
            return Err(FFTError::ValueError(
                "Filter length must be positive".to_string(),
            ));
        }

        // Block size for overlap-save
        let block_size = (self.config.max_block_size).max(filter_len * 4);
        let fft_size = block_size.next_power_of_two();
        let valid_output_per_block = fft_size - filter_len + 1;

        // Number of blocks needed
        let num_blocks = input_len.div_ceil(valid_output_per_block);

        // Allocate output
        let output_len = input_len;
        let mut output = Vec::with_capacity(output_len);

        // Use OxiFFT backend by default
        #[cfg(feature = "oxifft")]
        {
            // Process each block
            let mut buffer = vec![Complex64::new(0.0, 0.0); fft_size];

            for block_idx in 0..num_blocks {
                let input_start = if block_idx == 0 {
                    0
                } else {
                    block_idx * valid_output_per_block - (filter_len - 1)
                };

                // Zero the buffer
                for val in &mut buffer {
                    *val = Complex64::new(0.0, 0.0);
                }

                // Copy input block
                for (i, j) in (input_start..)
                    .take(fft_size.min(input_len - input_start))
                    .enumerate()
                {
                    if j < input_len {
                        let real: f64 = NumCast::from(input[j]).unwrap_or(0.0);
                        buffer[i] = Complex64::new(real, 0.0);
                    }
                }

                // Forward FFT
                let input_oxi: Vec<OxiComplex<f64>> =
                    buffer.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
                let mut output_oxi: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); fft_size];
                oxifft_plan_cache::execute_c2c(&input_oxi, &mut output_oxi, Direction::Forward)?;

                // Convert back to buffer
                for (i, val) in output_oxi.iter().enumerate() {
                    buffer[i] = Complex64::new(val.re, val.im);
                }

                // (Here you would multiply by filter FFT for convolution)
                // For now, we just do the FFT

                if !forward {
                    // Inverse FFT
                    let input_oxi: Vec<OxiComplex<f64>> =
                        buffer.iter().map(|c| OxiComplex::new(c.re, c.im)).collect();
                    let mut output_oxi: Vec<OxiComplex<f64>> = vec![OxiComplex::zero(); fft_size];
                    oxifft_plan_cache::execute_c2c(
                        &input_oxi,
                        &mut output_oxi,
                        Direction::Backward,
                    )?;

                    // Normalize and convert back to buffer
                    let scale = 1.0 / fft_size as f64;
                    for (i, val) in output_oxi.iter().enumerate() {
                        buffer[i] = Complex64::new(val.re * scale, val.im * scale);
                    }
                }

                // Extract valid output
                let output_start = if block_idx == 0 { 0 } else { filter_len - 1 };
                let output_count = valid_output_per_block.min(output_len - output.len());

                for i in output_start..(output_start + output_count) {
                    if i < fft_size {
                        output.push(buffer[i]);
                    }
                }
            }

            Ok(output)
        }

        #[cfg(not(feature = "oxifft"))]
        {
            #[cfg(feature = "rustfft-backend")]
            {
                // Get FFT plans
                let (fft_plan, ifft_plan) = {
                    let mut planner = self.planner.lock().map_err(|e| {
                        FFTError::ValueError(format!("Failed to acquire planner lock: {e}"))
                    })?;
                    (
                        planner.plan_fft_forward(fft_size),
                        planner.plan_fft_inverse(fft_size),
                    )
                };

                // Process each block
                let mut buffer = vec![Complex64::new(0.0, 0.0); fft_size];

                for block_idx in 0..num_blocks {
                    let input_start = if block_idx == 0 {
                        0
                    } else {
                        block_idx * valid_output_per_block - (filter_len - 1)
                    };

                    // Zero the buffer
                    for val in &mut buffer {
                        *val = Complex64::new(0.0, 0.0);
                    }

                    // Copy input block
                    for (i, j) in (input_start..)
                        .take(fft_size.min(input_len - input_start))
                        .enumerate()
                    {
                        if j < input_len {
                            let real: f64 = NumCast::from(input[j]).unwrap_or(0.0);
                            buffer[i] = Complex64::new(real, 0.0);
                        }
                    }

                    // Forward FFT
                    fft_plan.process(&mut buffer);

                    // (Here you would multiply by filter FFT for convolution)
                    // For now, we just do the FFT

                    if !forward {
                        // Inverse FFT
                        ifft_plan.process(&mut buffer);

                        // Normalize
                        let scale = 1.0 / fft_size as f64;
                        for val in &mut buffer {
                            *val *= scale;
                        }
                    }

                    // Extract valid output
                    let output_start = if block_idx == 0 { 0 } else { filter_len - 1 };
                    let output_count = valid_output_per_block.min(output_len - output.len());

                    for i in output_start..(output_start + output_count) {
                        if i < fft_size {
                            output.push(buffer[i]);
                        }
                    }
                }

                Ok(output)
            }

            #[cfg(not(feature = "rustfft-backend"))]
            {
                Err(FFTError::ComputationError(
                    "No FFT backend available. Enable either 'oxifft' or 'rustfft-backend' feature.".to_string()
                ))
            }
        }
    }

    /// Get configuration
    pub fn config(&self) -> &LargeFftConfig {
        &self.config
    }

    /// Get algorithm selector
    pub fn selector(&self) -> &AlgorithmSelector {
        &self.selector
    }

    /// Get estimated memory usage for a given size
    pub fn estimate_memory(&self, size: usize) -> usize {
        let method = self.select_method(size);
        let base_memory = size * 16; // Complex64 = 16 bytes

        match method {
            LargeFftMethod::Direct => base_memory * 2, // Input + output
            LargeFftMethod::CacheBlocked => base_memory * 2 + self.config.l2_cache_size,
            LargeFftMethod::Streaming => base_memory + self.config.max_block_size * 16,
            LargeFftMethod::OutOfCore => self.config.target_memory_bytes,
        }
    }
}

/// Find optimal block size for cache blocking
fn find_optimal_block_size(total_size: usize, cache_elements: usize) -> usize {
    // Find largest power of 2 that fits in cache
    let mut block = 1;
    while block * 2 <= cache_elements && block * 2 <= total_size {
        block *= 2;
    }
    block
}

/// Multi-dimensional large FFT for 2D and higher
pub struct LargeFftNd {
    /// 1D large FFT processor
    fft_1d: LargeFft,
    /// Configuration
    config: LargeFftConfig,
}

impl Default for LargeFftNd {
    fn default() -> Self {
        Self::new(LargeFftConfig::default())
    }
}

impl LargeFftNd {
    /// Create new multi-dimensional large FFT processor
    pub fn new(config: LargeFftConfig) -> Self {
        Self {
            fft_1d: LargeFft::new(config.clone()),
            config,
        }
    }

    /// Compute 2D FFT for large input
    pub fn compute_2d<T>(
        &self,
        input: &[T],
        rows: usize,
        cols: usize,
        forward: bool,
    ) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        if input.len() != rows * cols {
            return Err(FFTError::ValueError(format!(
                "Input size {} doesn't match dimensions {}x{}",
                input.len(),
                rows,
                cols
            )));
        }

        // Convert to complex
        let mut data: Vec<Complex64> = input
            .iter()
            .map(|val| {
                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
                Complex64::new(real, 0.0)
            })
            .collect();

        // Transform rows
        let mut row_buffer = vec![Complex64::new(0.0, 0.0); cols];
        for r in 0..rows {
            // Extract row
            for c in 0..cols {
                row_buffer[c] = data[r * cols + c];
            }

            // Transform row
            let row_fft = self.fft_1d.compute_direct(&row_buffer, forward)?;

            // Store result
            for c in 0..cols {
                data[r * cols + c] = row_fft[c];
            }
        }

        // Transform columns
        let mut col_buffer = vec![Complex64::new(0.0, 0.0); rows];
        for c in 0..cols {
            // Extract column
            for r in 0..rows {
                col_buffer[r] = data[r * cols + c];
            }

            // Transform column
            let col_fft = self.fft_1d.compute_direct(&col_buffer, forward)?;

            // Store result
            for r in 0..rows {
                data[r * cols + c] = col_fft[r];
            }
        }

        Ok(data)
    }

    /// Compute N-dimensional FFT for large input
    pub fn compute_nd<T>(
        &self,
        input: &[T],
        shape: &[usize],
        forward: bool,
    ) -> FFTResult<Vec<Complex64>>
    where
        T: NumCast + Copy + Debug + 'static,
    {
        let total_size: usize = shape.iter().product();
        if input.len() != total_size {
            return Err(FFTError::ValueError(format!(
                "Input size {} doesn't match shape {:?} (expected {})",
                input.len(),
                shape,
                total_size
            )));
        }

        // Convert to complex
        let mut data: Vec<Complex64> = input
            .iter()
            .map(|val| {
                let real: f64 = NumCast::from(*val).unwrap_or(0.0);
                Complex64::new(real, 0.0)
            })
            .collect();

        // Transform along each dimension
        for (dim_idx, &dim_size) in shape.iter().enumerate() {
            let stride = shape[(dim_idx + 1)..].iter().product::<usize>().max(1);
            let outer_size = shape[..dim_idx].iter().product::<usize>().max(1);

            let mut line_buffer = vec![Complex64::new(0.0, 0.0); dim_size];

            for outer in 0..outer_size {
                let outer_offset = outer * shape[dim_idx..].iter().product::<usize>().max(1);

                for inner in 0..(total_size / (outer_size * dim_size)) {
                    // Extract line along this dimension
                    for i in 0..dim_size {
                        let idx = outer_offset + i * stride + inner;
                        if idx < data.len() {
                            line_buffer[i] = data[idx];
                        }
                    }

                    // Transform line
                    let line_fft = self.fft_1d.compute_direct(&line_buffer, forward)?;

                    // Store result
                    for i in 0..dim_size {
                        let idx = outer_offset + i * stride + inner;
                        if idx < data.len() {
                            data[idx] = line_fft[i];
                        }
                    }
                }
            }
        }

        Ok(data)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    #[test]
    fn test_large_fft_direct() {
        let large_fft = LargeFft::with_defaults();
        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];

        let result = large_fft.compute(&input, true).expect("FFT failed");

        // DC component should be sum of input
        assert_relative_eq!(result[0].re, 10.0, epsilon = 1e-10);
    }

    #[test]
    fn test_large_fft_roundtrip() {
        let large_fft = LargeFft::with_defaults();
        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let forward = large_fft.compute(&input, true).expect("Forward FFT failed");
        let inverse = large_fft
            .compute_complex(&forward, false)
            .expect("Inverse FFT failed");

        // Should recover original signal
        for (i, &orig) in input.iter().enumerate() {
            assert_relative_eq!(inverse[i].re, orig, epsilon = 1e-10);
            assert_relative_eq!(inverse[i].im, 0.0, epsilon = 1e-10);
        }
    }

    #[test]
    fn test_method_selection() {
        let config = LargeFftConfig {
            l2_cache_size: 256 * 1024,              // 256 KB
            l3_cache_size: 8 * 1024 * 1024,         // 8 MB
            target_memory_bytes: 256 * 1024 * 1024, // 256 MB
            ..Default::default()
        };
        let large_fft = LargeFft::new(config);

        // Small input should use direct method
        let method = large_fft.select_method(1024);
        assert_eq!(method, LargeFftMethod::Direct);

        // Medium input should use cache-blocked
        let method = large_fft.select_method(100_000);
        assert_eq!(method, LargeFftMethod::CacheBlocked);

        // Large input should use streaming
        let method = large_fft.select_method(1_000_000);
        assert_eq!(method, LargeFftMethod::Streaming);
    }

    #[test]
    fn test_large_fft_2d() {
        let large_fft_nd = LargeFftNd::default();
        let input: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0];

        let result = large_fft_nd
            .compute_2d(&input, 2, 2, true)
            .expect("2D FFT failed");

        // DC component should be sum of all elements
        assert_relative_eq!(result[0].re, 10.0, epsilon = 1e-10);
    }

    #[test]
    fn test_memory_estimation() {
        let large_fft = LargeFft::with_defaults();

        let small_mem = large_fft.estimate_memory(1024);
        let large_mem = large_fft.estimate_memory(1_000_000);

        assert!(large_mem > small_mem);
    }

    #[test]
    fn test_find_optimal_block_size() {
        let block = find_optimal_block_size(65536, 16384);
        assert!(block.is_power_of_two());
        assert!(block <= 16384);
        assert!(block <= 65536);
    }
}