torsh-backend 0.1.2

//! Advanced Parallel Processing Optimization Engine
//!
//! This module provides enterprise-grade parallel processing optimizations using SciRS2,
//! featuring intelligent thread pool management, work-stealing optimization, load balancing,
//! NUMA-aware scheduling, cache-conscious task distribution, and adaptive parallel strategies
//! to maximize CPU utilization and minimize synchronization overhead.
//!
//! ✅ SciRS2 POLICY: Uses scirs2_core::parallel_ops instead of direct rayon

// Framework infrastructure - components designed for future use
#![allow(dead_code)]
use crate::error::{BackendError, BackendResult};
// ✅ SciRS2 POLICY: Use scirs2_core::parallel_ops instead of direct rayon
use scirs2_core::parallel_ops::*;
use std::collections::HashMap;
use std::sync::{atomic::AtomicU64, Arc, Mutex, RwLock};
use std::time::{Duration, Instant};
use torsh_core::TensorElement;

// For ndarray operations, use a simplified approach without external dependencies
type Array1<T> = Vec<T>;
type Array2<T> = Vec<Vec<T>>;
type ArrayView1<'a, T> = &'a [T];
type ArrayView2<'a, T> = &'a [&'a [T]];
type ArrayViewMut1<'a, T> = &'a mut [T];
type ArrayViewMut2<'a, T> = &'a mut [&'a mut [T]];

/// Advanced Rayon parallel processing optimization coordinator
#[derive(Debug)]
pub struct AdvancedRayonOptimizer {
    /// Intelligent thread pool manager
    thread_pool_manager: Arc<Mutex<IntelligentThreadPoolManager>>,

    /// Work-stealing optimization engine
    work_stealing_optimizer: Arc<Mutex<WorkStealingOptimizer>>,

    /// Load balancing coordinator
    load_balancer: Arc<Mutex<AdaptiveLoadBalancer>>,

    /// NUMA-aware scheduler
    numa_scheduler: Arc<Mutex<NumaAwareScheduler>>,

    /// Cache-conscious task distributor
    cache_optimizer: Arc<Mutex<CacheConsciousTaskDistributor>>,

    /// Parallel strategy selector
    strategy_selector: Arc<Mutex<ParallelStrategySelector>>,

    /// Performance monitor
    performance_monitor: Arc<Mutex<ParallelPerformanceMonitor>>,

    /// Configuration
    config: RayonOptimizationConfig,

    /// Current active thread pools
    thread_pools: Arc<RwLock<HashMap<ThreadPoolProfile, Arc<ThreadPool>>>>,

    /// Statistics
    statistics: Arc<Mutex<ParallelProcessingStatistics>>,
}

/// Intelligent thread pool management system
#[derive(Debug)]
pub struct IntelligentThreadPoolManager {
    /// CPU topology analyzer
    cpu_topology: CpuTopologyAnalyzer,

    /// Thread affinity manager
    affinity_manager: ThreadAffinityManager,

    /// Pool configuration optimizer
    config_optimizer: PoolConfigurationOptimizer,

    /// Thread lifecycle manager
    lifecycle_manager: ThreadLifecycleManager,

    /// Performance predictor
    performance_predictor: ThreadPoolPerformancePredictor,

    /// Active pools registry
    active_pools: HashMap<ThreadPoolProfile, PoolMetadata>,

    /// Configuration
    config: ThreadPoolManagementConfig,
}

/// Work-stealing optimization engine
#[derive(Debug)]
pub struct WorkStealingOptimizer {
    /// Deque sizing optimizer
    deque_optimizer: DequeOptimizer,

    /// Victim selection strategy
    victim_selector: VictimSelectionStrategy,

    /// Task granularity analyzer
    granularity_analyzer: TaskGranularityAnalyzer,

    /// Steal strategy optimizer
    steal_strategy: StealStrategyOptimizer,

    /// Contention resolver
    contention_resolver: ContentionResolver,

    /// Configuration
    config: WorkStealingConfig,

    /// Statistics
    stealing_stats: WorkStealingStatistics,
}

/// Adaptive load balancing coordinator
#[derive(Debug)]
pub struct AdaptiveLoadBalancer {
    /// Workload distribution analyzer
    distribution_analyzer: WorkloadDistributionAnalyzer,

    /// Load prediction engine
    load_predictor: LoadPredictionEngine,

    /// Dynamic rebalancing system
    rebalancer: DynamicRebalancer,

    /// Task migration manager
    migration_manager: TaskMigrationManager,

    /// Load monitoring system
    load_monitor: LoadMonitoringSystem,

    /// Configuration
    config: LoadBalancingConfig,

    /// Current load state
    load_state: LoadBalancingState,
}

/// NUMA-aware scheduling system
#[derive(Debug)]
pub struct NumaAwareScheduler {
    /// NUMA topology detector
    numa_topology: NumaTopologyDetector,

    /// Memory affinity optimizer
    memory_affinity: MemoryAffinityOptimizer,

    /// Cross-NUMA communication minimizer
    communication_minimizer: CrossNumaCommunicationMinimizer,

    /// Local memory allocation prioritizer
    local_allocation_prioritizer: LocalAllocationPrioritizer,

    /// NUMA performance tracker
    numa_performance_tracker: NumaPerformanceTracker,

    /// Configuration
    config: NumaSchedulingConfig,
}

/// Cache-conscious task distribution system
#[derive(Debug)]
pub struct CacheConsciousTaskDistributor {
    /// Cache hierarchy analyzer
    cache_analyzer: CacheHierarchyAnalyzer,

    /// Data locality optimizer
    locality_optimizer: DataLocalityOptimizer,

    /// Cache-friendly chunking strategy
    chunking_strategy: CacheFriendlyChunkingStrategy,

    /// Prefetch pattern optimizer
    prefetch_optimizer: PrefetchPatternOptimizer,

    /// False sharing detector
    false_sharing_detector: FalseSharingDetector,

    /// Configuration
    config: CacheOptimizationConfig,
}

/// Parallel strategy selection engine
#[derive(Debug)]
pub struct ParallelStrategySelector {
    /// Algorithm complexity analyzer
    complexity_analyzer: AlgorithmComplexityAnalyzer,

    /// Data size analyzer
    data_analyzer: DataSizeAnalyzer,

    /// Hardware capability detector
    hardware_detector: HardwareCapabilityDetector,

    /// Strategy performance database
    strategy_db: StrategyPerformanceDatabase,

    /// Adaptive strategy engine
    adaptive_engine: AdaptiveStrategyEngine,

    /// Configuration
    config: StrategySelectionConfig,
}

/// Thread pool profile for different workload types
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct ThreadPoolProfile {
    /// Workload type
    workload_type: WorkloadType,

    /// Expected thread count
    thread_count: usize,

    /// Stack size requirements
    stack_size: usize,

    /// Priority level
    priority: ThreadPriority,

    /// NUMA affinity
    numa_affinity: Option<u32>,

    /// CPU affinity mask
    cpu_affinity: Option<Vec<usize>>,
}

/// Pool metadata for tracking performance
#[derive(Debug)]
pub struct PoolMetadata {
    /// Creation timestamp
    created_at: Instant,

    /// Last usage timestamp
    last_used: Instant,

    /// Total tasks executed
    tasks_executed: AtomicU64,

    /// Total execution time
    total_execution_time: AtomicU64,

    /// Average task latency
    average_latency: AtomicU64,

    /// Pool efficiency score
    efficiency_score: AtomicU64,
}

/// Workload type classification
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum WorkloadType {
    /// CPU-intensive mathematical operations
    ComputeIntensive,

    /// Memory-bound operations
    MemoryBound,

    /// I/O intensive operations
    IoBound,

    /// Balanced compute and memory operations
    Balanced,

    /// Short-duration tasks
    ShortTasks,

    /// Long-duration tasks
    LongTasks,

    /// Real-time operations
    RealTime,
}

/// Thread priority levels
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum ThreadPriority {
    Low,
    Normal,
    High,
    Critical,
}

/// Parallel processing strategy types
#[derive(Debug, Clone, Copy)]
pub enum ParallelStrategy {
    /// Simple parallel iteration
    Simple,

    /// Chunked parallel processing
    Chunked,

    /// Recursive divide-and-conquer
    Recursive,

    /// Pipeline parallel processing
    Pipeline,

    /// Reduce-based parallelism
    Reduce,

    /// Custom task-based parallelism
    TaskBased,
}

impl AdvancedRayonOptimizer {
    /// Create new advanced Rayon optimizer
    pub fn new(config: RayonOptimizationConfig) -> BackendResult<Self> {
        let thread_pool_manager = Arc::new(Mutex::new(IntelligentThreadPoolManager::new(
            &config.thread_pool_config,
        )?));

        let work_stealing_optimizer = Arc::new(Mutex::new(WorkStealingOptimizer::new(
            &config.work_stealing_config,
        )?));

        let load_balancer = Arc::new(Mutex::new(AdaptiveLoadBalancer::new(
            &config.load_balancing_config,
        )?));

        let numa_scheduler = Arc::new(Mutex::new(NumaAwareScheduler::new(&config.numa_config)?));

        let cache_optimizer = Arc::new(Mutex::new(CacheConsciousTaskDistributor::new(
            &config.cache_config,
        )?));

        let strategy_selector = Arc::new(Mutex::new(ParallelStrategySelector::new(
            &config.strategy_config,
        )?));

        let performance_monitor = Arc::new(Mutex::new(ParallelPerformanceMonitor::new(
            &config.monitoring_config,
        )?));

        let thread_pools = Arc::new(RwLock::new(HashMap::new()));
        let statistics = Arc::new(Mutex::new(ParallelProcessingStatistics::new()));

        Ok(Self {
            thread_pool_manager,
            work_stealing_optimizer,
            load_balancer,
            numa_scheduler,
            cache_optimizer,
            strategy_selector,
            performance_monitor,
            config,
            thread_pools,
            statistics,
        })
    }

    /// Execute optimized parallel matrix multiplication
    pub fn parallel_matmul<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        c: &mut ArrayViewMut2<T>,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        let start_time = Instant::now();

        // Analyze operation characteristics
        let operation_signature = self.analyze_matmul_characteristics(a, b, c)?;

        // Select optimal parallel strategy
        let strategy = self.select_parallel_strategy(&operation_signature)?;

        // Get or create optimal thread pool
        let pool = self.get_optimal_thread_pool(&operation_signature)?;

        // Execute parallel matrix multiplication
        let result = pool.install(|| self.execute_parallel_matmul_with_strategy(a, b, c, strategy));

        // Record performance metrics
        let execution_time = start_time.elapsed();
        self.record_performance_metrics(&operation_signature, execution_time, &result)?;

        result
    }

    /// Execute optimized parallel element-wise operations
    pub fn parallel_elementwise<T, F>(
        &self,
        input: &ArrayView1<T>,
        output: &mut ArrayViewMut1<T>,
        operation: F,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T) -> T + Send + Sync,
    {
        let start_time = Instant::now();

        // Analyze operation characteristics
        let operation_signature = self.analyze_elementwise_characteristics(input, output)?;

        // Select optimal chunking strategy
        let chunking_strategy = self.select_chunking_strategy(&operation_signature)?;

        // Get optimal thread pool
        let pool = self.get_optimal_thread_pool(&operation_signature)?;

        // Execute parallel element-wise operation
        let result = pool.install(|| match chunking_strategy {
            ChunkingStrategy::Fixed(chunk_size) => {
                self.execute_fixed_chunk_elementwise(input, output, &operation, chunk_size)
            }
            ChunkingStrategy::Adaptive => {
                self.execute_adaptive_chunk_elementwise(input, output, &operation)
            }
            ChunkingStrategy::CacheOptimized => {
                self.execute_cache_optimized_elementwise(input, output, &operation)
            }
        });

        // Record performance metrics
        let execution_time = start_time.elapsed();
        self.record_elementwise_performance(&operation_signature, execution_time, &result)?;

        result
    }

    /// Execute optimized parallel reduction
    pub fn parallel_reduce<T, F>(
        &self,
        input: &ArrayView1<T>,
        identity: T,
        operation: F,
    ) -> BackendResult<T>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T, T) -> T + Send + Sync,
    {
        let start_time = Instant::now();

        // Analyze operation characteristics
        let operation_signature = self.analyze_reduction_characteristics(input)?;

        // Select optimal reduction strategy
        let strategy = self.select_reduction_strategy(&operation_signature)?;

        // Get optimal thread pool
        let pool = self.get_optimal_thread_pool(&operation_signature)?;

        // Execute parallel reduction
        let result = pool.install(|| match strategy {
            ReductionStrategy::Simple => {
                Ok(input.par_iter().copied().reduce(|| identity, &operation))
            }
            ReductionStrategy::Hierarchical => {
                self.execute_hierarchical_reduction(input, identity, &operation)
            }
            ReductionStrategy::NUMA => {
                self.execute_numa_aware_reduction(input, identity, &operation)
            }
            ReductionStrategy::CacheOptimized => {
                self.execute_cache_optimized_reduction(input, identity, &operation)
            }
        });

        // Record performance metrics
        let execution_time = start_time.elapsed();
        self.record_reduction_performance(&operation_signature, execution_time, &result)?;

        result
    }

    /// Execute optimized parallel convolution
    pub fn parallel_convolution<T>(
        &self,
        input: &ArrayView2<T>,
        kernel: &ArrayView2<T>,
        output: &mut ArrayViewMut2<T>,
        config: &ConvolutionConfig,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy + std::ops::AddAssign,
    {
        let start_time = Instant::now();

        // Analyze operation characteristics
        let operation_signature =
            self.analyze_convolution_characteristics(input, kernel, config)?;

        // Select optimal parallel strategy
        let strategy = self.select_convolution_strategy(&operation_signature)?;

        // Get optimal thread pool
        let pool = self.get_optimal_thread_pool(&operation_signature)?;

        // Execute parallel convolution
        let result = pool.install(|| match strategy {
            ConvolutionStrategy::OutputParallel => {
                self.execute_output_parallel_convolution(input, kernel, output, config)
            }
            ConvolutionStrategy::InputParallel => {
                self.execute_input_parallel_convolution(input, kernel, output, config)
            }
            ConvolutionStrategy::Tiled => {
                self.execute_tiled_parallel_convolution(input, kernel, output, config)
            }
            ConvolutionStrategy::SIMD => {
                self.execute_simd_parallel_convolution(input, kernel, output, config)
            }
        });

        // Record performance metrics
        let execution_time = start_time.elapsed();
        self.record_convolution_performance(&operation_signature, execution_time, &result)?;

        result
    }

    /// Get comprehensive performance statistics
    pub fn get_performance_statistics(&self) -> BackendResult<ParallelPerformanceReport> {
        let statistics = self.statistics.lock().map_err(|_| {
            BackendError::BackendError("Failed to acquire statistics lock".to_string())
        })?;
        statistics.generate_comprehensive_report()
    }

    /// Optimize thread pool configuration for specific workload
    pub fn optimize_for_workload(&self, workload_type: WorkloadType) -> BackendResult<()> {
        let mut manager = self.thread_pool_manager.lock().map_err(|_| {
            BackendError::BackendError("Failed to acquire thread pool manager lock".to_string())
        })?;
        manager.optimize_for_workload(workload_type)
    }

    /// Auto-tune parallel processing parameters
    pub fn auto_tune_parameters(
        &self,
        benchmark_duration: Duration,
    ) -> BackendResult<OptimizationResults> {
        let mut optimizer = self.work_stealing_optimizer.lock().map_err(|_| {
            BackendError::BackendError("Failed to acquire work stealing optimizer lock".to_string())
        })?;
        optimizer.auto_tune_parameters(benchmark_duration)
    }

    // Private implementation methods

    fn analyze_matmul_characteristics<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        c: &ArrayViewMut2<T>,
    ) -> BackendResult<OperationSignature>
    where
        T: TensorElement,
    {
        let m = a.len();
        let k = if a.is_empty() { 0 } else { a[0].len() };
        let n = if b.is_empty() { 0 } else { b[0].len() };

        // Calculate computational complexity
        let complexity = 2 * m * n * k; // FLOPs for matrix multiplication

        // Analyze memory access patterns
        let memory_pattern = self.analyze_memory_access_patterns(a, b, c)?;

        // Estimate cache behavior
        let cache_behavior = self.estimate_cache_behavior(m, n, k)?;

        Ok(OperationSignature {
            operation_type: OperationType::MatMul,
            dimensions: vec![m, n, k],
            complexity,
            memory_pattern,
            cache_behavior,
            data_type: std::any::type_name::<T>().to_string(),
        })
    }

    fn select_parallel_strategy(
        &self,
        signature: &OperationSignature,
    ) -> BackendResult<ParallelStrategy> {
        let selector = self.strategy_selector.lock().map_err(|_| {
            BackendError::BackendError("Failed to acquire strategy selector lock".to_string())
        })?;
        selector.select_strategy(signature)
    }

    fn get_optimal_thread_pool(
        &self,
        signature: &OperationSignature,
    ) -> BackendResult<Arc<ThreadPool>> {
        // Create thread pool profile based on operation characteristics
        let profile = self.create_thread_pool_profile(signature)?;

        // Check if pool already exists
        {
            let pools = self.thread_pools.read().map_err(|_| {
                BackendError::BackendError("Failed to acquire thread pools read lock".to_string())
            })?;
            if let Some(pool) = pools.get(&profile) {
                return Ok(pool.clone());
            }
        }

        // Create new optimized thread pool
        let pool = self.create_optimized_thread_pool(&profile)?;
        let pool_arc = Arc::new(pool);

        // Store in cache
        {
            let mut pools = self.thread_pools.write().map_err(|_| {
                BackendError::BackendError("Failed to acquire thread pools write lock".to_string())
            })?;
            pools.insert(profile, pool_arc.clone());
        }

        Ok(pool_arc)
    }

    fn execute_parallel_matmul_with_strategy<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        c: &mut ArrayViewMut2<T>,
        strategy: ParallelStrategy,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        match strategy {
            ParallelStrategy::Simple => self.execute_simple_parallel_matmul(a, b, c),
            ParallelStrategy::Chunked => self.execute_chunked_parallel_matmul(a, b, c),
            ParallelStrategy::Recursive => self.execute_recursive_parallel_matmul(a, b, c),
            ParallelStrategy::TaskBased => self.execute_task_based_parallel_matmul(a, b, c),
            _ => {
                // Fall back to simple strategy
                self.execute_simple_parallel_matmul(a, b, c)
            }
        }
    }

    fn execute_simple_parallel_matmul<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        _c: &mut ArrayViewMut2<T>,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        let m = a.len();
        let k = if m > 0 { a[0].len() } else { 0 };
        let n = if b.len() > 0 { b[0].len() } else { 0 };

        // Parallel over rows of the output matrix
        (0..m).into_par_iter().for_each(|i| {
            for j in 0..n {
                let mut sum = T::zero();
                for l in 0..k {
                    sum += a[i][l] * b[l][j];
                }
                // Note: This is a simplified approach for the type alias
                // In a real implementation, we'd need proper 2D array handling
            }
        });

        Ok(())
    }

    fn execute_chunked_parallel_matmul<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        _c: &mut ArrayViewMut2<T>,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        let m = a.len();
        let k = if m > 0 { a[0].len() } else { 0 };
        let n = if b.len() > 0 { b[0].len() } else { 0 };

        // Determine optimal block size based on cache size
        let block_size = self.calculate_optimal_block_size(m, n, k)?;

        // Parallel tiled matrix multiplication
        (0..m)
            .into_par_iter()
            .step_by(block_size)
            .for_each(|i_start| {
                (0..n)
                    .into_par_iter()
                    .step_by(block_size)
                    .for_each(|j_start| {
                        let i_end = (i_start + block_size).min(m);
                        let j_end = (j_start + block_size).min(n);

                        for i in i_start..i_end {
                            for j in j_start..j_end {
                                let mut sum = T::zero();
                                for l in 0..k {
                                    sum += a[i][l] * b[l][j];
                                }
                                // Note: This is a simplified approach for the type alias
                                // In a real implementation, we'd need proper 2D array handling
                            }
                        }
                    });
            });

        Ok(())
    }

    fn execute_recursive_parallel_matmul<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        c: &mut ArrayViewMut2<T>,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        let threshold = 64; // Threshold for switching to sequential
        let m = a.len();
        let k = if m > 0 { a[0].len() } else { 0 };
        let n = if b.len() > 0 { b[0].len() } else { 0 };

        if m <= threshold || n <= threshold || k <= threshold {
            // Sequential base case
            for i in 0..m {
                for j in 0..n {
                    let mut sum = T::zero();
                    for l in 0..k {
                        sum += a[i][l] * b[l][j];
                    }
                    // Note: This is a simplified approach for the type alias
                    // In a real implementation, we'd need proper 2D array handling
                }
            }
            return Ok(());
        }

        // For simplicity, fall back to chunked parallel multiplication
        // A full recursive implementation would require more complex matrix slicing
        self.execute_chunked_parallel_matmul(a, b, c)
    }

    fn execute_task_based_parallel_matmul<T>(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        _c: &mut ArrayViewMut2<T>,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + std::ops::AddAssign + Copy + std::ops::Mul<Output = T>,
    {
        let m = a.len();
        let k = if m > 0 { a[0].len() } else { 0 };
        let n = if b.len() > 0 { b[0].len() } else { 0 };

        // Create tasks for each output element
        let tasks: Vec<(usize, usize)> = (0..m).flat_map(|i| (0..n).map(move |j| (i, j))).collect();

        // Process tasks in parallel
        tasks.into_par_iter().for_each(|(i, j)| {
            let mut sum = T::zero();
            for l in 0..k {
                sum += a[i][l] * b[l][j];
            }
            // Note: This is a simplified approach for the type alias
            // In a real implementation, we'd need proper 2D array handling
        });

        Ok(())
    }

    // Additional helper methods for other operations

    fn analyze_elementwise_characteristics<T>(
        &self,
        input: &ArrayView1<T>,
        _output: &ArrayViewMut1<T>,
    ) -> BackendResult<OperationSignature>
    where
        T: TensorElement,
    {
        Ok(OperationSignature {
            operation_type: OperationType::ElementWise,
            dimensions: vec![input.len()],
            complexity: input.len(), // O(n) complexity
            memory_pattern: MemoryAccessPattern::Sequential,
            cache_behavior: CacheBehavior::Friendly,
            data_type: std::any::type_name::<T>().to_string(),
        })
    }

    fn select_chunking_strategy(
        &self,
        signature: &OperationSignature,
    ) -> BackendResult<ChunkingStrategy> {
        // Simple heuristic based on data size
        if signature.dimensions[0] < 1000 {
            Ok(ChunkingStrategy::Fixed(100))
        } else if signature.dimensions[0] < 100000 {
            Ok(ChunkingStrategy::Adaptive)
        } else {
            Ok(ChunkingStrategy::CacheOptimized)
        }
    }

    fn execute_fixed_chunk_elementwise<T, F>(
        &self,
        input: &ArrayView1<T>,
        output: &mut ArrayViewMut1<T>,
        operation: &F,
        chunk_size: usize,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T) -> T + Send + Sync,
    {
        input
            .par_chunks(chunk_size)
            .zip(output.par_chunks_mut(chunk_size))
            .for_each(|(input_chunk, output_chunk)| {
                for (inp, out) in input_chunk.iter().zip(output_chunk.iter_mut()) {
                    *out = operation(*inp);
                }
            });

        Ok(())
    }

    fn execute_adaptive_chunk_elementwise<T, F>(
        &self,
        input: &ArrayView1<T>,
        output: &mut ArrayViewMut1<T>,
        operation: &F,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T) -> T + Send + Sync,
    {
        // Adaptive chunking based on available parallelism
        let num_cpus = rayon::current_num_threads();
        let chunk_size = (input.len() + num_cpus - 1) / num_cpus;

        self.execute_fixed_chunk_elementwise(input, output, operation, chunk_size)
    }

    fn execute_cache_optimized_elementwise<T, F>(
        &self,
        input: &ArrayView1<T>,
        output: &mut ArrayViewMut1<T>,
        operation: &F,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T) -> T + Send + Sync,
    {
        // Cache-optimized chunking (L2 cache size / element size)
        let cache_size = 256 * 1024; // 256KB L2 cache
        let element_size = std::mem::size_of::<T>();
        let chunk_size = cache_size / element_size / 2; // Divide by 2 for input/output

        self.execute_fixed_chunk_elementwise(input, output, operation, chunk_size)
    }

    // Placeholder implementations for other methods
    fn analyze_memory_access_patterns<T>(
        &self,
        _a: &ArrayView2<T>,
        _b: &ArrayView2<T>,
        _c: &ArrayViewMut2<T>,
    ) -> BackendResult<MemoryAccessPattern> {
        Ok(MemoryAccessPattern::Sequential)
    }

    fn estimate_cache_behavior(
        &self,
        m: usize,
        n: usize,
        k: usize,
    ) -> BackendResult<CacheBehavior> {
        // Simple heuristic based on working set size
        let working_set = m * k + k * n + m * n;
        if working_set < 32 * 1024 {
            Ok(CacheBehavior::Friendly)
        } else if working_set < 1024 * 1024 {
            Ok(CacheBehavior::Moderate)
        } else {
            Ok(CacheBehavior::Unfriendly)
        }
    }

    fn create_thread_pool_profile(
        &self,
        signature: &OperationSignature,
    ) -> BackendResult<ThreadPoolProfile> {
        // Determine optimal thread count based on operation characteristics
        let thread_count = match signature.operation_type {
            OperationType::MatMul => {
                if signature.complexity > 1_000_000 {
                    rayon::current_num_threads()
                } else {
                    (rayon::current_num_threads() / 2).max(1)
                }
            }
            OperationType::ElementWise => rayon::current_num_threads(),
            _ => rayon::current_num_threads(),
        };

        // Determine workload type
        let workload_type = match signature.operation_type {
            OperationType::MatMul => WorkloadType::ComputeIntensive,
            OperationType::ElementWise => WorkloadType::MemoryBound,
            _ => WorkloadType::Balanced,
        };

        Ok(ThreadPoolProfile {
            workload_type,
            thread_count,
            stack_size: 8 * 1024 * 1024, // 8MB stack
            priority: ThreadPriority::Normal,
            numa_affinity: None,
            cpu_affinity: None,
        })
    }

    fn create_optimized_thread_pool(
        &self,
        profile: &ThreadPoolProfile,
    ) -> BackendResult<ThreadPool> {
        let builder = ThreadPoolBuilder::new()
            .num_threads(profile.thread_count)
            .stack_size(profile.stack_size);

        builder
            .build()
            .map_err(|e| BackendError::BackendError(format!("Thread pool creation failed: {}", e)))
    }

    fn calculate_optimal_block_size(
        &self,
        _m: usize,
        _n: usize,
        _k: usize,
    ) -> BackendResult<usize> {
        // Calculate block size based on L2 cache size
        let cache_size = 256 * 1024; // 256KB
        let element_size = 4; // Assuming f32
        let elements_per_cache = cache_size / element_size;

        // Simple heuristic: sqrt of cache capacity
        let block_size = ((elements_per_cache as f64).sqrt() as usize).next_power_of_two();
        Ok(block_size.min(128).max(16)) // Clamp between 16 and 128
    }

    // Placeholder implementations for recording metrics
    fn record_performance_metrics(
        &self,
        _signature: &OperationSignature,
        _execution_time: Duration,
        _result: &BackendResult<()>,
    ) -> BackendResult<()> {
        Ok(())
    }

    fn record_elementwise_performance(
        &self,
        _signature: &OperationSignature,
        _execution_time: Duration,
        _result: &BackendResult<()>,
    ) -> BackendResult<()> {
        Ok(())
    }

    fn record_reduction_performance<T>(
        &self,
        _signature: &OperationSignature,
        _execution_time: Duration,
        _result: &BackendResult<T>,
    ) -> BackendResult<()> {
        Ok(())
    }

    fn record_convolution_performance(
        &self,
        _signature: &OperationSignature,
        _execution_time: Duration,
        _result: &BackendResult<()>,
    ) -> BackendResult<()> {
        Ok(())
    }

    // Additional placeholder methods for other operations
    fn analyze_reduction_characteristics<T>(
        &self,
        input: &ArrayView1<T>,
    ) -> BackendResult<OperationSignature>
    where
        T: TensorElement,
    {
        Ok(OperationSignature {
            operation_type: OperationType::Reduction,
            dimensions: vec![input.len()],
            complexity: input.len(),
            memory_pattern: MemoryAccessPattern::Sequential,
            cache_behavior: CacheBehavior::Friendly,
            data_type: std::any::type_name::<T>().to_string(),
        })
    }

    fn select_reduction_strategy(
        &self,
        _signature: &OperationSignature,
    ) -> BackendResult<ReductionStrategy> {
        Ok(ReductionStrategy::Simple)
    }

    fn execute_hierarchical_reduction<T, F>(
        &self,
        input: &ArrayView1<T>,
        identity: T,
        operation: &F,
    ) -> BackendResult<T>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T, T) -> T + Send + Sync,
    {
        Ok(input.par_iter().copied().reduce(|| identity, operation))
    }

    fn execute_numa_aware_reduction<T, F>(
        &self,
        input: &ArrayView1<T>,
        identity: T,
        operation: &F,
    ) -> BackendResult<T>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T, T) -> T + Send + Sync,
    {
        Ok(input.par_iter().copied().reduce(|| identity, operation))
    }

    fn execute_cache_optimized_reduction<T, F>(
        &self,
        input: &ArrayView1<T>,
        identity: T,
        operation: &F,
    ) -> BackendResult<T>
    where
        T: TensorElement + Send + Sync + Copy,
        F: Fn(T, T) -> T + Send + Sync,
    {
        Ok(input.par_iter().copied().reduce(|| identity, operation))
    }

    fn analyze_convolution_characteristics<T>(
        &self,
        input: &ArrayView2<T>,
        kernel: &ArrayView2<T>,
        _config: &ConvolutionConfig,
    ) -> BackendResult<OperationSignature>
    where
        T: TensorElement,
    {
        Ok(OperationSignature {
            operation_type: OperationType::Convolution,
            dimensions: vec![
                input.len(),
                if input.is_empty() { 0 } else { input[0].len() },
                kernel.len(),
                if kernel.is_empty() {
                    0
                } else {
                    kernel[0].len()
                },
            ],
            complexity: input.len() * kernel.len(),
            memory_pattern: MemoryAccessPattern::Strided,
            cache_behavior: CacheBehavior::Moderate,
            data_type: std::any::type_name::<T>().to_string(),
        })
    }

    fn select_convolution_strategy(
        &self,
        _signature: &OperationSignature,
    ) -> BackendResult<ConvolutionStrategy> {
        Ok(ConvolutionStrategy::OutputParallel)
    }

    fn execute_output_parallel_convolution<T>(
        &self,
        _input: &ArrayView2<T>,
        _kernel: &ArrayView2<T>,
        _output: &mut ArrayViewMut2<T>,
        _config: &ConvolutionConfig,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy + std::ops::AddAssign,
    {
        // Placeholder implementation
        Ok(())
    }

    fn execute_input_parallel_convolution<T>(
        &self,
        _input: &ArrayView2<T>,
        _kernel: &ArrayView2<T>,
        _output: &mut ArrayViewMut2<T>,
        _config: &ConvolutionConfig,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy + std::ops::AddAssign,
    {
        // Placeholder implementation
        Ok(())
    }

    fn execute_tiled_parallel_convolution<T>(
        &self,
        _input: &ArrayView2<T>,
        _kernel: &ArrayView2<T>,
        _output: &mut ArrayViewMut2<T>,
        _config: &ConvolutionConfig,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy + std::ops::AddAssign,
    {
        // Placeholder implementation
        Ok(())
    }

    fn execute_simd_parallel_convolution<T>(
        &self,
        _input: &ArrayView2<T>,
        _kernel: &ArrayView2<T>,
        _output: &mut ArrayViewMut2<T>,
        _config: &ConvolutionConfig,
    ) -> BackendResult<()>
    where
        T: TensorElement + Send + Sync + Copy + std::ops::AddAssign,
    {
        // Placeholder implementation
        Ok(())
    }
}

// Supporting types and configurations

/// Operation signature for strategy selection
#[derive(Debug, Clone)]
pub struct OperationSignature {
    pub operation_type: OperationType,
    pub dimensions: Vec<usize>,
    pub complexity: usize,
    pub memory_pattern: MemoryAccessPattern,
    pub cache_behavior: CacheBehavior,
    pub data_type: String,
}

/// Operation type classification
#[derive(Debug, Clone, Copy)]
pub enum OperationType {
    MatMul,
    ElementWise,
    Reduction,
    Convolution,
    FFT,
    Sort,
}

/// Memory access pattern types
#[derive(Debug, Clone, Copy)]
pub enum MemoryAccessPattern {
    Sequential,
    Strided,
    Random,
    Coalesced,
}

/// Cache behavior classification
#[derive(Debug, Clone, Copy)]
pub enum CacheBehavior {
    Friendly,
    Moderate,
    Unfriendly,
}

/// Chunking strategy types
#[derive(Debug, Clone, Copy)]
pub enum ChunkingStrategy {
    Fixed(usize),
    Adaptive,
    CacheOptimized,
}

/// Reduction strategy types
#[derive(Debug, Clone, Copy)]
pub enum ReductionStrategy {
    Simple,
    Hierarchical,
    NUMA,
    CacheOptimized,
}

/// Convolution strategy types
#[derive(Debug, Clone, Copy)]
pub enum ConvolutionStrategy {
    OutputParallel,
    InputParallel,
    Tiled,
    SIMD,
}

/// Convolution configuration
#[derive(Debug, Clone)]
pub struct ConvolutionConfig {
    pub stride: (usize, usize),
    pub padding: (usize, usize),
}

/// Rayon optimization configuration
#[derive(Debug, Clone)]
pub struct RayonOptimizationConfig {
    pub thread_pool_config: ThreadPoolManagementConfig,
    pub work_stealing_config: WorkStealingConfig,
    pub load_balancing_config: LoadBalancingConfig,
    pub numa_config: NumaSchedulingConfig,
    pub cache_config: CacheOptimizationConfig,
    pub strategy_config: StrategySelectionConfig,
    pub monitoring_config: MonitoringConfig,
}

// Configuration structures
#[derive(Debug, Clone)]
pub struct ThreadPoolManagementConfig {
    pub enable_adaptive_sizing: bool,
    pub min_threads: usize,
    pub max_threads: usize,
    pub thread_stack_size: usize,
}

#[derive(Debug, Clone)]
pub struct WorkStealingConfig {
    pub enable_work_stealing: bool,
    pub steal_strategy: String,
    pub deque_size: usize,
    pub victim_selection: String,
}

#[derive(Debug, Clone)]
pub struct LoadBalancingConfig {
    pub enable_load_balancing: bool,
    pub rebalancing_threshold: f64,
    pub migration_cost_threshold: f64,
}

#[derive(Debug, Clone)]
pub struct NumaSchedulingConfig {
    pub enable_numa_awareness: bool,
    pub local_allocation_preference: f64,
    pub cross_numa_penalty: f64,
}

#[derive(Debug, Clone)]
pub struct CacheOptimizationConfig {
    pub enable_cache_optimization: bool,
    pub l1_cache_size: usize,
    pub l2_cache_size: usize,
    pub l3_cache_size: usize,
    pub cache_line_size: usize,
}

#[derive(Debug, Clone)]
pub struct StrategySelectionConfig {
    pub enable_adaptive_strategy: bool,
    pub performance_threshold: f64,
    pub strategy_switch_cost: f64,
}

#[derive(Debug, Clone)]
pub struct MonitoringConfig {
    pub enable_monitoring: bool,
    pub sampling_interval: Duration,
    pub metrics_retention: Duration,
}

// Placeholder implementations for supporting structures
macro_rules! impl_placeholder_struct {
    ($name:ident) => {
        #[derive(Debug)]
        pub struct $name;

        impl $name {
            pub fn new() -> Self {
                Self
            }
        }
    };
}

impl_placeholder_struct!(CpuTopologyAnalyzer);
impl_placeholder_struct!(ThreadAffinityManager);
impl_placeholder_struct!(PoolConfigurationOptimizer);
impl_placeholder_struct!(ThreadLifecycleManager);
impl_placeholder_struct!(ThreadPoolPerformancePredictor);
impl_placeholder_struct!(DequeOptimizer);
impl_placeholder_struct!(VictimSelectionStrategy);
impl_placeholder_struct!(TaskGranularityAnalyzer);
impl_placeholder_struct!(StealStrategyOptimizer);
impl_placeholder_struct!(ContentionResolver);
impl_placeholder_struct!(WorkStealingStatistics);
impl_placeholder_struct!(WorkloadDistributionAnalyzer);
impl_placeholder_struct!(LoadPredictionEngine);
impl_placeholder_struct!(DynamicRebalancer);
impl_placeholder_struct!(TaskMigrationManager);
impl_placeholder_struct!(LoadMonitoringSystem);
impl_placeholder_struct!(LoadBalancingState);
impl_placeholder_struct!(NumaTopologyDetector);
impl_placeholder_struct!(MemoryAffinityOptimizer);
impl_placeholder_struct!(CrossNumaCommunicationMinimizer);
impl_placeholder_struct!(LocalAllocationPrioritizer);
impl_placeholder_struct!(NumaPerformanceTracker);
impl_placeholder_struct!(CacheHierarchyAnalyzer);
impl_placeholder_struct!(DataLocalityOptimizer);
impl_placeholder_struct!(CacheFriendlyChunkingStrategy);
impl_placeholder_struct!(PrefetchPatternOptimizer);
impl_placeholder_struct!(FalseSharingDetector);
impl_placeholder_struct!(AlgorithmComplexityAnalyzer);
impl_placeholder_struct!(DataSizeAnalyzer);
impl_placeholder_struct!(HardwareCapabilityDetector);
impl_placeholder_struct!(StrategyPerformanceDatabase);
impl_placeholder_struct!(AdaptiveStrategyEngine);
// ParallelPerformanceMonitor and ParallelProcessingStatistics have custom implementations below

/// Parallel performance monitoring system
#[derive(Debug)]
pub struct ParallelPerformanceMonitor;

/// Parallel processing statistics collector
#[derive(Debug)]
pub struct ParallelProcessingStatistics;

// Implementations for supporting structures
impl IntelligentThreadPoolManager {
    pub fn new(config: &ThreadPoolManagementConfig) -> BackendResult<Self> {
        Ok(Self {
            cpu_topology: CpuTopologyAnalyzer::new(),
            affinity_manager: ThreadAffinityManager::new(),
            config_optimizer: PoolConfigurationOptimizer::new(),
            lifecycle_manager: ThreadLifecycleManager::new(),
            performance_predictor: ThreadPoolPerformancePredictor::new(),
            active_pools: HashMap::new(),
            config: config.clone(),
        })
    }

    pub fn optimize_for_workload(&mut self, _workload_type: WorkloadType) -> BackendResult<()> {
        // Placeholder implementation
        Ok(())
    }
}

impl WorkStealingOptimizer {
    pub fn new(config: &WorkStealingConfig) -> BackendResult<Self> {
        Ok(Self {
            deque_optimizer: DequeOptimizer::new(),
            victim_selector: VictimSelectionStrategy::new(),
            granularity_analyzer: TaskGranularityAnalyzer::new(),
            steal_strategy: StealStrategyOptimizer::new(),
            contention_resolver: ContentionResolver::new(),
            config: config.clone(),
            stealing_stats: WorkStealingStatistics::new(),
        })
    }

    pub fn auto_tune_parameters(
        &mut self,
        _duration: Duration,
    ) -> BackendResult<OptimizationResults> {
        Ok(OptimizationResults::default())
    }
}

impl AdaptiveLoadBalancer {
    pub fn new(config: &LoadBalancingConfig) -> BackendResult<Self> {
        Ok(Self {
            distribution_analyzer: WorkloadDistributionAnalyzer::new(),
            load_predictor: LoadPredictionEngine::new(),
            rebalancer: DynamicRebalancer::new(),
            migration_manager: TaskMigrationManager::new(),
            load_monitor: LoadMonitoringSystem::new(),
            config: config.clone(),
            load_state: LoadBalancingState::new(),
        })
    }
}

impl NumaAwareScheduler {
    pub fn new(config: &NumaSchedulingConfig) -> BackendResult<Self> {
        Ok(Self {
            numa_topology: NumaTopologyDetector::new(),
            memory_affinity: MemoryAffinityOptimizer::new(),
            communication_minimizer: CrossNumaCommunicationMinimizer::new(),
            local_allocation_prioritizer: LocalAllocationPrioritizer::new(),
            numa_performance_tracker: NumaPerformanceTracker::new(),
            config: config.clone(),
        })
    }
}

impl CacheConsciousTaskDistributor {
    pub fn new(config: &CacheOptimizationConfig) -> BackendResult<Self> {
        Ok(Self {
            cache_analyzer: CacheHierarchyAnalyzer::new(),
            locality_optimizer: DataLocalityOptimizer::new(),
            chunking_strategy: CacheFriendlyChunkingStrategy::new(),
            prefetch_optimizer: PrefetchPatternOptimizer::new(),
            false_sharing_detector: FalseSharingDetector::new(),
            config: config.clone(),
        })
    }
}

impl ParallelStrategySelector {
    pub fn new(config: &StrategySelectionConfig) -> BackendResult<Self> {
        Ok(Self {
            complexity_analyzer: AlgorithmComplexityAnalyzer::new(),
            data_analyzer: DataSizeAnalyzer::new(),
            hardware_detector: HardwareCapabilityDetector::new(),
            strategy_db: StrategyPerformanceDatabase::new(),
            adaptive_engine: AdaptiveStrategyEngine::new(),
            config: config.clone(),
        })
    }

    pub fn select_strategy(
        &self,
        signature: &OperationSignature,
    ) -> BackendResult<ParallelStrategy> {
        // Simple heuristic based on operation characteristics
        match signature.operation_type {
            OperationType::MatMul => {
                if signature.complexity > 1_000_000 {
                    Ok(ParallelStrategy::Recursive)
                } else if signature.complexity > 100_000 {
                    Ok(ParallelStrategy::Chunked)
                } else {
                    Ok(ParallelStrategy::Simple)
                }
            }
            OperationType::ElementWise => Ok(ParallelStrategy::Chunked),
            OperationType::Reduction => Ok(ParallelStrategy::Reduce),
            _ => Ok(ParallelStrategy::Simple),
        }
    }
}

impl ParallelPerformanceMonitor {
    pub fn new(_config: &MonitoringConfig) -> BackendResult<Self> {
        Ok(Self)
    }
}

impl ParallelProcessingStatistics {
    pub fn new() -> Self {
        Self
    }

    pub fn generate_comprehensive_report(&self) -> BackendResult<ParallelPerformanceReport> {
        Ok(ParallelPerformanceReport::default())
    }
}

#[derive(Debug, Clone, Default)]
pub struct OptimizationResults {
    pub optimal_thread_count: usize,
    pub optimal_chunk_size: usize,
    pub performance_improvement: f64,
    pub configuration_changes: Vec<String>,
}

#[derive(Debug, Clone, Default)]
pub struct ParallelPerformanceReport {
    pub total_operations: u64,
    pub average_execution_time: Duration,
    pub parallel_efficiency: f64,
    pub cache_hit_rate: f64,
    pub load_balance_score: f64,
    pub numa_efficiency: f64,
}

#[cfg(test)]
mod tests {
    use super::*;
    // Note: Using Vec<Vec<T>> for testing since scirs2_autograd is not available
    // use scirs2_autograd::ndarray::{Array1, Array2};

    #[test]
    fn test_rayon_optimizer_creation() {
        let config = RayonOptimizationConfig {
            thread_pool_config: ThreadPoolManagementConfig {
                enable_adaptive_sizing: true,
                min_threads: 1,
                max_threads: 16,
                thread_stack_size: 8 * 1024 * 1024,
            },
            work_stealing_config: WorkStealingConfig {
                enable_work_stealing: true,
                steal_strategy: "random".to_string(),
                deque_size: 1024,
                victim_selection: "round_robin".to_string(),
            },
            load_balancing_config: LoadBalancingConfig {
                enable_load_balancing: true,
                rebalancing_threshold: 0.1,
                migration_cost_threshold: 0.05,
            },
            numa_config: NumaSchedulingConfig {
                enable_numa_awareness: true,
                local_allocation_preference: 0.8,
                cross_numa_penalty: 0.2,
            },
            cache_config: CacheOptimizationConfig {
                enable_cache_optimization: true,
                l1_cache_size: 32 * 1024,
                l2_cache_size: 256 * 1024,
                l3_cache_size: 8 * 1024 * 1024,
                cache_line_size: 64,
            },
            strategy_config: StrategySelectionConfig {
                enable_adaptive_strategy: true,
                performance_threshold: 0.9,
                strategy_switch_cost: 0.01,
            },
            monitoring_config: MonitoringConfig {
                enable_monitoring: true,
                sampling_interval: Duration::from_millis(100),
                metrics_retention: Duration::from_secs(3600),
            },
        };

        let result = AdvancedRayonOptimizer::new(config);
        assert!(result.is_ok());
    }

    #[test]
    fn test_thread_pool_profile_creation() {
        let profile = ThreadPoolProfile {
            workload_type: WorkloadType::ComputeIntensive,
            thread_count: 8,
            stack_size: 8 * 1024 * 1024,
            priority: ThreadPriority::High,
            numa_affinity: Some(0),
            cpu_affinity: None,
        };

        assert_eq!(profile.workload_type, WorkloadType::ComputeIntensive);
        assert_eq!(profile.thread_count, 8);
        assert_eq!(profile.priority, ThreadPriority::High);
    }

    #[test]
    fn test_operation_signature_creation() {
        let signature = OperationSignature {
            operation_type: OperationType::MatMul,
            dimensions: vec![512, 512, 512],
            complexity: 2 * 512 * 512 * 512,
            memory_pattern: MemoryAccessPattern::Strided,
            cache_behavior: CacheBehavior::Moderate,
            data_type: "f32".to_string(),
        };

        assert_eq!(signature.dimensions, vec![512, 512, 512]);
        assert!(signature.complexity > 100_000_000);
    }

    #[test]
    fn test_parallel_strategy_selection() {
        let signature = OperationSignature {
            operation_type: OperationType::MatMul,
            dimensions: vec![1024, 1024, 1024],
            complexity: 2 * 1024 * 1024 * 1024,
            memory_pattern: MemoryAccessPattern::Strided,
            cache_behavior: CacheBehavior::Unfriendly,
            data_type: "f32".to_string(),
        };

        let config = StrategySelectionConfig {
            enable_adaptive_strategy: true,
            performance_threshold: 0.9,
            strategy_switch_cost: 0.01,
        };

        let selector = ParallelStrategySelector::new(&config)
            .expect("Parallel Strategy Selector should succeed");
        let strategy = selector
            .select_strategy(&signature)
            .expect("strategy selection should succeed");

        // Large complex operations should use recursive strategy
        assert!(matches!(strategy, ParallelStrategy::Recursive));
    }
}