use crate::error::{LinalgError, LinalgResult};
use super::{MPICommunicator, MPIDatatype, MPIReduceOp};
use super::topology::TreeTopology;
use super::communicator::DistributedMatrix;
use scirs2_core::numeric::{Float, NumAssign};
use std::collections::HashMap;
use std::sync::Arc;
use std::ffi::{c_int, c_void};
/// High-level wrapper over MPI collective operations on a single
/// communicator, with room for per-operation tuning and timing data.
#[derive(Debug)]
pub struct MPICollectiveOps {
    // Shared handle to the underlying MPI communicator.
    comm: Arc<MPICommunicator>,
    // Tuning choices keyed by operation name (e.g. "allreduce").
    optimization_cache: HashMap<String, CollectiveOptimization>,
    // Timing records of past collectives.
    // NOTE(review): currently never populated — `record_performance` takes
    // `&self` and drops its record; see that method.
    performance_history: Vec<CollectivePerformanceRecord>,
}
/// A tuning choice for one collective operation: which algorithm to run and
/// with what chunking, pipelining, and tree shape.
#[derive(Debug, Clone)]
pub struct CollectiveOptimization {
    // Name of the algorithm variant (free-form string key).
    algorithm: String,
    // Message chunk size, in elements or bytes — unit not fixed by this file.
    chunksize: usize,
    // Number of in-flight pipeline stages.
    pipeline_depth: usize,
    // Reduction/broadcast tree shape to use.
    tree_topology: TreeTopology,
    // Predicted performance for this configuration (unit unspecified here).
    expected_performance: f64,
}
/// One observed timing of a collective operation, for later analysis.
#[derive(Debug, Clone)]
pub struct CollectivePerformanceRecord {
    // Operation name, e.g. "broadcast" or "allreduce".
    operation: String,
    // Number of MPI processes that took part.
    process_count: i32,
    // Payload size in elements (see `record_performance`).
    datasize: usize,
    // Wall-clock duration of the collective, in seconds.
    execution_time: f64,
    // Throughput derived from `datasize` / `execution_time`.
    bandwidth: f64,
    // Algorithm variant that was used.
    algorithm_used: String,
    // Tree topology that was used.
    topology_used: TreeTopology,
}
impl MPICollectiveOps {
pub fn new(comm: Arc<MPICommunicator>) -> Self {
Self {
comm,
optimization_cache: HashMap::new(),
performance_history: Vec::new(),
}
}
pub fn broadcast<T>(&self, data: &mut [T], root: i32) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_bcast(
data.as_mut_ptr() as *mut c_void,
data.len(),
T::mpi_datatype(),
root,
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI broadcast failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("broadcast", data.len(), elapsed, "default");
Ok(())
}
pub fn allreduce<T>(&self, sendbuf: &[T], recvbuf: &mut [T], op: MPIReduceOp) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
if sendbuf.len() != recvbuf.len() {
return Err(LinalgError::InvalidInput(
"Send and receive buffers must have the same length".to_string()
));
}
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_allreduce(
sendbuf.as_ptr() as *const c_void,
recvbuf.as_mut_ptr() as *mut c_void,
sendbuf.len(),
T::mpi_datatype(),
op.to_mpi_op(),
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI allreduce failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("allreduce", sendbuf.len(), elapsed, "default");
Ok(())
}
pub fn gather<T>(&self, sendbuf: &[T], recvbuf: &mut [T], root: i32) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_gather(
sendbuf.as_ptr() as *const c_void,
sendbuf.len(),
T::mpi_datatype(),
recvbuf.as_mut_ptr() as *mut c_void,
sendbuf.len(),
T::mpi_datatype(),
root,
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI gather failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("gather", sendbuf.len(), elapsed, "default");
Ok(())
}
pub fn scatter<T>(&self, sendbuf: &[T], recvbuf: &mut [T], root: i32) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_scatter(
sendbuf.as_ptr() as *const c_void,
recvbuf.len(),
T::mpi_datatype(),
recvbuf.as_mut_ptr() as *mut c_void,
recvbuf.len(),
T::mpi_datatype(),
root,
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI scatter failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("scatter", recvbuf.len(), elapsed, "default");
Ok(())
}
pub fn allgather<T>(&self, sendbuf: &[T], recvbuf: &mut [T]) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
let expected_size = sendbuf.len() * self.comm.size() as usize;
if recvbuf.len() != expected_size {
return Err(LinalgError::InvalidInput(
format!("Receive buffer size {} does not match expected size {}",
recvbuf.len(), expected_size)
));
}
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_allgather(
sendbuf.as_ptr() as *const c_void,
sendbuf.len(),
T::mpi_datatype(),
recvbuf.as_mut_ptr() as *mut c_void,
sendbuf.len(),
T::mpi_datatype(),
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI allgather failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("allgather", sendbuf.len(), elapsed, "default");
Ok(())
}
pub fn reduce<T>(&self, sendbuf: &[T], recvbuf: &mut [T], op: MPIReduceOp, root: i32) -> LinalgResult<()>
where
T: MPIDatatype + Clone,
{
if sendbuf.len() != recvbuf.len() {
return Err(LinalgError::InvalidInput(
"Send and receive buffers must have the same length".to_string()
));
}
let start_time = std::time::Instant::now();
unsafe {
let result = mpi_reduce(
sendbuf.as_ptr() as *const c_void,
recvbuf.as_mut_ptr() as *mut c_void,
sendbuf.len(),
T::mpi_datatype(),
op.to_mpi_op(),
root,
self.comm.handle().raw_handle(),
);
if result != 0 {
return Err(LinalgError::CommunicationError(
format!("MPI reduce failed with code {}", result)
));
}
}
let elapsed = start_time.elapsed().as_secs_f64();
self.record_performance("reduce", sendbuf.len(), elapsed, "default");
Ok(())
}
pub fn distributed_gemm<T>(
&self,
a: &DistributedMatrix<T>,
b: &DistributedMatrix<T>,
) -> LinalgResult<DistributedMatrix<T>>
where
T: Float + NumAssign + MPIDatatype + Send + Sync + Clone + 'static,
{
self.summa_algorithm(a, b)
}
fn summa_algorithm<T>(
&self,
a: &DistributedMatrix<T>,
b: &DistributedMatrix<T>,
) -> LinalgResult<DistributedMatrix<T>>
where
T: Float + NumAssign + MPIDatatype + Send + Sync + Clone + 'static,
{
Err(LinalgError::NotImplementedError(
"SUMMA algorithm not yet implemented".to_string()
))
}
pub fn tree_reduce<T>(
&self,
data: &[T],
op: MPIReduceOp,
topology: TreeTopology,
) -> LinalgResult<Vec<T>>
where
T: MPIDatatype + Clone + Default,
{
match topology {
TreeTopology::Binomial => self.binomial_tree_reduce(data, op),
TreeTopology::Flat => self.flat_tree_reduce(data, op),
TreeTopology::Pipeline => self.pipeline_reduce(data, op),
_ => Err(LinalgError::NotImplementedError(
"Custom tree topologies not yet implemented".to_string()
)),
}
}
fn binomial_tree_reduce<T>(&self, data: &[T], op: MPIReduceOp) -> LinalgResult<Vec<T>>
where
T: MPIDatatype + Clone + Default,
{
let mut result = data.to_vec();
self.allreduce(data, &mut result, op)?;
Ok(result)
}
fn flat_tree_reduce<T>(&self, data: &[T], op: MPIReduceOp) -> LinalgResult<Vec<T>>
where
T: MPIDatatype + Clone + Default,
{
let mut result = data.to_vec();
self.allreduce(data, &mut result, op)?;
Ok(result)
}
fn pipeline_reduce<T>(&self, data: &[T], op: MPIReduceOp) -> LinalgResult<Vec<T>>
where
T: MPIDatatype + Clone + Default,
{
let mut result = data.to_vec();
self.allreduce(data, &mut result, op)?;
Ok(result)
}
fn record_performance(&self, operation: &str, datasize: usize, execution_time: f64, algorithm: &str) {
let bandwidth = (datasize * std::mem::size_of::<u8>()) as f64 / execution_time;
let record = CollectivePerformanceRecord {
operation: operation.to_string(),
process_count: self.comm.size(),
datasize,
execution_time,
bandwidth,
algorithm_used: algorithm.to_string(),
topology_used: TreeTopology::Binomial, };
let _ = record;
}
pub fn get_optimization(&self, operation: &str) -> Option<&CollectiveOptimization> {
self.optimization_cache.get(operation)
}
pub fn set_optimization(&mut self, operation: String, optimization: CollectiveOptimization) {
self.optimization_cache.insert(operation, optimization);
}
pub fn get_performance_history(&self) -> &[CollectivePerformanceRecord] {
&self.performance_history
}
pub fn clear_performance_history(&mut self) {
self.performance_history.clear();
}
pub fn communicator(&self) -> &Arc<MPICommunicator> {
&self.comm
}
}
impl CollectiveOptimization {
    /// Assembles an optimization descriptor from its parts.
    pub fn new(
        algorithm: String,
        chunksize: usize,
        pipeline_depth: usize,
        tree_topology: TreeTopology,
        expected_performance: f64,
    ) -> Self {
        Self { algorithm, chunksize, pipeline_depth, tree_topology, expected_performance }
    }

    /// Name of the algorithm variant.
    pub fn algorithm(&self) -> &str {
        self.algorithm.as_str()
    }

    /// Message chunk size.
    pub fn chunksize(&self) -> usize {
        self.chunksize
    }

    /// Number of in-flight pipeline stages.
    pub fn pipeline_depth(&self) -> usize {
        self.pipeline_depth
    }

    /// Tree shape this configuration uses.
    pub fn tree_topology(&self) -> &TreeTopology {
        &self.tree_topology
    }

    /// Predicted performance for this configuration.
    pub fn expected_performance(&self) -> f64 {
        self.expected_performance
    }
}
impl CollectivePerformanceRecord {
    /// Operation name, e.g. "broadcast".
    pub fn operation(&self) -> &str {
        self.operation.as_str()
    }

    /// Number of participating MPI processes.
    pub fn process_count(&self) -> i32 {
        self.process_count
    }

    /// Payload size in elements.
    pub fn datasize(&self) -> usize {
        self.datasize
    }

    /// Wall-clock duration in seconds.
    pub fn execution_time(&self) -> f64 {
        self.execution_time
    }

    /// Throughput derived from size and duration.
    pub fn bandwidth(&self) -> f64 {
        self.bandwidth
    }

    /// Algorithm variant that was used.
    pub fn algorithm_used(&self) -> &str {
        self.algorithm_used.as_str()
    }

    /// Tree topology that was used.
    pub fn topology_used(&self) -> &TreeTopology {
        &self.topology_used
    }
}
// FFI declarations for a C shim over the MPI collectives. These lowercase
// symbols (mpi_bcast, …) are NOT the standard MPI_Bcast/... API: counts are
// `usize` rather than MPI's `int`, and datatypes/ops/communicators are passed
// as raw integer/pointer handles — presumably a thin C wrapper translates
// these into real MPI calls. TODO confirm the shim's ABI matches exactly.
extern "C" {
    // MPI_Bcast-like: broadcast `count` elements of `datatype` from `root`.
    fn mpi_bcast(buffer: *mut c_void, count: usize, datatype: c_int, root: c_int, comm: *mut c_void) -> c_int;
    // MPI_Allreduce-like: reduce with `op`; all ranks receive the result.
    fn mpi_allreduce(sendbuf: *const c_void, recvbuf: *mut c_void, count: usize, datatype: c_int, op: c_int, comm: *mut c_void) -> c_int;
    // MPI_Gather-like: uniform-count gather to `root`.
    fn mpi_gather(sendbuf: *const c_void, sendcount: usize, sendtype: c_int, recvbuf: *mut c_void, recvcount: usize, recvtype: c_int, root: c_int, comm: *mut c_void) -> c_int;
    // MPI_Scatter-like: uniform-count scatter from `root`.
    fn mpi_scatter(sendbuf: *const c_void, sendcount: usize, sendtype: c_int, recvbuf: *mut c_void, recvcount: usize, recvtype: c_int, root: c_int, comm: *mut c_void) -> c_int;
    // MPI_Allgather-like: all ranks receive the concatenation.
    fn mpi_allgather(sendbuf: *const c_void, sendcount: usize, sendtype: c_int, recvbuf: *mut c_void, recvcount: usize, recvtype: c_int, comm: *mut c_void) -> c_int;
    // MPI_Reduce-like: reduce with `op`; only `root` receives the result.
    fn mpi_reduce(sendbuf: *const c_void, recvbuf: *mut c_void, count: usize, datatype: c_int, op: c_int, root: c_int, comm: *mut c_void) -> c_int;
}