scirs2-sparse 0.4.2

Sparse matrix module for SciRS2 (scirs2-sparse)

//! Memory-efficient algorithms and patterns for sparse matrices
//!
//! This module provides advanced memory optimization techniques for sparse matrix operations,
//! including streaming algorithms, out-of-core processing, and cache-aware implementations.

use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::collections::VecDeque;
use std::fmt::Debug;

// Core SIMD trait used in the generic bounds below
use scirs2_core::simd_ops::SimdUnifiedOps;

/// Memory usage tracking and optimization
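///
/// A minimal usage sketch; the `scirs2_sparse::memory_efficient` import path
/// below is an assumption, not confirmed by this file:
///
/// ```ignore
/// use scirs2_sparse::memory_efficient::MemoryTracker;
///
/// // Budget of 1 KiB; allocate() fails once the budget would be exceeded.
/// let mut tracker = MemoryTracker::new(1024);
/// tracker.allocate(512).unwrap();
/// assert!(tracker.allocate(1024).is_err()); // 512 + 1024 > 1024
/// tracker.deallocate(512);
/// assert_eq!(tracker.current_usage(), 0);
/// assert_eq!(tracker.peak_usage(), 512); // peak persists after deallocation
/// ```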
#[derive(Debug, Clone)]
pub struct MemoryTracker {
    /// Current memory usage estimate
    current_usage: usize,
    /// Peak memory usage observed
    peak_usage: usize,
    /// Memory budget limit
    _memorylimit: usize,
}

impl MemoryTracker {
    /// Create a new memory tracker with given limit
    pub fn new(_memorylimit: usize) -> Self {
        Self {
            current_usage: 0,
            peak_usage: 0,
            _memorylimit,
        }
    }

    /// Allocate memory and track usage
    pub fn allocate(&mut self, size: usize) -> SparseResult<()> {
        // Check limit BEFORE modifying current_usage
        if self.current_usage + size > self._memorylimit {
            return Err(SparseError::ValueError("Memory limit exceeded".to_string()));
        }

        self.current_usage += size;
        self.peak_usage = self.peak_usage.max(self.current_usage);
        Ok(())
    }

    /// Deallocate memory and update tracking
    pub fn deallocate(&mut self, size: usize) {
        self.current_usage = self.current_usage.saturating_sub(size);
    }

    /// Get current memory usage
    pub fn current_usage(&self) -> usize {
        self.current_usage
    }

    /// Get peak memory usage
    pub fn peak_usage(&self) -> usize {
        self.peak_usage
    }

    /// Check if allocation would exceed limit
    pub fn can_allocate(&self, size: usize) -> bool {
        self.current_usage + size <= self._memorylimit
    }
}

/// Memory-efficient sparse matrix-vector multiplication using streaming
///
/// This implementation processes the matrix in chunks to minimize memory usage
/// while maintaining computational efficiency.
///
/// # Arguments
///
/// * `matrix` - The sparse matrix
/// * `x` - The input vector
/// * `chunk_size` - Number of rows to process at once
/// * `memory_tracker` - Optional memory usage tracker
///
/// # Returns
///
/// The result vector y = A * x
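///
/// # Examples
///
/// A minimal sketch mirroring the unit test below; the import paths are
/// assumptions:
///
/// ```ignore
/// use scirs2_core::ndarray::Array1;
/// use scirs2_sparse::csr_array::CsrArray;
/// use scirs2_sparse::memory_efficient::{streaming_sparse_matvec, MemoryTracker};
///
/// // 3x3 matrix with 5 nonzeros, processed two rows at a time.
/// let matrix = CsrArray::from_triplets(
///     &[0, 0, 1, 2, 2],
///     &[0, 2, 1, 0, 2],
///     &[1.0, 2.0, 3.0, 4.0, 5.0],
///     (3, 3),
///     false,
/// )
/// .unwrap();
/// let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
/// let mut tracker = MemoryTracker::new(10_000);
/// let y = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker)).unwrap();
/// assert_eq!(y.to_vec(), vec![7.0, 6.0, 19.0]);
/// ```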
#[allow(dead_code)]
pub fn streaming_sparse_matvec<T, S>(
    matrix: &S,
    x: &ArrayView1<T>,
    chunk_size: usize,
    mut memory_tracker: Option<&mut MemoryTracker>,
) -> SparseResult<Array1<T>>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
    S: SparseArray<T>,
{
    let (rows, cols) = matrix.shape();

    if x.len() != cols {
        return Err(SparseError::DimensionMismatch {
            expected: cols,
            found: x.len(),
        });
    }

    let mut y = Array1::zeros(rows);
    let element_size = std::mem::size_of::<T>();

    if chunk_size == 0 {
        return Err(SparseError::ValueError(
            "chunk_size must be positive".to_string(),
        ));
    }

    // Extract the triplet representation once, before chunking
    let (row_indices, col_indices, values) = matrix.find();

    // Process matrix in chunks
    let num_chunks = rows.div_ceil(chunk_size);

    for chunk_idx in 0..num_chunks {
        let row_start = chunk_idx * chunk_size;
        let row_end = std::cmp::min(row_start + chunk_size, rows);
        let current_chunk_size = row_end - row_start;

        // Estimate memory usage for this chunk
        let chunk_memory = current_chunk_size * cols * element_size; // Worst case

        if let Some(tracker) = memory_tracker.as_mut() {
            if !tracker.can_allocate(chunk_memory) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for chunk processing".to_string(),
                ));
            }
            // Track memory allocation
            tracker.allocate(chunk_memory)?;
        }

        // Accumulate partial results for the rows in this chunk
        let mut chunk_result = vec![T::sparse_zero(); current_chunk_size];

        // Find elements in the current row range
        for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
            if row >= row_start && row < row_end {
                let local_row = row - row_start;
                chunk_result[local_row] = chunk_result[local_row] + values[k] * x[col];
            }
        }

        // Copy results to output vector
        for (i, &val) in chunk_result.iter().enumerate() {
            y[row_start + i] = val;
        }

        // Deallocate chunk memory
        if let Some(tracker) = memory_tracker.as_mut() {
            tracker.deallocate(chunk_memory);
        }
    }

    Ok(y)
}

/// Out-of-core sparse matrix operations
///
/// This struct provides methods for processing matrices that are too large
/// to fit entirely in memory.
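///
/// A minimal sketch of chunked matrix-matrix multiplication; the import
/// paths are assumptions:
///
/// ```ignore
/// use scirs2_sparse::csr_array::CsrArray;
/// use scirs2_sparse::memory_efficient::OutOfCoreProcessor;
///
/// // A = [[2, 0], [1, 3]], B = [[1, 0], [0, 2]]
/// let a = CsrArray::from_triplets(&[0, 1, 1], &[0, 0, 1], &[2.0, 1.0, 3.0], (2, 2), false)
///     .unwrap();
/// let b = CsrArray::from_triplets(&[0, 1], &[0, 1], &[1.0, 2.0], (2, 2), false).unwrap();
///
/// // The memory budget controls how many rows of A are processed per chunk.
/// let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);
/// let c = processor.out_of_core_matmul(&a, &b).unwrap(); // [[2, 0], [1, 6]]
/// assert_eq!(c.get(1, 1), 6.0);
/// ```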
pub struct OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    _memorylimit: usize,
    #[allow(dead_code)]
    chunk_size: usize,
    temp_storage: VecDeque<Vec<T>>,
}

impl<T> OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
{
    /// Create a new out-of-core processor
    pub fn new(_memorylimit: usize) -> Self {
        let chunk_size = _memorylimit / (8 * std::mem::size_of::<T>()); // Conservative estimate

        Self {
            _memorylimit,
            chunk_size,
            temp_storage: VecDeque::new(),
        }
    }

    /// Perform matrix-matrix multiplication out-of-core
    pub fn out_of_core_matmul<S1, S2>(&mut self, a: &S1, b: &S2) -> SparseResult<CsrArray<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if a_cols != b_rows {
            return Err(SparseError::DimensionMismatch {
                expected: a_cols,
                found: b_rows,
            });
        }

        // Calculate optimal chunk size based on memory limit
        let element_size = std::mem::size_of::<T>();
        let max_chunk_size = self._memorylimit / (4 * element_size * b_cols); // Conservative estimate
        let chunk_size = std::cmp::min(max_chunk_size, a_rows).max(1);

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        // Extract A's triplets once before chunking
        let (a_row_indices, a_col_indices, a_values) = a.find();

        // Process A in chunks of rows
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);

            // Collect the rows of A that fall in this chunk
            let mut chunk_a_data = Vec::new();

            // Find all elements in the current row chunk
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    chunk_a_data.push((row - chunk_start, col, a_values[k]));
                }
            }

            // Compute chunk result: chunk_a * B
            let chunk_result =
                self.compute_chunk_product(&chunk_a_data, b, chunk_end - chunk_start, b_cols)?;

            // Add chunk results to final result with row offset
            for (local_row, col, val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, b_cols),
            true,
        )
    }

    /// Compute the product of a chunk of A with B
    fn compute_chunk_product<S>(
        &self,
        chunk_a: &[(usize, usize, T)],
        b: &S,
        chunk_rows: usize,
        b_cols: usize,
    ) -> SparseResult<Vec<(usize, usize, T)>>
    where
        S: SparseArray<T>,
    {
        let mut result = Vec::new();
        let (b_row_indices, b_col_indices, b_values) = b.find();

        // Build a row-indexed map of B so A's column indices can look up B's rows
        let mut b_by_row: std::collections::HashMap<usize, Vec<(usize, T)>> =
            std::collections::HashMap::new();
        for (k, (&row, &col)) in b_row_indices.iter().zip(b_col_indices.iter()).enumerate() {
            b_by_row.entry(row).or_default().push((col, b_values[k]));
        }

        // For each row in the chunk
        for i in 0..chunk_rows {
            // Collect A[i, :] entries
            let mut a_row_entries = Vec::new();
            for &(row, col, val) in chunk_a {
                if row == i {
                    a_row_entries.push((col, val));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut dot_product = T::sparse_zero();

                // Compute A[i, :] · B[:, j]
                for &(a_col, a_val) in &a_row_entries {
                    if let Some(b_row_data) = b_by_row.get(&a_col) {
                        for &(b_col, b_val) in b_row_data {
                            if b_col == j {
                                dot_product = dot_product + a_val * b_val;
                                break;
                            }
                        }
                    }
                }

                if !SparseElement::is_zero(&dot_product) {
                    result.push((i, j, dot_product));
                }
            }
        }

        Ok(result)
    }

    /// Process a chunk of matrix multiplication
    #[allow(dead_code)]
    fn process_chunk_matmul<S1, S2>(
        &mut self,
        _a: &S1,
        _b_csc: &S2,
        _row_start: usize,
        _row_end: usize,
        _b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        // Unused placeholder; chunk processing is handled inline by out_of_core_matmul
        Err(SparseError::ValueError(
            "process_chunk_matmul is not implemented; use out_of_core_matmul".to_string(),
        ))
    }

    #[allow(dead_code)]
    fn process_chunk_matmul_old<S1, S2>(
        &mut self,
        a: &S1,
        b_csc: &S2,
        row_start: usize,
        row_end: usize,
        b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let mut chunk_data = Vec::new();
        let mut chunk_indices = Vec::new();
        let mut chunk_indptr = vec![0];

        let (a_row_indices, a_col_indices, a_values) = a.find();
        let (b_row_indices, b_col_indices, b_values) = b_csc.find();
        let b_indptr = b_csc
            .get_indptr()
            .ok_or_else(|| SparseError::ValueError("CSC matrix must have indptr".to_string()))?;

        for i in row_start..row_end {
            let mut row_data = Vec::new();
            let mut row_indices = Vec::new();

            // Find A's entries for row i
            let mut a_entries = Vec::new();
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row == i {
                    a_entries.push((col, a_values[k]));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut sum = T::sparse_zero();
                let b_col_start = b_indptr[j];
                let b_col_end = b_indptr[j + 1];

                // Compute dot product of A[i,:] and B[:,j]
                for &(a_col, a_val) in &a_entries {
                    for b_idx in b_col_start..b_col_end {
                        if b_row_indices[b_idx] == a_col {
                            sum = sum + a_val * b_values[b_idx];
                            break;
                        }
                    }
                }

                if !SparseElement::is_zero(&sum) {
                    row_data.push(sum);
                    row_indices.push(j);
                }
            }

            chunk_data.extend(row_data);
            chunk_indices.extend(row_indices);
            chunk_indptr.push(chunk_data.len());
        }

        Ok(ChunkResult {
            data: chunk_data,
            indices: chunk_indices,
            indptr: chunk_indptr,
        })
    }

    /// Get memory usage statistics
    pub fn memory_stats(&self) -> (usize, usize) {
        let current_usage = self
            .temp_storage
            .iter()
            .map(|v| v.len() * std::mem::size_of::<T>())
            .sum();
        (current_usage, self._memorylimit)
    }
}

/// Result of processing a chunk
struct ChunkResult<T> {
    #[allow(dead_code)]
    data: Vec<T>,
    #[allow(dead_code)]
    indices: Vec<usize>,
    #[allow(dead_code)]
    indptr: Vec<usize>,
}

/// Cache-aware sparse matrix operations
pub struct CacheAwareOps;

impl CacheAwareOps {
    /// Cache-optimized sparse matrix-vector multiplication
    ///
    /// This implementation optimizes for cache performance by reordering
    /// operations to improve data locality.
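    ///
    /// A minimal sketch; 64 bytes is a common cache-line size, and the
    /// import paths are assumptions:
    ///
    /// ```ignore
    /// use scirs2_core::ndarray::Array1;
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::CacheAwareOps;
    ///
    /// let m = CsrArray::from_triplets(&[0, 1], &[1, 0], &[2.0, 3.0], (2, 2), false).unwrap();
    /// let x = Array1::from_vec(vec![1.0, 1.0]);
    /// let y = CacheAwareOps::cache_optimized_spmv(&m, &x.view(), 64).unwrap();
    /// assert_eq!(y.to_vec(), vec![2.0, 3.0]);
    /// ```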
    pub fn cache_optimized_spmv<T, S>(
        matrix: &S,
        x: &ArrayView1<T>,
        cache_line_size: usize,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if x.len() != cols {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: x.len(),
            });
        }

        let mut y = Array1::zeros(rows);
        // chunks(0) would panic, so clamp to at least one element per cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        // Group operations by cache lines for better locality
        let (row_indices, col_indices, values) = matrix.find();

        // Sort by column to improve x vector cache locality
        let mut sorted_ops: Vec<(usize, usize, T)> = row_indices
            .iter()
            .zip(col_indices.iter())
            .zip(values.iter())
            .map(|((&row, &col), &val)| (row, col, val))
            .collect();

        sorted_ops.sort_by_key(|&(_, col, _)| col);

        // Process in cache-friendly chunks
        for chunk in sorted_ops.chunks(elements_per_cache_line) {
            for &(row, col, val) in chunk {
                y[row] = y[row] + val * x[col];
            }
        }

        Ok(y)
    }

    /// Cache-optimized sparse matrix transpose
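    ///
    /// A minimal sketch (import paths are assumptions):
    ///
    /// ```ignore
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::CacheAwareOps;
    ///
    /// let m = CsrArray::from_triplets(&[0], &[1], &[2.0], (2, 3), false).unwrap();
    /// let t = CacheAwareOps::cache_optimized_transpose(&m, 64).unwrap();
    /// assert_eq!(t.shape(), (3, 2));
    /// assert_eq!(t.get(1, 0), 2.0); // entry (0, 1) moved to (1, 0)
    /// ```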
    pub fn cache_optimized_transpose<T, S>(
        matrix: &S,
        cache_line_size: usize,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let (row_indices, col_indices, values) = matrix.find();

        // Group operations by cache lines
        // step_by(0) would panic, so clamp to at least one element per cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        let mut transposed_triplets = Vec::new();

        // Process in cache-friendly chunks
        for chunk_start in (0..row_indices.len()).step_by(elements_per_cache_line) {
            let chunk_end = std::cmp::min(chunk_start + elements_per_cache_line, row_indices.len());

            for k in chunk_start..chunk_end {
                transposed_triplets.push((col_indices[k], row_indices[k], values[k]));
            }
        }

        // Sort by new row index (original column)
        transposed_triplets.sort_by_key(|&(new_row, _, _)| new_row);

        let new_rows: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(new_row, _, _)| new_row)
            .collect();
        let new_cols: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(_, new_col, _)| new_col)
            .collect();
        let new_values: Vec<T> = transposed_triplets.iter().map(|&(_, _, val)| val).collect();

        CsrArray::from_triplets(&new_rows, &new_cols, &new_values, (cols, rows), false)
    }
}

/// Memory pool for efficient allocation and reuse
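///
/// A minimal sketch of buffer reuse (import path is an assumption):
///
/// ```ignore
/// use scirs2_sparse::memory_efficient::MemoryPool;
///
/// // Keep at most 4 returned buffers cached for reuse.
/// let mut pool = MemoryPool::<f64>::new(4);
/// let buf = pool.allocate(128); // fresh, zero-filled buffer
/// pool.deallocate(buf);         // cleared and cached in the pool
/// let buf2 = pool.allocate(64); // reuses the cached buffer, resized
/// assert_eq!(buf2.len(), 64);
/// ```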
pub struct MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    available_buffers: Vec<Vec<T>>,
    allocated_buffers: Vec<Vec<T>>,
    _pool_sizelimit: usize,
}

impl<T> MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    /// Create a new memory pool
    pub fn new(_pool_sizelimit: usize) -> Self {
        Self {
            available_buffers: Vec::new(),
            allocated_buffers: Vec::new(),
            _pool_sizelimit,
        }
    }

    /// Allocate a buffer from the pool
    pub fn allocate(&mut self, size: usize) -> Vec<T> {
        if let Some(mut buffer) = self.available_buffers.pop() {
            buffer.resize(size, T::sparse_zero());
            buffer
        } else {
            vec![T::sparse_zero(); size]
        }
    }

    /// Return a buffer to the pool
    pub fn deallocate(&mut self, mut buffer: Vec<T>) {
        if self.available_buffers.len() < self._pool_sizelimit {
            buffer.clear();
            self.available_buffers.push(buffer);
        }
        // If pool is full, let the buffer be dropped
    }

    /// Get pool statistics
    pub fn stats(&self) -> (usize, usize) {
        (self.available_buffers.len(), self.allocated_buffers.len())
    }
}

/// Chunked sparse matrix operations for memory efficiency
pub struct ChunkedOperations;

impl ChunkedOperations {
    /// Memory-efficient sparse matrix addition using chunking
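    ///
    /// A minimal sketch adding two diagonal matrices in row chunks of 2
    /// (import paths are assumptions):
    ///
    /// ```ignore
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::ChunkedOperations;
    ///
    /// let a = CsrArray::from_triplets(&[0, 1], &[0, 1], &[1.0, 2.0], (2, 2), false).unwrap();
    /// let b = CsrArray::from_triplets(&[0, 1], &[0, 1], &[3.0, 4.0], (2, 2), false).unwrap();
    /// let c = ChunkedOperations::chunked_sparse_add(&a, &b, 2, None).unwrap();
    /// assert_eq!(c.get(0, 0), 4.0);
    /// assert_eq!(c.get(1, 1), 6.0);
    /// ```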
    pub fn chunked_sparse_add<T, S1, S2>(
        a: &S1,
        b: &S2,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float
            + SparseElement
            + Debug
            + Copy
            + 'static
            + SimdUnifiedOps
            + Send
            + Sync
            + std::ops::AddAssign,
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if (a_rows, a_cols) != (b_rows, b_cols) {
            return Err(SparseError::ShapeMismatch {
                expected: (a_rows, a_cols),
                found: (b_rows, b_cols),
            });
        }

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        if chunk_size == 0 {
            return Err(SparseError::ValueError(
                "chunk_size must be positive".to_string(),
            ));
        }

        // Extract elements from both matrices once
        let (a_rows_idx, a_cols_idx, a_values) = a.find();
        let (b_rows_idx, b_cols_idx, b_values) = b.find();

        // Process matrices in row chunks
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * a_cols * element_size * 2; // For both matrices

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked addition".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Use HashMap to efficiently combine elements
            let mut chunk_result: std::collections::HashMap<(usize, usize), T> =
                std::collections::HashMap::new();

            // Add elements from matrix A
            for (k, (&row, &col)) in a_rows_idx.iter().zip(a_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    *chunk_result
                        .entry((row - chunk_start, col))
                        .or_insert_with(T::sparse_zero) += a_values[k];
                }
            }

            // Add elements from matrix B, accumulating where the patterns overlap
            for (k, (&row, &col)) in b_rows_idx.iter().zip(b_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    *chunk_result
                        .entry((row - chunk_start, col))
                        .or_insert_with(T::sparse_zero) += b_values[k];
                }
            }

            // Add non-zero results to final triplets
            for ((local_row, col), val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, a_cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix scaling using chunking
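    ///
    /// A minimal sketch scaling a diagonal matrix by 2.0, one row per chunk
    /// (import paths are assumptions):
    ///
    /// ```ignore
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::ChunkedOperations;
    ///
    /// let m = CsrArray::from_triplets(&[0, 1], &[0, 1], &[1.0, 2.0], (2, 2), false).unwrap();
    /// let scaled = ChunkedOperations::chunked_sparse_scale(&m, 2.0, 1, None).unwrap();
    /// assert_eq!(scaled.get(1, 1), 4.0);
    /// ```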
    pub fn chunked_sparse_scale<T, S>(
        matrix: &S,
        scalar: T,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        if chunk_size == 0 {
            return Err(SparseError::ValueError(
                "chunk_size must be positive".to_string(),
            ));
        }

        // Extract the triplets once before chunking
        let (row_indices, col_indices, values) = matrix.find();

        // Process matrix in chunks
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked scaling".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Scale the elements that fall in the current chunk

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let scaled_value = values[k] * scalar;
                    if !SparseElement::is_zero(&scaled_value) {
                        result_rows.push(row);
                        result_cols.push(col);
                        result_values.push(scaled_value);
                    }
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix conversion with chunking
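    ///
    /// A minimal round-trip sketch (import paths are assumptions):
    ///
    /// ```ignore
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::ChunkedOperations;
    ///
    /// let m = CsrArray::from_triplets(&[0, 1], &[0, 1], &[1.0, 2.0], (2, 2), false).unwrap();
    /// let converted = ChunkedOperations::chunked_format_conversion(&m, 1, None).unwrap();
    /// assert_eq!(converted.nnz(), m.nnz()); // same entries, rebuilt chunk by chunk
    /// ```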
    pub fn chunked_format_conversion<T, S>(
        matrix: &S,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut all_triplets = Vec::new();

        let element_size = std::mem::size_of::<T>();

        if chunk_size == 0 {
            return Err(SparseError::ValueError(
                "chunk_size must be positive".to_string(),
            ));
        }

        // Extract the triplets once before chunking
        let (row_indices, col_indices, values) = matrix.find();

        // Process in chunks to minimize peak memory usage
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for format conversion".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Collect the triplets that fall in this chunk
            let mut chunk_triplets = Vec::new();

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end && !SparseElement::is_zero(&values[k]) {
                    chunk_triplets.push((row, col, values[k]));
                }
            }

            all_triplets.extend(chunk_triplets);

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        // Create the final matrix from all triplets
        let result_rows: Vec<usize> = all_triplets.iter().map(|&(r, _, _)| r).collect();
        let result_cols: Vec<usize> = all_triplets.iter().map(|&(_, c, _)| c).collect();
        let result_values: Vec<T> = all_triplets.iter().map(|&(_, _, v)| v).collect();

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient bandwidth reduction using reverse Cuthill-McKee algorithm
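    ///
    /// Returns the reverse Cuthill-McKee ordering together with the matrix
    /// reordered by that permutation. A minimal sketch (import paths are
    /// assumptions):
    ///
    /// ```ignore
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::ChunkedOperations;
    ///
    /// let m = CsrArray::from_triplets(
    ///     &[0, 0, 1, 1, 2, 2, 3, 3],
    ///     &[0, 3, 1, 2, 1, 2, 0, 3],
    ///     &[1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 4.0],
    ///     (4, 4),
    ///     false,
    /// )
    /// .unwrap();
    /// let (ordering, reordered) = ChunkedOperations::bandwidth_reduction(&m, None).unwrap();
    /// assert_eq!(ordering.len(), 4);        // a permutation of 0..4
    /// assert_eq!(reordered.nnz(), m.nnz()); // same nonzeros, relabeled
    /// ```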
    pub fn bandwidth_reduction<T, S>(
        matrix: &S,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<(Vec<usize>, CsrArray<T>)>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if rows != cols {
            return Err(SparseError::ValueError(
                "Bandwidth reduction requires square matrix".to_string(),
            ));
        }

        let element_size = std::mem::size_of::<usize>();
        let memory_needed = rows * element_size * 4; // Conservative estimate

        if let Some(ref mut tracker) = memory_tracker {
            if !tracker.can_allocate(memory_needed) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for bandwidth reduction".to_string(),
                ));
            }
            tracker.allocate(memory_needed)?;
        }

        // Build adjacency list representation
        let (row_indices, col_indices, _) = matrix.find();
        let mut adj_list: Vec<Vec<usize>> = vec![Vec::new(); rows];

        for (&row, &col) in row_indices.iter().zip(col_indices.iter()) {
            if row != col {
                adj_list[row].push(col);
                adj_list[col].push(row);
            }
        }

        // Remove duplicates and sort adjacency lists
        for neighbors in &mut adj_list {
            neighbors.sort_unstable();
            neighbors.dedup();
        }

        // Find vertex with minimum degree as starting point
        let start_vertex = (0..rows).min_by_key(|&v| adj_list[v].len()).unwrap_or(0);

        // Reverse Cuthill-McKee ordering
        let mut ordering = Vec::new();
        let mut visited = vec![false; rows];
        let mut queue = VecDeque::new();

        // BFS from start vertex
        queue.push_back(start_vertex);
        visited[start_vertex] = true;

        while let Some(current) = queue.pop_front() {
            ordering.push(current);

            // Add unvisited neighbors sorted by degree
            let mut neighbors = adj_list[current]
                .iter()
                .filter(|&&v| !visited[v])
                .map(|&v| (adj_list[v].len(), v))
                .collect::<Vec<_>>();

            neighbors.sort_unstable();

            for (_, neighbor) in neighbors {
                if !visited[neighbor] {
                    visited[neighbor] = true;
                    queue.push_back(neighbor);
                }
            }
        }

        // Add any remaining unvisited vertices
        for (v, &is_visited) in visited.iter().enumerate().take(rows) {
            if !is_visited {
                ordering.push(v);
            }
        }

        // Reverse the ordering for better bandwidth reduction
        ordering.reverse();

        // Create permutation matrix and apply reordering
        let mut perm_rows = Vec::new();
        let mut perm_cols = Vec::new();
        let mut perm_values = Vec::new();

        let (orig_rows, orig_cols, orig_values) = matrix.find();

        // Create inverse permutation for quick lookup
        let mut inv_perm = vec![0; rows];
        for (new_idx, &old_idx) in ordering.iter().enumerate() {
            inv_perm[old_idx] = new_idx;
        }

        // Apply permutation to matrix elements
        for (k, (&row, &col)) in orig_rows.iter().zip(orig_cols.iter()).enumerate() {
            let new_row = inv_perm[row];
            let new_col = inv_perm[col];
            perm_rows.push(new_row);
            perm_cols.push(new_col);
            perm_values.push(orig_values[k]);
        }

        let reordered_matrix =
            CsrArray::from_triplets(&perm_rows, &perm_cols, &perm_values, (rows, cols), false)?;

        if let Some(ref mut tracker) = memory_tracker {
            tracker.deallocate(memory_needed);
        }

        Ok((ordering, reordered_matrix))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::csr_array::CsrArray;
    use approx::assert_relative_eq;

    #[test]
    fn test_memory_tracker() {
        let mut tracker = MemoryTracker::new(1000);

        // Test allocation
        assert!(tracker.allocate(500).is_ok());
        assert_eq!(tracker.current_usage(), 500);
        assert_eq!(tracker.peak_usage(), 500);

        // Test over-allocation
        assert!(tracker.allocate(600).is_err());

        // Test deallocation
        tracker.deallocate(200);
        assert_eq!(tracker.current_usage(), 300);
        assert_eq!(tracker.peak_usage(), 500); // Peak should remain

        // Test can_allocate
        assert!(tracker.can_allocate(700));
        assert!(!tracker.can_allocate(800));
    }

    #[test]
    fn test_streaming_sparse_matvec() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let mut tracker = MemoryTracker::new(10000);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker))
            .expect("Operation failed");

        // Expected: [1*1 + 2*3, 3*2, 4*1 + 5*3] = [7, 6, 19]
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        assert!(tracker.peak_usage() > 0);
    }

    #[test]
    fn test_out_of_core_processor() {
        let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);

        // Create small test matrices
        // A = [[2, 0], [1, 3]]
        let rows_a = vec![0, 1, 1];
        let cols_a = vec![0, 0, 1];
        let data_a = vec![2.0, 1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (2, 2), false)
            .expect("Operation failed");

        // B = [[1, 0], [0, 2]]
        let rows_b = vec![0, 1];
        let cols_b = vec![0, 1];
        let data_b = vec![1.0, 2.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (2, 2), false)
            .expect("Operation failed");

        let result = processor
            .out_of_core_matmul(&matrix_a, &matrix_b)
            .expect("Operation failed");

        // Verify result dimensions
        assert_eq!(result.shape(), (2, 2));

        // Expected result: A * B = [[2*1, 0], [1*1, 3*2]] = [[2, 0], [1, 6]]
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(0, 1), 0.0);
        assert_relative_eq!(result.get(1, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 6.0);

        let (current, limit) = processor.memory_stats();
        assert!(current <= limit);
    }

    #[test]
    fn test_cache_aware_ops() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Test cache-optimized SpMV
        let result =
            CacheAwareOps::cache_optimized_spmv(&matrix, &x.view(), 64).expect("Operation failed");
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        // Test cache-optimized transpose
        let transposed =
            CacheAwareOps::cache_optimized_transpose(&matrix, 64).expect("Operation failed");
        assert_eq!(transposed.shape(), (3, 3));

        // Verify transpose correctness
        assert_relative_eq!(transposed.get(0, 0), 1.0); // Original (0,0)
        assert_relative_eq!(transposed.get(2, 0), 2.0); // Original (0,2) -> (2,0)
        assert_relative_eq!(transposed.get(1, 1), 3.0); // Original (1,1)
    }

    #[test]
    fn test_memory_pool() {
        let mut pool = MemoryPool::<f64>::new(5);

        // Allocate buffer
        let buffer1 = pool.allocate(100);
        assert_eq!(buffer1.len(), 100);

        // Return buffer to pool
        pool.deallocate(buffer1);

        let (available, allocated) = pool.stats();
        assert_eq!(available, 1);
        assert_eq!(allocated, 0);

        // Allocate again (should reuse buffer)
        let buffer2 = pool.allocate(50);
        assert_eq!(buffer2.len(), 50);

        pool.deallocate(buffer2);
    }

    #[test]
    fn test_streaming_memory_limit() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 1, Some(&mut tracker));

        // Should fail due to memory limit
        assert!(result.is_err());
    }

    #[test]
    fn test_chunked_sparse_add() {
        // Create two test matrices
        let rows_a = vec![0, 1, 2];
        let cols_a = vec![0, 1, 2];
        let data_a = vec![1.0, 2.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (3, 3), false)
            .expect("Operation failed");

        let rows_b = vec![0, 1, 2];
        let cols_b = vec![0, 1, 2];
        let data_b = vec![4.0, 5.0, 6.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (3, 3), false)
            .expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result =
            ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 2, Some(&mut tracker))
                .expect("Operation failed");

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check values: A + B should have diagonal elements [5, 7, 9]
        assert_relative_eq!(result.get(0, 0), 5.0);
        assert_relative_eq!(result.get(1, 1), 7.0);
        assert_relative_eq!(result.get(2, 2), 9.0);
    }

    #[test]
    fn test_chunked_sparse_scale() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result = ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 2, Some(&mut tracker))
            .expect("Operation failed");

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check scaled values
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(1, 1), 4.0);
        assert_relative_eq!(result.get(2, 2), 6.0);
    }

    #[test]
    fn test_chunked_format_conversion() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result = ChunkedOperations::chunked_format_conversion(&matrix, 2, Some(&mut tracker))
            .expect("Operation failed");

        // Should be identical to original
        assert_eq!(result.shape(), matrix.shape());
        assert_eq!(result.nnz(), matrix.nnz());

        // Check values preserved
        assert_relative_eq!(result.get(0, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 2.0);
        assert_relative_eq!(result.get(2, 2), 3.0);
    }

    #[test]
    fn test_bandwidth_reduction() {
        // Create a matrix with some structure
        let rows = vec![0, 0, 1, 1, 2, 2, 3, 3];
        let cols = vec![0, 3, 1, 2, 1, 2, 0, 3];
        let data = vec![1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 4.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (4, 4), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(100000);
        let (ordering, reordered) =
            ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker))
                .expect("Operation failed");

        // Check that we got an ordering
        assert_eq!(ordering.len(), 4);

        // Check that reordered matrix has same dimensions
        assert_eq!(reordered.shape(), (4, 4));
        assert_eq!(reordered.nnz(), matrix.nnz());

        // Check that it's a valid permutation
        let mut sorted_ordering = ordering.clone();
        sorted_ordering.sort_unstable();
        assert_eq!(sorted_ordering, vec![0, 1, 2, 3]);
    }

    #[test]
    fn test_chunked_operations_memory_limit() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);

        // All chunked operations should fail with insufficient memory
        assert!(
            ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(
            ChunkedOperations::chunked_format_conversion(&matrix, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker)).is_err());
    }

    #[test]
    fn test_chunked_add_different_sparsity_patterns() {
        // Create matrices with different sparsity patterns
        let rows_a = vec![0, 2];
        let cols_a = vec![0, 2];
        let data_a = vec![1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (3, 3), false)
            .expect("Operation failed");

        let rows_b = vec![1, 2];
        let cols_b = vec![1, 0];
        let data_b = vec![2.0, 1.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (3, 3), false)
            .expect("Operation failed");

        let result = ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 1, None)
            .expect("Operation failed");

        // Check that all elements are preserved
        assert_relative_eq!(result.get(0, 0), 1.0); // From A
        assert_relative_eq!(result.get(1, 1), 2.0); // From B
        assert_relative_eq!(result.get(2, 2), 3.0); // From A
        assert_relative_eq!(result.get(2, 0), 1.0); // From B
    }
}