scirs2_sparse/memory_efficient.rs

//! Memory-efficient algorithms and patterns for sparse matrices
//!
//! This module provides advanced memory optimization techniques for sparse matrix operations,
//! including streaming algorithms, out-of-core processing, and cache-aware implementations.

use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::collections::VecDeque;
use std::fmt::Debug;

// Import core utilities for memory management
use scirs2_core::simd_ops::SimdUnifiedOps;

/// Memory usage tracking and optimization
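///
/// # Example
///
/// A minimal usage sketch (not compiled here; the numbers mirror the unit tests
/// at the bottom of this module):
///
/// ```ignore
/// let mut tracker = MemoryTracker::new(1_000);
/// tracker.allocate(500)?;             // counted against the 1,000-unit budget
/// assert!(tracker.can_allocate(400)); // 500 + 400 <= 1_000
/// tracker.deallocate(200);            // peak_usage() still reports 500
/// ```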
#[derive(Debug, Clone)]
pub struct MemoryTracker {
    /// Current memory usage estimate
    current_usage: usize,
    /// Peak memory usage observed
    peak_usage: usize,
    /// Memory budget limit
    _memorylimit: usize,
}

impl MemoryTracker {
    /// Create a new memory tracker with given limit
    pub fn new(_memorylimit: usize) -> Self {
        Self {
            current_usage: 0,
            peak_usage: 0,
            _memorylimit,
        }
    }

    /// Allocate memory and track usage
    pub fn allocate(&mut self, size: usize) -> SparseResult<()> {
        // Reject the request up front so a failed allocation does not distort
        // the usage and peak statistics.
        if self.current_usage + size > self._memorylimit {
            return Err(SparseError::ValueError("Memory limit exceeded".to_string()));
        }

        self.current_usage += size;
        self.peak_usage = self.peak_usage.max(self.current_usage);
        Ok(())
    }

    /// Deallocate memory and update tracking
    pub fn deallocate(&mut self, size: usize) {
        self.current_usage = self.current_usage.saturating_sub(size);
    }

    /// Get current memory usage
    pub fn current_usage(&self) -> usize {
        self.current_usage
    }

    /// Get peak memory usage
    pub fn peak_usage(&self) -> usize {
        self.peak_usage
    }

    /// Check if allocation would exceed limit
    pub fn can_allocate(&self, size: usize) -> bool {
        self.current_usage + size <= self._memorylimit
    }
}

/// Memory-efficient sparse matrix-vector multiplication using streaming
///
/// This implementation processes the matrix in chunks to minimize memory usage
/// while maintaining computational efficiency.
///
/// # Arguments
///
/// * `matrix` - The sparse matrix
/// * `x` - The input vector
/// * `chunk_size` - Number of rows to process at once
/// * `memory_tracker` - Optional memory usage tracker
///
/// # Returns
///
/// The result vector y = A * x
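///
/// # Example
///
/// A minimal sketch based on the unit tests below (marked `ignore`; it assumes
/// `CsrArray::from_triplets` and `MemoryTracker` as defined in this crate):
///
/// ```ignore
/// let rows = vec![0, 0, 1, 2, 2];
/// let cols = vec![0, 2, 1, 0, 2];
/// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
/// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false)?;
/// let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
///
/// let mut tracker = MemoryTracker::new(10_000);
/// // Process two rows per chunk; y = A * x = [7, 6, 19]
/// let y = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker))?;
/// ```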
#[allow(dead_code)]
pub fn streaming_sparse_matvec<T, S>(
    matrix: &S,
    x: &ArrayView1<T>,
    chunk_size: usize,
    mut memory_tracker: Option<&mut MemoryTracker>,
) -> SparseResult<Array1<T>>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
    S: SparseArray<T>,
{
    let (rows, cols) = matrix.shape();

    if x.len() != cols {
        return Err(SparseError::DimensionMismatch {
            expected: cols,
            found: x.len(),
        });
    }

    let mut y = Array1::zeros(rows);
    let element_size = std::mem::size_of::<T>();

    // Process matrix in chunks
    let num_chunks = rows.div_ceil(chunk_size);

    for chunk_idx in 0..num_chunks {
        let row_start = chunk_idx * chunk_size;
        let row_end = std::cmp::min(row_start + chunk_size, rows);
        let current_chunk_size = row_end - row_start;

        // Estimate memory usage for this chunk
        let chunk_memory = current_chunk_size * cols * element_size; // Worst case

        if let Some(tracker) = memory_tracker.as_ref() {
            if !tracker.can_allocate(chunk_memory) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for chunk processing".to_string(),
                ));
            }
        }

        // Track memory allocation
        if let Some(tracker) = memory_tracker.as_mut() {
            tracker.allocate(chunk_memory)?;
        }

        // Extract chunk data
        let (row_indices, col_indices, values) = matrix.find();
        let mut chunk_result = vec![T::sparse_zero(); current_chunk_size];

        // Find elements in the current row range
        for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
            if row >= row_start && row < row_end {
                let local_row = row - row_start;
                chunk_result[local_row] = chunk_result[local_row] + values[k] * x[col];
            }
        }

        // Copy results to output vector
        for (i, &val) in chunk_result.iter().enumerate() {
            y[row_start + i] = val;
        }

        // Deallocate chunk memory
        if let Some(tracker) = memory_tracker.as_mut() {
            tracker.deallocate(chunk_memory);
        }
    }

    Ok(y)
}

/// Out-of-core sparse matrix operations
///
/// This struct provides methods for processing matrices that are too large
/// to fit entirely in memory.
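///
/// # Example
///
/// A sketch of chunked matrix-matrix multiplication (marked `ignore`; the matrices
/// and the 1 MB budget mirror the unit tests below rather than a recommended setting):
///
/// ```ignore
/// let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);
///
/// // A = [[2, 0], [1, 3]], B = [[1, 0], [0, 2]]
/// let a = CsrArray::from_triplets(&vec![0, 1, 1], &vec![0, 0, 1], &vec![2.0, 1.0, 3.0], (2, 2), false)?;
/// let b = CsrArray::from_triplets(&vec![0, 1], &vec![0, 1], &vec![1.0, 2.0], (2, 2), false)?;
///
/// let c = processor.out_of_core_matmul(&a, &b)?; // [[2, 0], [1, 6]]
/// ```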
pub struct OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    _memorylimit: usize,
    #[allow(dead_code)]
    chunk_size: usize,
    temp_storage: VecDeque<Vec<T>>,
}

impl<T> OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
{
    /// Create a new out-of-core processor
    pub fn new(_memorylimit: usize) -> Self {
        let chunk_size = _memorylimit / (8 * std::mem::size_of::<T>()); // Conservative estimate

        Self {
            _memorylimit,
            chunk_size,
            temp_storage: VecDeque::new(),
        }
    }

    /// Perform matrix-matrix multiplication out-of-core
    pub fn out_of_core_matmul<S1, S2>(&mut self, a: &S1, b: &S2) -> SparseResult<CsrArray<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if a_cols != b_rows {
            return Err(SparseError::DimensionMismatch {
                expected: a_cols,
                found: b_rows,
            });
        }

        // Calculate optimal chunk size based on memory limit
        let element_size = std::mem::size_of::<T>();
        let max_chunk_size = self._memorylimit / (4 * element_size * b_cols); // Conservative estimate
        let chunk_size = std::cmp::min(max_chunk_size, a_rows).max(1);

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        // Process A in chunks of rows
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);

            // Extract chunk of A
            let (a_row_indices, a_col_indices, a_values) = a.find();
            let mut chunk_a_data = Vec::new();

            // Find all elements in the current row chunk
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    chunk_a_data.push((row - chunk_start, col, a_values[k]));
                }
            }

            // Compute chunk result: chunk_a * B
            let chunk_result =
                self.compute_chunk_product(&chunk_a_data, b, chunk_end - chunk_start, b_cols)?;

            // Add chunk results to final result with row offset
            for (local_row, col, val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, b_cols),
            true,
        )
    }

    /// Compute the product of a chunk of A with B
    fn compute_chunk_product<S>(
        &self,
        chunk_a: &[(usize, usize, T)],
        b: &S,
        chunk_rows: usize,
        b_cols: usize,
    ) -> SparseResult<Vec<(usize, usize, T)>>
    where
        S: SparseArray<T>,
    {
        let mut result = Vec::new();
        let (b_row_indices, b_col_indices, b_values) = b.find();

        // Build a row-indexed map of B so A's column indices can be matched quickly
        let mut b_by_row: std::collections::HashMap<usize, Vec<(usize, T)>> =
            std::collections::HashMap::new();
        for (k, (&row, &col)) in b_row_indices.iter().zip(b_col_indices.iter()).enumerate() {
            b_by_row.entry(row).or_default().push((col, b_values[k]));
        }

        // For each row in the chunk
        for i in 0..chunk_rows {
            // Collect A[i, :] entries
            let mut a_row_entries = Vec::new();
            for &(row, col, val) in chunk_a {
                if row == i {
                    a_row_entries.push((col, val));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut dot_product = T::sparse_zero();

                // Compute A[i, :] · B[:, j]
                for &(a_col, a_val) in &a_row_entries {
                    if let Some(b_row_data) = b_by_row.get(&a_col) {
                        for &(b_col, b_val) in b_row_data {
                            if b_col == j {
                                dot_product = dot_product + a_val * b_val;
                                break;
                            }
                        }
                    }
                }

                if !SparseElement::is_zero(&dot_product) {
                    result.push((i, j, dot_product));
                }
            }
        }

        Ok(result)
    }

    /// Process a chunk of matrix multiplication
    #[allow(dead_code)]
    fn process_chunk_matmul<S1, S2>(
        &mut self,
        _a: &S1,
        _b_csc: &S2,
        _row_start: usize,
        _row_end: usize,
        _b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        // Not implemented; out_of_core_matmul uses compute_chunk_product instead of this helper
        Err(SparseError::ValueError(
            "Process chunk matmul not implemented".to_string(),
        ))
    }

    #[allow(dead_code)]
    fn process_chunk_matmul_old<S1, S2>(
        &mut self,
        a: &S1,
        b_csc: &S2,
        row_start: usize,
        row_end: usize,
        b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let mut chunk_data = Vec::new();
        let mut chunk_indices = Vec::new();
        let mut chunk_indptr = vec![0];

        let (a_row_indices, a_col_indices, a_values) = a.find();
        let (b_row_indices, b_col_indices, b_values) = b_csc.find();
        let b_indptr = b_csc
            .get_indptr()
            .ok_or_else(|| SparseError::ValueError("CSC matrix must have indptr".to_string()))?;

        for i in row_start..row_end {
            let mut row_data = Vec::new();
            let mut row_indices = Vec::new();

            // Find A's entries for row i
            let mut a_entries = Vec::new();
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row == i {
                    a_entries.push((col, a_values[k]));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut sum = T::sparse_zero();
                let b_col_start = b_indptr[j];
                let b_col_end = b_indptr[j + 1];

                // Compute dot product of A[i,:] and B[:,j]
                for &(a_col, a_val) in &a_entries {
                    for b_idx in b_col_start..b_col_end {
                        if b_row_indices[b_idx] == a_col {
                            sum = sum + a_val * b_values[b_idx];
                            break;
                        }
                    }
                }

                if !SparseElement::is_zero(&sum) {
                    row_data.push(sum);
                    row_indices.push(j);
                }
            }

            chunk_data.extend(row_data);
            chunk_indices.extend(row_indices);
            chunk_indptr.push(chunk_data.len());
        }

        Ok(ChunkResult {
            data: chunk_data,
            indices: chunk_indices,
            indptr: chunk_indptr,
        })
    }

    /// Get memory usage statistics
    pub fn memory_stats(&self) -> (usize, usize) {
        let current_usage = self
            .temp_storage
            .iter()
            .map(|v| v.len() * std::mem::size_of::<T>())
            .sum();
        (current_usage, self._memorylimit)
    }
}

/// Result of processing a chunk
struct ChunkResult<T> {
    #[allow(dead_code)]
    data: Vec<T>,
    #[allow(dead_code)]
    indices: Vec<usize>,
    #[allow(dead_code)]
    indptr: Vec<usize>,
}

/// Cache-aware sparse matrix operations
pub struct CacheAwareOps;

impl CacheAwareOps {
    /// Cache-optimized sparse matrix-vector multiplication
    ///
    /// This implementation optimizes for cache performance by reordering
    /// operations to improve data locality.
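    ///
    /// # Example
    ///
    /// A sketch mirroring the unit tests below (marked `ignore`; 64 stands in for the
    /// cache line size in bytes):
    ///
    /// ```ignore
    /// let rows = vec![0, 0, 1, 2, 2];
    /// let cols = vec![0, 2, 1, 0, 2];
    /// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
    /// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false)?;
    /// let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);
    ///
    /// // Entries are visited in column-sorted order; y = [7.0, 6.0, 19.0]
    /// let y = CacheAwareOps::cache_optimized_spmv(&matrix, &x.view(), 64)?;
    /// ```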
    pub fn cache_optimized_spmv<T, S>(
        matrix: &S,
        x: &ArrayView1<T>,
        cache_line_size: usize,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if x.len() != cols {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: x.len(),
            });
        }

        let mut y = Array1::zeros(rows);
        // Guard against a zero chunk size when T is larger than a cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        // Group operations by cache lines for better locality
        let (row_indices, col_indices, values) = matrix.find();

        // Sort by column to improve x vector cache locality
        let mut sorted_ops: Vec<(usize, usize, T)> = row_indices
            .iter()
            .zip(col_indices.iter())
            .zip(values.iter())
            .map(|((&row, &col), &val)| (row, col, val))
            .collect();

        sorted_ops.sort_by_key(|&(_, col_, _)| col_);

        // Process in cache-friendly chunks
        for chunk in sorted_ops.chunks(elements_per_cache_line) {
            for &(row, col, val) in chunk {
                y[row] = y[row] + val * x[col];
            }
        }

        Ok(y)
    }

    /// Cache-optimized sparse matrix transpose
    pub fn cache_optimized_transpose<T, S>(
        matrix: &S,
        cache_line_size: usize,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let (row_indices, col_indices, values) = matrix.find();

        // Group operations by cache lines
        // Guard against a zero step when T is larger than a cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        let mut transposed_triplets = Vec::new();

        // Process in cache-friendly chunks
        for chunk_start in (0..row_indices.len()).step_by(elements_per_cache_line) {
            let chunk_end = std::cmp::min(chunk_start + elements_per_cache_line, row_indices.len());

            for k in chunk_start..chunk_end {
                transposed_triplets.push((col_indices[k], row_indices[k], values[k]));
            }
        }

        // Sort by new row index (original column)
        transposed_triplets.sort_by_key(|&(new_row_, _, _)| new_row_);

        let new_rows: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(new_row_, _, _)| new_row_)
            .collect();
        let new_cols: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(_, new_col_, _)| new_col_)
            .collect();
        let new_values: Vec<T> = transposed_triplets.iter().map(|&(_, _, val)| val).collect();

        CsrArray::from_triplets(&new_rows, &new_cols, &new_values, (cols, rows), false)
    }
}

/// Memory pool for efficient allocation and reuse
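///
/// # Example
///
/// A minimal sketch (marked `ignore`; the capacity of five retained buffers follows
/// the unit tests below):
///
/// ```ignore
/// let mut pool = MemoryPool::<f64>::new(5);
/// let buffer = pool.allocate(100);   // zero-filled, length 100
/// pool.deallocate(buffer);           // cleared and kept for reuse
/// let (available, _allocated) = pool.stats();
/// assert_eq!(available, 1);
/// ```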
pub struct MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    available_buffers: Vec<Vec<T>>,
    allocated_buffers: Vec<Vec<T>>,
    _pool_sizelimit: usize,
}

impl<T> MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    /// Create a new memory pool
    pub fn new(_pool_sizelimit: usize) -> Self {
        Self {
            available_buffers: Vec::new(),
            allocated_buffers: Vec::new(),
            _pool_sizelimit,
        }
    }

    /// Allocate a buffer from the pool
    pub fn allocate(&mut self, size: usize) -> Vec<T> {
        if let Some(mut buffer) = self.available_buffers.pop() {
            buffer.resize(size, T::sparse_zero());
            buffer
        } else {
            vec![T::sparse_zero(); size]
        }
    }

    /// Return a buffer to the pool
    pub fn deallocate(&mut self, mut buffer: Vec<T>) {
        if self.available_buffers.len() < self._pool_sizelimit {
            buffer.clear();
            self.available_buffers.push(buffer);
        }
        // If pool is full, let the buffer be dropped
    }

    /// Get pool statistics
    pub fn stats(&self) -> (usize, usize) {
        (self.available_buffers.len(), self.allocated_buffers.len())
    }
}

/// Chunked sparse matrix operations for memory efficiency
pub struct ChunkedOperations;

impl ChunkedOperations {
    /// Memory-efficient sparse matrix addition using chunking
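    ///
    /// # Example
    ///
    /// A sketch based on the unit tests below (marked `ignore`; the 10,000-unit
    /// budget is arbitrary):
    ///
    /// ```ignore
    /// let a = CsrArray::from_triplets(&vec![0, 1, 2], &vec![0, 1, 2], &vec![1.0, 2.0, 3.0], (3, 3), false)?;
    /// let b = CsrArray::from_triplets(&vec![0, 1, 2], &vec![0, 1, 2], &vec![4.0, 5.0, 6.0], (3, 3), false)?;
    ///
    /// let mut tracker = MemoryTracker::new(10_000);
    /// // Two rows per chunk; the diagonal of the sum is [5.0, 7.0, 9.0]
    /// let sum = ChunkedOperations::chunked_sparse_add(&a, &b, 2, Some(&mut tracker))?;
    /// ```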
    pub fn chunked_sparse_add<T, S1, S2>(
        a: &S1,
        b: &S2,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float
            + SparseElement
            + Debug
            + Copy
            + 'static
            + SimdUnifiedOps
            + Send
            + Sync
            + std::ops::AddAssign,
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if (a_rows, a_cols) != (b_rows, b_cols) {
            return Err(SparseError::ShapeMismatch {
                expected: (a_rows, a_cols),
                found: (b_rows, b_cols),
            });
        }

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Extract elements from both matrices once
        let (a_rows_idx, a_cols_idx, a_values) = a.find();
        let (b_rows_idx, b_cols_idx, b_values) = b.find();

        // Process matrices in row chunks
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * a_cols * element_size * 2; // For both matrices

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked addition".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Use HashMap to efficiently combine elements
            let mut chunk_result: std::collections::HashMap<(usize, usize), T> =
                std::collections::HashMap::new();

            // Add elements from matrix A
            for (k, (&row, &col)) in a_rows_idx.iter().zip(a_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let local_row = row - chunk_start;
                    let key = (local_row, col);
                    if let Some(existing_val) = chunk_result.get_mut(&key) {
                        *existing_val += a_values[k];
                    } else {
                        chunk_result.insert(key, a_values[k]);
                    }
                }
            }

            // Add elements from matrix B
            for (k, (&row, &col)) in b_rows_idx.iter().zip(b_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let local_row = row - chunk_start;
                    let key = (local_row, col);
                    if let Some(existing_val) = chunk_result.get_mut(&key) {
                        *existing_val += b_values[k];
                    } else {
                        chunk_result.insert(key, b_values[k]);
                    }
                }
            }

            // Add non-zero results to final triplets
            for ((local_row, col), val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, a_cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix scaling using chunking
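    ///
    /// # Example
    ///
    /// A sketch based on the unit tests below (marked `ignore`):
    ///
    /// ```ignore
    /// let matrix = CsrArray::from_triplets(&vec![0, 1, 2], &vec![0, 1, 2], &vec![1.0, 2.0, 3.0], (3, 3), false)?;
    /// let mut tracker = MemoryTracker::new(10_000);
    /// // Doubles every stored entry; the diagonal becomes [2.0, 4.0, 6.0]
    /// let scaled = ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 2, Some(&mut tracker))?;
    /// ```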
    pub fn chunked_sparse_scale<T, S>(
        matrix: &S,
        scalar: T,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Process matrix in chunks
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked scaling".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Extract and scale elements in the current chunk
            let (row_indices, col_indices, values) = matrix.find();

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let scaled_value = values[k] * scalar;
                    if !SparseElement::is_zero(&scaled_value) {
                        result_rows.push(row);
                        result_cols.push(col);
                        result_values.push(scaled_value);
                    }
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix conversion with chunking
    pub fn chunked_format_conversion<T, S>(
        matrix: &S,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut all_triplets = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Process in chunks to minimize peak memory usage
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for format conversion".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Extract triplets for this chunk
            let (row_indices, col_indices, values) = matrix.find();
            let mut chunk_triplets = Vec::new();

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end && !SparseElement::is_zero(&values[k]) {
                    chunk_triplets.push((row, col, values[k]));
                }
            }

            all_triplets.extend(chunk_triplets);

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        // Create the final matrix from all triplets
        let result_rows: Vec<usize> = all_triplets.iter().map(|&(r_, _, _)| r_).collect();
        let result_cols: Vec<usize> = all_triplets.iter().map(|&(_, c_, _)| c_).collect();
        let result_values: Vec<T> = all_triplets.iter().map(|&(_, _, v)| v).collect();

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient bandwidth reduction using reverse Cuthill-McKee algorithm
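    ///
    /// # Example
    ///
    /// A sketch based on the unit tests below (marked `ignore`; the 4x4 pattern is
    /// illustrative only):
    ///
    /// ```ignore
    /// let rows = vec![0, 0, 1, 1, 2, 2, 3, 3];
    /// let cols = vec![0, 3, 1, 2, 1, 2, 0, 3];
    /// let data = vec![1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 4.0];
    /// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (4, 4), false)?;
    ///
    /// // `ordering` is a permutation of 0..4; `reordered` is the permuted matrix
    /// let (ordering, reordered) = ChunkedOperations::bandwidth_reduction(&matrix, None)?;
    /// ```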
    pub fn bandwidth_reduction<T, S>(
        matrix: &S,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<(Vec<usize>, CsrArray<T>)>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if rows != cols {
            return Err(SparseError::ValueError(
                "Bandwidth reduction requires square matrix".to_string(),
            ));
        }

        let element_size = std::mem::size_of::<usize>();
        let memory_needed = rows * element_size * 4; // Conservative estimate

        if let Some(ref mut tracker) = memory_tracker {
            if !tracker.can_allocate(memory_needed) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for bandwidth reduction".to_string(),
                ));
            }
            tracker.allocate(memory_needed)?;
        }

        // Build adjacency list representation
        let (row_indices, col_indices_, _) = matrix.find();
        let mut adj_list: Vec<Vec<usize>> = vec![Vec::new(); rows];

        for (&row, &col) in row_indices.iter().zip(col_indices_.iter()) {
            if row != col {
                adj_list[row].push(col);
                adj_list[col].push(row);
            }
        }

        // Remove duplicates and sort adjacency lists
        for neighbors in &mut adj_list {
            neighbors.sort_unstable();
            neighbors.dedup();
        }

        // Find vertex with minimum degree as starting point
        let start_vertex = (0..rows).min_by_key(|&v| adj_list[v].len()).unwrap_or(0);

        // Reverse Cuthill-McKee ordering
        let mut ordering = Vec::new();
        let mut visited = vec![false; rows];
        let mut queue = VecDeque::new();

        // BFS from start vertex
        queue.push_back(start_vertex);
        visited[start_vertex] = true;

        while let Some(current) = queue.pop_front() {
            ordering.push(current);

            // Add unvisited neighbors sorted by degree
            let mut neighbors = adj_list[current]
                .iter()
                .filter(|&&v| !visited[v])
                .map(|&v| (adj_list[v].len(), v))
                .collect::<Vec<_>>();

            neighbors.sort_unstable();

            for (_, neighbor) in neighbors {
                if !visited[neighbor] {
                    visited[neighbor] = true;
                    queue.push_back(neighbor);
                }
            }
        }

        // Add any remaining unvisited vertices
        for (v, &is_visited) in visited.iter().enumerate().take(rows) {
            if !is_visited {
                ordering.push(v);
            }
        }

        // Reverse the ordering for better bandwidth reduction
        ordering.reverse();

        // Apply the reordering by permuting the matrix entries
        let mut perm_rows = Vec::new();
        let mut perm_cols = Vec::new();
        let mut perm_values = Vec::new();

        let (orig_rows, orig_cols, orig_values) = matrix.find();

        // Create inverse permutation for quick lookup
        let mut inv_perm = vec![0; rows];
        for (new_idx, &old_idx) in ordering.iter().enumerate() {
            inv_perm[old_idx] = new_idx;
        }

        // Apply permutation to matrix elements
        for (k, (&row, &col)) in orig_rows.iter().zip(orig_cols.iter()).enumerate() {
            let new_row = inv_perm[row];
            let new_col = inv_perm[col];
            perm_rows.push(new_row);
            perm_cols.push(new_col);
            perm_values.push(orig_values[k]);
        }

        let reordered_matrix =
            CsrArray::from_triplets(&perm_rows, &perm_cols, &perm_values, (rows, cols), false)?;

        if let Some(ref mut tracker) = memory_tracker {
            tracker.deallocate(memory_needed);
        }

        Ok((ordering, reordered_matrix))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::csr_array::CsrArray;
    use approx::assert_relative_eq;

    #[test]
    fn test_memory_tracker() {
        let mut tracker = MemoryTracker::new(1000);

        // Test allocation
        assert!(tracker.allocate(500).is_ok());
        assert_eq!(tracker.current_usage(), 500);
        assert_eq!(tracker.peak_usage(), 500);

        // Test over-allocation
        assert!(tracker.allocate(600).is_err());

        // Test deallocation
        tracker.deallocate(200);
        assert_eq!(tracker.current_usage(), 300);
        assert_eq!(tracker.peak_usage(), 500); // Peak should remain

        // Test can_allocate
        assert!(tracker.can_allocate(700));
        assert!(!tracker.can_allocate(800));
    }

    #[test]
    fn test_streaming_sparse_matvec() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let mut tracker = MemoryTracker::new(10000);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker)).unwrap();

        // Expected: [1*1 + 2*3, 3*2, 4*1 + 5*3] = [7, 6, 19]
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        assert!(tracker.peak_usage() > 0);
    }

    #[test]
    fn test_out_of_core_processor() {
        let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);

        // Create small test matrices
        // A = [[2, 0], [1, 3]]
        let rows_a = vec![0, 1, 1];
        let cols_a = vec![0, 0, 1];
        let data_a = vec![2.0, 1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (2, 2), false).unwrap();

        // B = [[1, 0], [0, 2]]
        let rows_b = vec![0, 1];
        let cols_b = vec![0, 1];
        let data_b = vec![1.0, 2.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (2, 2), false).unwrap();

        let result = processor.out_of_core_matmul(&matrix_a, &matrix_b).unwrap();

        // Verify result dimensions
        assert_eq!(result.shape(), (2, 2));

        // Expected result: A * B = [[2*1, 0], [1*1, 3*2]] = [[2, 0], [1, 6]]
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(0, 1), 0.0);
        assert_relative_eq!(result.get(1, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 6.0);

        let (current, limit) = processor.memory_stats();
        assert!(current <= limit);
    }

    #[test]
    fn test_cache_aware_ops() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Test cache-optimized SpMV
        let result = CacheAwareOps::cache_optimized_spmv(&matrix, &x.view(), 64).unwrap();
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        // Test cache-optimized transpose
        let transposed = CacheAwareOps::cache_optimized_transpose(&matrix, 64).unwrap();
        assert_eq!(transposed.shape(), (3, 3));

        // Verify transpose correctness
        assert_relative_eq!(transposed.get(0, 0), 1.0); // Original (0,0)
        assert_relative_eq!(transposed.get(2, 0), 2.0); // Original (0,2) -> (2,0)
        assert_relative_eq!(transposed.get(1, 1), 3.0); // Original (1,1)
    }

    #[test]
    fn test_memory_pool() {
        let mut pool = MemoryPool::<f64>::new(5);

        // Allocate buffer
        let buffer1 = pool.allocate(100);
        assert_eq!(buffer1.len(), 100);

        // Return buffer to pool
        pool.deallocate(buffer1);

        let (available, allocated) = pool.stats();
        assert_eq!(available, 1);
        assert_eq!(allocated, 0);

        // Allocate again (should reuse buffer)
        let buffer2 = pool.allocate(50);
        assert_eq!(buffer2.len(), 50);

        pool.deallocate(buffer2);
    }

    #[test]
    fn test_streaming_memory_limit() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 1, Some(&mut tracker));

        // Should fail due to memory limit
        assert!(result.is_err());
    }

    #[test]
    fn test_chunked_sparse_add() {
        // Create two test matrices
        let rows_a = vec![0, 1, 2];
        let cols_a = vec![0, 1, 2];
        let data_a = vec![1.0, 2.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (3, 3), false).unwrap();

        let rows_b = vec![0, 1, 2];
        let cols_b = vec![0, 1, 2];
        let data_b = vec![4.0, 5.0, 6.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (3, 3), false).unwrap();

        let mut tracker = MemoryTracker::new(10000);
        let result =
            ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 2, Some(&mut tracker))
                .unwrap();

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check values: A + B should have diagonal elements [5, 7, 9]
        assert_relative_eq!(result.get(0, 0), 5.0);
        assert_relative_eq!(result.get(1, 1), 7.0);
        assert_relative_eq!(result.get(2, 2), 9.0);
    }

    #[test]
    fn test_chunked_sparse_scale() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        let mut tracker = MemoryTracker::new(10000);
        let result =
            ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 2, Some(&mut tracker)).unwrap();

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check scaled values
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(1, 1), 4.0);
        assert_relative_eq!(result.get(2, 2), 6.0);
    }

    #[test]
    fn test_chunked_format_conversion() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        let mut tracker = MemoryTracker::new(10000);
        let result =
            ChunkedOperations::chunked_format_conversion(&matrix, 2, Some(&mut tracker)).unwrap();

        // Should be identical to original
        assert_eq!(result.shape(), matrix.shape());
        assert_eq!(result.nnz(), matrix.nnz());

        // Check values preserved
        assert_relative_eq!(result.get(0, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 2.0);
        assert_relative_eq!(result.get(2, 2), 3.0);
    }

    #[test]
    fn test_bandwidth_reduction() {
        // Create a matrix with some structure
        let rows = vec![0, 0, 1, 1, 2, 2, 3, 3];
        let cols = vec![0, 3, 1, 2, 1, 2, 0, 3];
        let data = vec![1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 4.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (4, 4), false).unwrap();

        let mut tracker = MemoryTracker::new(100000);
        let (ordering, reordered) =
            ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker)).unwrap();

        // Check that we got an ordering
        assert_eq!(ordering.len(), 4);

        // Check that reordered matrix has same dimensions
        assert_eq!(reordered.shape(), (4, 4));
        assert_eq!(reordered.nnz(), matrix.nnz());

        // Check that it's a valid permutation
        let mut sorted_ordering = ordering.clone();
        sorted_ordering.sort_unstable();
        assert_eq!(sorted_ordering, vec![0, 1, 2, 3]);
    }

    #[test]
    fn test_chunked_operations_memory_limit() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);

        // All chunked operations should fail with insufficient memory
        assert!(
            ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(
            ChunkedOperations::chunked_format_conversion(&matrix, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker)).is_err());
    }

    #[test]
    fn test_chunked_add_different_sparsity_patterns() {
        // Create matrices with different sparsity patterns
        let rows_a = vec![0, 2];
        let cols_a = vec![0, 2];
        let data_a = vec![1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rows_a, &cols_a, &data_a, (3, 3), false).unwrap();

        let rows_b = vec![1, 2];
        let cols_b = vec![1, 0];
        let data_b = vec![2.0, 1.0];
        let matrix_b = CsrArray::from_triplets(&rows_b, &cols_b, &data_b, (3, 3), false).unwrap();

        let result = ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 1, None).unwrap();

        // Check that all elements are preserved
        assert_relative_eq!(result.get(0, 0), 1.0); // From A
        assert_relative_eq!(result.get(1, 1), 2.0); // From B
        assert_relative_eq!(result.get(2, 2), 3.0); // From A
        assert_relative_eq!(result.get(2, 0), 1.0); // From B
    }
}