scirs2_sparse/memory_efficient.rs

//! Memory-efficient algorithms and patterns for sparse matrices
//!
//! This module provides advanced memory optimization techniques for sparse matrix operations,
//! including streaming algorithms, out-of-core processing, and cache-aware implementations.

use crate::csr_array::CsrArray;
use crate::error::{SparseError, SparseResult};
use crate::sparray::SparseArray;
use scirs2_core::ndarray::{Array1, ArrayView1};
use scirs2_core::numeric::{Float, SparseElement};
use std::collections::VecDeque;
use std::fmt::Debug;

// Import core utilities for memory management
use scirs2_core::simd_ops::SimdUnifiedOps;

/// Memory usage tracking and optimization
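///
/// # Examples
///
/// A minimal usage sketch; the `scirs2_sparse::memory_efficient` import path is an
/// assumption, so adjust it to the crate's actual re-exports.
///
/// ```ignore
/// // Path assumed; adjust to the actual crate layout.
/// use scirs2_sparse::memory_efficient::MemoryTracker;
///
/// let mut tracker = MemoryTracker::new(1024);
/// tracker.allocate(256).unwrap();
/// assert_eq!(tracker.current_usage(), 256);
/// assert!(tracker.can_allocate(512));
/// tracker.deallocate(256);
/// assert_eq!(tracker.peak_usage(), 256);
/// ```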
#[derive(Debug, Clone)]
pub struct MemoryTracker {
    /// Current memory usage estimate
    current_usage: usize,
    /// Peak memory usage observed
    peak_usage: usize,
    /// Memory budget limit
    _memorylimit: usize,
}

impl MemoryTracker {
    /// Create a new memory tracker with given limit
    pub fn new(_memorylimit: usize) -> Self {
        Self {
            current_usage: 0,
            peak_usage: 0,
            _memorylimit,
        }
    }

    /// Allocate memory and track usage
    pub fn allocate(&mut self, size: usize) -> SparseResult<()> {
        // Check limit BEFORE modifying current_usage
        if self.current_usage + size > self._memorylimit {
            return Err(SparseError::ValueError("Memory limit exceeded".to_string()));
        }

        self.current_usage += size;
        self.peak_usage = self.peak_usage.max(self.current_usage);
        Ok(())
    }

    /// Deallocate memory and update tracking
    pub fn deallocate(&mut self, size: usize) {
        self.current_usage = self.current_usage.saturating_sub(size);
    }

    /// Get current memory usage
    pub fn current_usage(&self) -> usize {
        self.current_usage
    }

    /// Get peak memory usage
    pub fn peak_usage(&self) -> usize {
        self.peak_usage
    }

    /// Check if allocation would exceed limit
    pub fn can_allocate(&self, size: usize) -> bool {
        self.current_usage + size <= self._memorylimit
    }
}

/// Memory-efficient sparse matrix-vector multiplication using streaming
///
/// This implementation processes the matrix in chunks to minimize memory usage
/// while maintaining computational efficiency.
///
/// # Arguments
///
/// * `matrix` - The sparse matrix
/// * `x` - The input vector
/// * `chunk_size` - Number of rows to process at once
/// * `memory_tracker` - Optional memory usage tracker
///
/// # Returns
///
/// The result vector y = A * x
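///
/// # Examples
///
/// A minimal sketch; the import paths below are assumptions and may need to be
/// adjusted to the crate's actual layout.
///
/// ```ignore
/// // Paths assumed; adjust to the actual crate layout.
/// use scirs2_core::ndarray::Array1;
/// use scirs2_sparse::csr_array::CsrArray;
/// use scirs2_sparse::memory_efficient::{streaming_sparse_matvec, MemoryTracker};
///
/// // Diagonal 3x3 matrix with entries 1, 2, 3
/// let rows = vec![0, 1, 2];
/// let cols = vec![0, 1, 2];
/// let data = vec![1.0, 2.0, 3.0];
/// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();
/// let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
///
/// let mut tracker = MemoryTracker::new(1_000_000);
/// let y = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker)).unwrap();
/// assert_eq!(y[2], 3.0);
/// ```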
#[allow(dead_code)]
pub fn streaming_sparse_matvec<T, S>(
    matrix: &S,
    x: &ArrayView1<T>,
    chunk_size: usize,
    mut memory_tracker: Option<&mut MemoryTracker>,
) -> SparseResult<Array1<T>>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
    S: SparseArray<T>,
{
    let (rows, cols) = matrix.shape();

    if x.len() != cols {
        return Err(SparseError::DimensionMismatch {
            expected: cols,
            found: x.len(),
        });
    }

    let mut y = Array1::zeros(rows);
    let element_size = std::mem::size_of::<T>();

    // Process matrix in chunks
    let num_chunks = rows.div_ceil(chunk_size);

    for chunk_idx in 0..num_chunks {
        let row_start = chunk_idx * chunk_size;
        let row_end = std::cmp::min(row_start + chunk_size, rows);
        let current_chunk_size = row_end - row_start;

        // Estimate memory usage for this chunk
        let chunk_memory = current_chunk_size * cols * element_size; // Worst case

        if let Some(tracker) = memory_tracker.as_ref() {
            if !tracker.can_allocate(chunk_memory) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for chunk processing".to_string(),
                ));
            }
        }

        // Track memory allocation
        if let Some(tracker) = memory_tracker.as_mut() {
            tracker.allocate(chunk_memory)?;
        }

        // Extract chunk data
        let (row_indices, col_indices, values) = matrix.find();
        let mut chunk_result = vec![T::sparse_zero(); current_chunk_size];

        // Find elements in the current row range
        for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
            if row >= row_start && row < row_end {
                let local_row = row - row_start;
                chunk_result[local_row] = chunk_result[local_row] + values[k] * x[col];
            }
        }

        // Copy results to output vector
        for (i, &val) in chunk_result.iter().enumerate() {
            y[row_start + i] = val;
        }

        // Deallocate chunk memory
        if let Some(tracker) = memory_tracker.as_mut() {
            tracker.deallocate(chunk_memory);
        }
    }

    Ok(y)
}

/// Out-of-core sparse matrix operations
///
/// This struct provides methods for processing matrices that are too large
/// to fit entirely in memory.
pub struct OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    _memorylimit: usize,
    #[allow(dead_code)]
    chunk_size: usize,
    temp_storage: VecDeque<Vec<T>>,
}

impl<T> OutOfCoreProcessor<T>
where
    T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
{
    /// Create a new out-of-core processor
    pub fn new(_memorylimit: usize) -> Self {
        let chunk_size = _memorylimit / (8 * std::mem::size_of::<T>()); // Conservative estimate

        Self {
            _memorylimit,
            chunk_size,
            temp_storage: VecDeque::new(),
        }
    }

    /// Perform matrix-matrix multiplication out-of-core
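    ///
    /// # Examples
    ///
    /// A minimal sketch; the import paths below are assumptions.
    ///
    /// ```ignore
    /// // Paths assumed; adjust to the actual crate layout.
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::OutOfCoreProcessor;
    ///
    /// // A = [[2, 0], [1, 3]], B = diag(1, 2)
    /// let a = CsrArray::from_triplets(&vec![0, 1, 1], &vec![0, 0, 1], &vec![2.0, 1.0, 3.0], (2, 2), false).unwrap();
    /// let b = CsrArray::from_triplets(&vec![0, 1], &vec![0, 1], &vec![1.0, 2.0], (2, 2), false).unwrap();
    ///
    /// let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);
    /// let c = processor.out_of_core_matmul(&a, &b).unwrap();
    /// assert_eq!(c.shape(), (2, 2));
    /// ```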
    pub fn out_of_core_matmul<S1, S2>(&mut self, a: &S1, b: &S2) -> SparseResult<CsrArray<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if a_cols != b_rows {
            return Err(SparseError::DimensionMismatch {
                expected: a_cols,
                found: b_rows,
            });
        }

        // Calculate optimal chunk size based on memory limit
        let element_size = std::mem::size_of::<T>();
        let max_chunk_size = self._memorylimit / (4 * element_size * b_cols); // Conservative estimate
        let chunk_size = std::cmp::min(max_chunk_size, a_rows).max(1);

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        // Process A in chunks of rows
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);

            // Extract chunk of A
            let (a_row_indices, a_col_indices, a_values) = a.find();
            let mut chunk_a_data = Vec::new();

            // Find all elements in the current row chunk
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    chunk_a_data.push((row - chunk_start, col, a_values[k]));
                }
            }

            // Compute chunk result: chunk_a * B
            let chunk_result =
                self.compute_chunk_product(&chunk_a_data, b, chunk_end - chunk_start, b_cols)?;

            // Add chunk results to final result with row offset
            for (local_row, col, val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, b_cols),
            true,
        )
    }

    /// Compute the product of a chunk of A with B
    fn compute_chunk_product<S>(
        &self,
        chunk_a: &[(usize, usize, T)],
        b: &S,
        chunk_rows: usize,
        b_cols: usize,
    ) -> SparseResult<Vec<(usize, usize, T)>>
    where
        S: SparseArray<T>,
    {
        let mut result = Vec::new();
        let (b_row_indices, b_col_indices, b_values) = b.find();

        // Build a row-indexed map of B for fast access to B's rows
        let mut b_by_row: std::collections::HashMap<usize, Vec<(usize, T)>> =
            std::collections::HashMap::new();
        for (k, (&row, &col)) in b_row_indices.iter().zip(b_col_indices.iter()).enumerate() {
            b_by_row.entry(row).or_default().push((col, b_values[k]));
        }

        // For each row in the chunk
        for i in 0..chunk_rows {
            // Collect A[i, :] entries
            let mut a_row_entries = Vec::new();
            for &(row, col, val) in chunk_a {
                if row == i {
                    a_row_entries.push((col, val));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut dot_product = T::sparse_zero();

                // Compute A[i, :] · B[:, j]
                for &(a_col, a_val) in &a_row_entries {
                    if let Some(b_row_data) = b_by_row.get(&a_col) {
                        for &(b_col, b_val) in b_row_data {
                            if b_col == j {
                                dot_product = dot_product + a_val * b_val;
                                break;
                            }
                        }
                    }
                }

                if !SparseElement::is_zero(&dot_product) {
                    result.push((i, j, dot_product));
                }
            }
        }

        Ok(result)
    }

    /// Process a chunk of matrix multiplication
    #[allow(dead_code)]
    fn process_chunk_matmul<S1, S2>(
        &mut self,
        _a: &S1,
        _b_csc: &S2,
        _row_start: usize,
        _row_end: usize,
        _b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        // Placeholder: not implemented; out_of_core_matmul performs chunked multiplication via compute_chunk_product
        Err(SparseError::ValueError(
            "Process chunk matmul not implemented".to_string(),
        ))
    }

    #[allow(dead_code)]
    fn process_chunk_matmul_old<S1, S2>(
        &mut self,
        a: &S1,
        b_csc: &S2,
        row_start: usize,
        row_end: usize,
        b_cols: usize,
    ) -> SparseResult<ChunkResult<T>>
    where
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let mut chunk_data = Vec::new();
        let mut chunk_indices = Vec::new();
        let mut chunk_indptr = vec![0];

        let (a_row_indices, a_col_indices, a_values) = a.find();
        let (b_row_indices, b_col_indices, b_values) = b_csc.find();
        let b_indptr = b_csc
            .get_indptr()
            .ok_or_else(|| SparseError::ValueError("CSC matrix must have indptr".to_string()))?;

        for i in row_start..row_end {
            let mut row_data = Vec::new();
            let mut row_indices = Vec::new();

            // Find A's entries for row i
            let mut a_entries = Vec::new();
            for (k, (&row, &col)) in a_row_indices.iter().zip(a_col_indices.iter()).enumerate() {
                if row == i {
                    a_entries.push((col, a_values[k]));
                }
            }

            // For each column j in B
            for j in 0..b_cols {
                let mut sum = T::sparse_zero();
                let b_col_start = b_indptr[j];
                let b_col_end = b_indptr[j + 1];

                // Compute dot product of A[i,:] and B[:,j]
                for &(a_col, a_val) in &a_entries {
                    for b_idx in b_col_start..b_col_end {
                        if b_row_indices[b_idx] == a_col {
                            sum = sum + a_val * b_values[b_idx];
                            break;
                        }
                    }
                }

                if !SparseElement::is_zero(&sum) {
                    row_data.push(sum);
                    row_indices.push(j);
                }
            }

            chunk_data.extend(row_data);
            chunk_indices.extend(row_indices);
            chunk_indptr.push(chunk_data.len());
        }

        Ok(ChunkResult {
            data: chunk_data,
            indices: chunk_indices,
            indptr: chunk_indptr,
        })
    }

    /// Get memory usage statistics
    pub fn memory_stats(&self) -> (usize, usize) {
        let current_usage = self
            .temp_storage
            .iter()
            .map(|v| v.len() * std::mem::size_of::<T>())
            .sum();
        (current_usage, self._memorylimit)
    }
}

/// Result of processing a chunk
struct ChunkResult<T> {
    #[allow(dead_code)]
    data: Vec<T>,
    #[allow(dead_code)]
    indices: Vec<usize>,
    #[allow(dead_code)]
    indptr: Vec<usize>,
}

/// Cache-aware sparse matrix operations
pub struct CacheAwareOps;

impl CacheAwareOps {
    /// Cache-optimized sparse matrix-vector multiplication
    ///
    /// This implementation optimizes for cache performance by reordering
    /// operations to improve data locality.
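    ///
    /// # Examples
    ///
    /// A minimal sketch; the import paths below are assumptions.
    ///
    /// ```ignore
    /// // Paths assumed; adjust to the actual crate layout.
    /// use scirs2_core::ndarray::Array1;
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::CacheAwareOps;
    ///
    /// let rows = vec![0, 1, 2];
    /// let cols = vec![0, 1, 2];
    /// let data = vec![1.0, 2.0, 3.0];
    /// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();
    /// let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
    ///
    /// // 64 is a typical cache-line size in bytes
    /// let y = CacheAwareOps::cache_optimized_spmv(&matrix, &x.view(), 64).unwrap();
    /// assert_eq!(y[1], 2.0);
    /// ```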
    pub fn cache_optimized_spmv<T, S>(
        matrix: &S,
        x: &ArrayView1<T>,
        cache_line_size: usize,
    ) -> SparseResult<Array1<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if x.len() != cols {
            return Err(SparseError::DimensionMismatch {
                expected: cols,
                found: x.len(),
            });
        }

        let mut y = Array1::zeros(rows);
        // Guard against a zero chunk size when T is larger than a cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        // Group operations by cache lines for better locality
        let (row_indices, col_indices, values) = matrix.find();

        // Sort by column to improve x vector cache locality
        let mut sorted_ops: Vec<(usize, usize, T)> = row_indices
            .iter()
            .zip(col_indices.iter())
            .zip(values.iter())
            .map(|((&row, &col), &val)| (row, col, val))
            .collect();

        sorted_ops.sort_by_key(|&(_, col, _)| col);

        // Process in cache-friendly chunks
        for chunk in sorted_ops.chunks(elements_per_cache_line) {
            for &(row, col, val) in chunk {
                y[row] = y[row] + val * x[col];
            }
        }

        Ok(y)
    }

    /// Cache-optimized sparse matrix transpose
    pub fn cache_optimized_transpose<T, S>(
        matrix: &S,
        cache_line_size: usize,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let (row_indices, col_indices, values) = matrix.find();

        // Group operations by cache lines
        // Guard against a zero chunk size when T is larger than a cache line
        let elements_per_cache_line = (cache_line_size / std::mem::size_of::<T>()).max(1);

        let mut transposed_triplets = Vec::new();

        // Process in cache-friendly chunks
        for chunk_start in (0..row_indices.len()).step_by(elements_per_cache_line) {
            let chunk_end = std::cmp::min(chunk_start + elements_per_cache_line, row_indices.len());

            for k in chunk_start..chunk_end {
                transposed_triplets.push((col_indices[k], row_indices[k], values[k]));
            }
        }

        // Sort by new row index (original column)
        transposed_triplets.sort_by_key(|&(new_row, _, _)| new_row);

        let new_rows: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(new_row, _, _)| new_row)
            .collect();
        let new_cols: Vec<usize> = transposed_triplets
            .iter()
            .map(|&(_, new_col, _)| new_col)
            .collect();
        let new_values: Vec<T> = transposed_triplets.iter().map(|&(_, _, val)| val).collect();

        CsrArray::from_triplets(&new_rows, &new_cols, &new_values, (cols, rows), false)
    }
}

/// Memory pool for efficient allocation and reuse
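///
/// # Examples
///
/// A minimal usage sketch; the import path is an assumption.
///
/// ```ignore
/// // Path assumed; adjust to the actual crate layout.
/// use scirs2_sparse::memory_efficient::MemoryPool;
///
/// let mut pool = MemoryPool::<f64>::new(4);
/// let buffer = pool.allocate(128);
/// assert_eq!(buffer.len(), 128);
///
/// // Returning the buffer lets later allocations reuse its storage
/// pool.deallocate(buffer);
/// let reused = pool.allocate(64);
/// assert_eq!(reused.len(), 64);
/// ```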
pub struct MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    available_buffers: Vec<Vec<T>>,
    allocated_buffers: Vec<Vec<T>>,
    _pool_sizelimit: usize,
}

impl<T> MemoryPool<T>
where
    T: Float + SparseElement + Debug + Copy + 'static,
{
    /// Create a new memory pool
    pub fn new(_pool_sizelimit: usize) -> Self {
        Self {
            available_buffers: Vec::new(),
            allocated_buffers: Vec::new(),
            _pool_sizelimit,
        }
    }

    /// Allocate a buffer from the pool
    pub fn allocate(&mut self, size: usize) -> Vec<T> {
        if let Some(mut buffer) = self.available_buffers.pop() {
            buffer.resize(size, T::sparse_zero());
            buffer
        } else {
            vec![T::sparse_zero(); size]
        }
    }

    /// Return a buffer to the pool
    pub fn deallocate(&mut self, mut buffer: Vec<T>) {
        if self.available_buffers.len() < self._pool_sizelimit {
            buffer.clear();
            self.available_buffers.push(buffer);
        }
        // If pool is full, let the buffer be dropped
    }

    /// Get pool statistics
    pub fn stats(&self) -> (usize, usize) {
        (self.available_buffers.len(), self.allocated_buffers.len())
    }
}

/// Chunked sparse matrix operations for memory efficiency
pub struct ChunkedOperations;

impl ChunkedOperations {
    /// Memory-efficient sparse matrix addition using chunking
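    ///
    /// # Examples
    ///
    /// A minimal sketch; the import paths below are assumptions.
    ///
    /// ```ignore
    /// // Paths assumed; adjust to the actual crate layout.
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::{ChunkedOperations, MemoryTracker};
    ///
    /// // a = diag(1, 2), b = diag(3, 4)
    /// let a = CsrArray::from_triplets(&vec![0, 1], &vec![0, 1], &vec![1.0, 2.0], (2, 2), false).unwrap();
    /// let b = CsrArray::from_triplets(&vec![0, 1], &vec![0, 1], &vec![3.0, 4.0], (2, 2), false).unwrap();
    ///
    /// let mut tracker = MemoryTracker::new(1_000_000);
    /// let sum = ChunkedOperations::chunked_sparse_add(&a, &b, 1, Some(&mut tracker)).unwrap();
    /// assert_eq!(sum.get(0, 0), 4.0);
    /// ```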
    pub fn chunked_sparse_add<T, S1, S2>(
        a: &S1,
        b: &S2,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float
            + SparseElement
            + Debug
            + Copy
            + 'static
            + SimdUnifiedOps
            + Send
            + Sync
            + std::ops::AddAssign,
        S1: SparseArray<T>,
        S2: SparseArray<T>,
    {
        let (a_rows, a_cols) = a.shape();
        let (b_rows, b_cols) = b.shape();

        if (a_rows, a_cols) != (b_rows, b_cols) {
            return Err(SparseError::ShapeMismatch {
                expected: (a_rows, a_cols),
                found: (b_rows, b_cols),
            });
        }

        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Extract elements from both matrices once
        let (a_rowsidx, a_cols_idx, a_values) = a.find();
        let (b_rowsidx, b_cols_idx, b_values) = b.find();

        // Process matrices in row chunks
        for chunk_start in (0..a_rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, a_rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * a_cols * element_size * 2; // For both matrices

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked addition".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Use HashMap to efficiently combine elements
            let mut chunk_result: std::collections::HashMap<(usize, usize), T> =
                std::collections::HashMap::new();

            // Add elements from matrix A
            for (k, (&row, &col)) in a_rowsidx.iter().zip(a_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let local_row = row - chunk_start;
                    let key = (local_row, col);
                    if let Some(existing_val) = chunk_result.get_mut(&key) {
                        *existing_val += a_values[k];
                    } else {
                        chunk_result.insert(key, a_values[k]);
                    }
                }
            }

            // Add elements from matrix B
            for (k, (&row, &col)) in b_rowsidx.iter().zip(b_cols_idx.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let local_row = row - chunk_start;
                    let key = (local_row, col);
                    if let Some(existing_val) = chunk_result.get_mut(&key) {
                        *existing_val += b_values[k];
                    } else {
                        chunk_result.insert(key, b_values[k]);
                    }
                }
            }

            // Add non-zero results to final triplets
            for ((local_row, col), val) in chunk_result {
                if !SparseElement::is_zero(&val) {
                    result_rows.push(chunk_start + local_row);
                    result_cols.push(col);
                    result_values.push(val);
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (a_rows, a_cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix scaling using chunking
    pub fn chunked_sparse_scale<T, S>(
        matrix: &S,
        scalar: T,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut result_rows = Vec::new();
        let mut result_cols = Vec::new();
        let mut result_values = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Process matrix in chunks
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for chunked scaling".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Extract and scale elements in the current chunk
            let (row_indices, col_indices, values) = matrix.find();

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end {
                    let scaled_value = values[k] * scalar;
                    if !SparseElement::is_zero(&scaled_value) {
                        result_rows.push(row);
                        result_cols.push(col);
                        result_values.push(scaled_value);
                    }
                }
            }

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient sparse matrix conversion with chunking
    pub fn chunked_format_conversion<T, S>(
        matrix: &S,
        chunk_size: usize,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<CsrArray<T>>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();
        let mut all_triplets = Vec::new();

        let element_size = std::mem::size_of::<T>();

        // Process in chunks to minimize peak memory usage
        for chunk_start in (0..rows).step_by(chunk_size) {
            let chunk_end = std::cmp::min(chunk_start + chunk_size, rows);
            let current_chunk_size = chunk_end - chunk_start;

            // Estimate memory for this chunk
            let chunk_memory = current_chunk_size * cols * element_size;

            if let Some(ref mut tracker) = memory_tracker {
                if !tracker.can_allocate(chunk_memory) {
                    return Err(SparseError::ValueError(
                        "Insufficient memory for format conversion".to_string(),
                    ));
                }
                tracker.allocate(chunk_memory)?;
            }

            // Extract triplets for this chunk
            let (row_indices, col_indices, values) = matrix.find();
            let mut chunk_triplets = Vec::new();

            for (k, (&row, &col)) in row_indices.iter().zip(col_indices.iter()).enumerate() {
                if row >= chunk_start && row < chunk_end && !SparseElement::is_zero(&values[k]) {
                    chunk_triplets.push((row, col, values[k]));
                }
            }

            all_triplets.extend(chunk_triplets);

            if let Some(ref mut tracker) = memory_tracker {
                tracker.deallocate(chunk_memory);
            }
        }

        // Create the final matrix from all triplets
        let result_rows: Vec<usize> = all_triplets.iter().map(|&(r, _, _)| r).collect();
        let result_cols: Vec<usize> = all_triplets.iter().map(|&(_, c, _)| c).collect();
        let result_values: Vec<T> = all_triplets.iter().map(|&(_, _, v)| v).collect();

        CsrArray::from_triplets(
            &result_rows,
            &result_cols,
            &result_values,
            (rows, cols),
            false,
        )
    }

    /// Memory-efficient bandwidth reduction using reverse Cuthill-McKee algorithm
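    ///
    /// Returns the reverse Cuthill-McKee ordering together with the symmetrically
    /// permuted matrix.
    ///
    /// # Examples
    ///
    /// A minimal sketch; the import paths below are assumptions.
    ///
    /// ```ignore
    /// // Paths assumed; adjust to the actual crate layout.
    /// use scirs2_sparse::csr_array::CsrArray;
    /// use scirs2_sparse::memory_efficient::ChunkedOperations;
    ///
    /// let rows = vec![0, 0, 1, 2];
    /// let cols = vec![0, 2, 1, 2];
    /// let data = vec![1.0, 1.0, 2.0, 3.0];
    /// let matrix = CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).unwrap();
    ///
    /// // The memory tracker is optional; pass None to skip budget checks
    /// let (ordering, reordered) = ChunkedOperations::bandwidth_reduction(&matrix, None).unwrap();
    /// assert_eq!(ordering.len(), 3);
    /// assert_eq!(reordered.shape(), (3, 3));
    /// ```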
    pub fn bandwidth_reduction<T, S>(
        matrix: &S,
        mut memory_tracker: Option<&mut MemoryTracker>,
    ) -> SparseResult<(Vec<usize>, CsrArray<T>)>
    where
        T: Float + SparseElement + Debug + Copy + 'static + SimdUnifiedOps + Send + Sync,
        S: SparseArray<T>,
    {
        let (rows, cols) = matrix.shape();

        if rows != cols {
            return Err(SparseError::ValueError(
                "Bandwidth reduction requires square matrix".to_string(),
            ));
        }

        let element_size = std::mem::size_of::<usize>();
        let memory_needed = rows * element_size * 4; // Conservative estimate

        if let Some(ref mut tracker) = memory_tracker {
            if !tracker.can_allocate(memory_needed) {
                return Err(SparseError::ValueError(
                    "Insufficient memory for bandwidth reduction".to_string(),
                ));
            }
            tracker.allocate(memory_needed)?;
        }

        // Build adjacency list representation
        let (row_indices, col_indices_, _) = matrix.find();
        let mut adj_list: Vec<Vec<usize>> = vec![Vec::new(); rows];

        for (&row, &col) in row_indices.iter().zip(col_indices_.iter()) {
            if row != col {
                adj_list[row].push(col);
                adj_list[col].push(row);
            }
        }

        // Remove duplicates and sort adjacency lists
        for neighbors in &mut adj_list {
            neighbors.sort_unstable();
            neighbors.dedup();
        }

        // Find vertex with minimum degree as starting point
        let start_vertex = (0..rows).min_by_key(|&v| adj_list[v].len()).unwrap_or(0);

        // Reverse Cuthill-McKee ordering
        let mut ordering = Vec::new();
        let mut visited = vec![false; rows];
        let mut queue = VecDeque::new();

        // BFS from start vertex
        queue.push_back(start_vertex);
        visited[start_vertex] = true;

        while let Some(current) = queue.pop_front() {
            ordering.push(current);

            // Add unvisited neighbors sorted by degree
            let mut neighbors = adj_list[current]
                .iter()
                .filter(|&&v| !visited[v])
                .map(|&v| (adj_list[v].len(), v))
                .collect::<Vec<_>>();

            neighbors.sort_unstable();

            for (_, neighbor) in neighbors {
                if !visited[neighbor] {
                    visited[neighbor] = true;
                    queue.push_back(neighbor);
                }
            }
        }

        // Add any remaining unvisited vertices
        for (v, &is_visited) in visited.iter().enumerate().take(rows) {
            if !is_visited {
                ordering.push(v);
            }
        }

        // Reverse the ordering for better bandwidth reduction
        ordering.reverse();

        // Create permutation matrix and apply reordering
        let mut perm_rows = Vec::new();
        let mut perm_cols = Vec::new();
        let mut perm_values = Vec::new();

        let (orig_rows, orig_cols, orig_values) = matrix.find();

        // Create inverse permutation for quick lookup
        let mut inv_perm = vec![0; rows];
        for (new_idx, &old_idx) in ordering.iter().enumerate() {
            inv_perm[old_idx] = new_idx;
        }

        // Apply permutation to matrix elements
        for (k, (&row, &col)) in orig_rows.iter().zip(orig_cols.iter()).enumerate() {
            let new_row = inv_perm[row];
            let new_col = inv_perm[col];
            perm_rows.push(new_row);
            perm_cols.push(new_col);
            perm_values.push(orig_values[k]);
        }

        let reordered_matrix =
            CsrArray::from_triplets(&perm_rows, &perm_cols, &perm_values, (rows, cols), false)?;

        if let Some(ref mut tracker) = memory_tracker {
            tracker.deallocate(memory_needed);
        }

        Ok((ordering, reordered_matrix))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::csr_array::CsrArray;
    use approx::assert_relative_eq;

    #[test]
    fn test_memory_tracker() {
        let mut tracker = MemoryTracker::new(1000);

        // Test allocation
        assert!(tracker.allocate(500).is_ok());
        assert_eq!(tracker.current_usage(), 500);
        assert_eq!(tracker.peak_usage(), 500);

        // Test over-allocation
        assert!(tracker.allocate(600).is_err());

        // Test deallocation
        tracker.deallocate(200);
        assert_eq!(tracker.current_usage(), 300);
        assert_eq!(tracker.peak_usage(), 500); // Peak should remain

        // Test can_allocate
        assert!(tracker.can_allocate(700));
        assert!(!tracker.can_allocate(800));
    }

    #[test]
    fn test_streaming_sparse_matvec() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        let mut tracker = MemoryTracker::new(10000);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 2, Some(&mut tracker))
            .expect("Operation failed");

        // Expected: [1*1 + 2*3, 3*2, 4*1 + 5*3] = [7, 6, 19]
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        assert!(tracker.peak_usage() > 0);
    }

    #[test]
    fn test_out_of_core_processor() {
        let mut processor = OutOfCoreProcessor::<f64>::new(1_000_000);

        // Create small test matrices
        // A = [[2, 0], [1, 3]]
        let rowsa = vec![0, 1, 1];
        let cols_a = vec![0, 0, 1];
        let data_a = vec![2.0, 1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rowsa, &cols_a, &data_a, (2, 2), false)
            .expect("Operation failed");

        // B = [[1, 0], [0, 2]]
        let rowsb = vec![0, 1];
        let cols_b = vec![0, 1];
        let data_b = vec![1.0, 2.0];
        let matrix_b = CsrArray::from_triplets(&rowsb, &cols_b, &data_b, (2, 2), false)
            .expect("Operation failed");

        let result = processor
            .out_of_core_matmul(&matrix_a, &matrix_b)
            .expect("Operation failed");

        // Verify result dimensions
        assert_eq!(result.shape(), (2, 2));

        // Expected result: A * B = [[2*1, 0], [1*1, 3*2]] = [[2, 0], [1, 6]]
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(0, 1), 0.0);
        assert_relative_eq!(result.get(1, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 6.0);

        let (current, limit) = processor.memory_stats();
        assert!(current <= limit);
    }

    #[test]
    fn test_cache_aware_ops() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Test cache-optimized SpMV
        let result =
            CacheAwareOps::cache_optimized_spmv(&matrix, &x.view(), 64).expect("Operation failed");
        assert_relative_eq!(result[0], 7.0);
        assert_relative_eq!(result[1], 6.0);
        assert_relative_eq!(result[2], 19.0);

        // Test cache-optimized transpose
        let transposed =
            CacheAwareOps::cache_optimized_transpose(&matrix, 64).expect("Operation failed");
        assert_eq!(transposed.shape(), (3, 3));

        // Verify transpose correctness
        assert_relative_eq!(transposed.get(0, 0), 1.0); // Original (0,0)
        assert_relative_eq!(transposed.get(2, 0), 2.0); // Original (0,2) -> (2,0)
        assert_relative_eq!(transposed.get(1, 1), 3.0); // Original (1,1)
    }

    #[test]
    fn test_memory_pool() {
        let mut pool = MemoryPool::<f64>::new(5);

        // Allocate buffer
        let buffer1 = pool.allocate(100);
        assert_eq!(buffer1.len(), 100);

        // Return buffer to pool
        pool.deallocate(buffer1);

        let (available, allocated) = pool.stats();
        assert_eq!(available, 1);
        assert_eq!(allocated, 0);

        // Allocate again (should reuse buffer)
        let buffer2 = pool.allocate(50);
        assert_eq!(buffer2.len(), 50);

        pool.deallocate(buffer2);
    }

    #[test]
    fn test_streaming_memory_limit() {
        let rows = vec![0, 0, 1, 2, 2];
        let cols = vec![0, 2, 1, 0, 2];
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let x = Array1::from_vec(vec![1.0, 2.0, 3.0]);

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);
        let result = streaming_sparse_matvec(&matrix, &x.view(), 1, Some(&mut tracker));

        // Should fail due to memory limit
        assert!(result.is_err());
    }

    #[test]
    fn test_chunked_sparse_add() {
        // Create two test matrices
        let rowsa = vec![0, 1, 2];
        let cols_a = vec![0, 1, 2];
        let data_a = vec![1.0, 2.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rowsa, &cols_a, &data_a, (3, 3), false)
            .expect("Operation failed");

        let rowsb = vec![0, 1, 2];
        let cols_b = vec![0, 1, 2];
        let data_b = vec![4.0, 5.0, 6.0];
        let matrix_b = CsrArray::from_triplets(&rowsb, &cols_b, &data_b, (3, 3), false)
            .expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result =
            ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 2, Some(&mut tracker))
                .expect("Operation failed");

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check values: A + B should have diagonal elements [5, 7, 9]
        assert_relative_eq!(result.get(0, 0), 5.0);
        assert_relative_eq!(result.get(1, 1), 7.0);
        assert_relative_eq!(result.get(2, 2), 9.0);
    }

    #[test]
    fn test_chunked_sparse_scale() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result = ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 2, Some(&mut tracker))
            .expect("Operation failed");

        // Check result dimensions
        assert_eq!(result.shape(), (3, 3));

        // Check scaled values
        assert_relative_eq!(result.get(0, 0), 2.0);
        assert_relative_eq!(result.get(1, 1), 4.0);
        assert_relative_eq!(result.get(2, 2), 6.0);
    }

    #[test]
    fn test_chunked_format_conversion() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(10000);
        let result = ChunkedOperations::chunked_format_conversion(&matrix, 2, Some(&mut tracker))
            .expect("Operation failed");

        // Should be identical to original
        assert_eq!(result.shape(), matrix.shape());
        assert_eq!(result.nnz(), matrix.nnz());

        // Check values preserved
        assert_relative_eq!(result.get(0, 0), 1.0);
        assert_relative_eq!(result.get(1, 1), 2.0);
        assert_relative_eq!(result.get(2, 2), 3.0);
    }

    #[test]
    fn test_bandwidth_reduction() {
        // Create a matrix with some structure
        let rows = vec![0, 0, 1, 1, 2, 2, 3, 3];
        let cols = vec![0, 3, 1, 2, 1, 2, 0, 3];
        let data = vec![1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 4.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (4, 4), false).expect("Operation failed");

        let mut tracker = MemoryTracker::new(100000);
        let (ordering, reordered) =
            ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker))
                .expect("Operation failed");

        // Check that we got an ordering
        assert_eq!(ordering.len(), 4);

        // Check that reordered matrix has same dimensions
        assert_eq!(reordered.shape(), (4, 4));
        assert_eq!(reordered.nnz(), matrix.nnz());

        // Check that it's a valid permutation
        let mut sorted_ordering = ordering.clone();
        sorted_ordering.sort_unstable();
        assert_eq!(sorted_ordering, vec![0, 1, 2, 3]);
    }

    #[test]
    fn test_chunked_operations_memory_limit() {
        let rows = vec![0, 1, 2];
        let cols = vec![0, 1, 2];
        let data = vec![1.0, 2.0, 3.0];
        let matrix =
            CsrArray::from_triplets(&rows, &cols, &data, (3, 3), false).expect("Operation failed");

        // Set very small memory limit
        let mut tracker = MemoryTracker::new(10);

        // All chunked operations should fail with insufficient memory
        assert!(
            ChunkedOperations::chunked_sparse_scale(&matrix, 2.0, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(
            ChunkedOperations::chunked_format_conversion(&matrix, 1, Some(&mut tracker)).is_err()
        );

        tracker = MemoryTracker::new(10); // Reset
        assert!(ChunkedOperations::bandwidth_reduction(&matrix, Some(&mut tracker)).is_err());
    }

    #[test]
    fn test_chunked_add_different_sparsity_patterns() {
        // Create matrices with different sparsity patterns
        let rowsa = vec![0, 2];
        let cols_a = vec![0, 2];
        let data_a = vec![1.0, 3.0];
        let matrix_a = CsrArray::from_triplets(&rowsa, &cols_a, &data_a, (3, 3), false)
            .expect("Operation failed");

        let rowsb = vec![1, 2];
        let cols_b = vec![1, 0];
        let data_b = vec![2.0, 1.0];
        let matrix_b = CsrArray::from_triplets(&rowsb, &cols_b, &data_b, (3, 3), false)
            .expect("Operation failed");

        let result = ChunkedOperations::chunked_sparse_add(&matrix_a, &matrix_b, 1, None)
            .expect("Operation failed");

        // Check that all elements are preserved
        assert_relative_eq!(result.get(0, 0), 1.0); // From A
        assert_relative_eq!(result.get(1, 1), 2.0); // From B
        assert_relative_eq!(result.get(2, 2), 3.0); // From A
        assert_relative_eq!(result.get(2, 0), 1.0); // From B
    }
}