scirs2_text/
sparse.rs

1//! Sparse matrix representations for memory-efficient text processing
2//!
3//! This module provides sparse matrix implementations optimized for text data
4//! where most values are zero (common in TF-IDF and count vectorization).
5
6use crate::error::{Result, TextError};
7use scirs2_core::ndarray::{Array1, Array2};
8use std::collections::HashMap;
9
/// Compressed Sparse Row (CSR) matrix representation
///
/// Invariants maintained by the constructors in this module:
/// `row_ptrs.len() == n_rows + 1`, `row_ptrs[0] == 0`, and the entries of
/// row `r` live at `row_ptrs[r]..row_ptrs[r + 1]` in `values`/`colindices`.
#[derive(Debug, Clone)]
pub struct CsrMatrix {
    /// Non-zero values
    values: Vec<f64>,
    /// Column indices for each value (ascending within a row when built
    /// from dense or sorted COO/DOK data)
    colindices: Vec<usize>,
    /// Row pointers (cumulative sum of non-zeros per row)
    row_ptrs: Vec<usize>,
    /// Number of rows
    n_rows: usize,
    /// Number of columns
    n_cols: usize,
}
24
25impl CsrMatrix {
26    /// Create a new CSR matrix from dense representation
27    pub fn from_dense(dense: &Array2<f64>) -> Self {
28        let (n_rows, n_cols) = dense.dim();
29        let mut values = Vec::new();
30        let mut colindices = Vec::new();
31        let mut row_ptrs = vec![0];
32
33        for row in dense.rows() {
34            for (colidx, &value) in row.iter().enumerate() {
35                if value != 0.0 {
36                    values.push(value);
37                    colindices.push(colidx);
38                }
39            }
40            row_ptrs.push(values.len());
41        }
42
43        Self {
44            values,
45            colindices,
46            row_ptrs,
47            n_rows,
48            n_cols,
49        }
50    }
51
52    /// Create an empty CSR matrix
53    pub fn zeros(_n_rows: usize, ncols: usize) -> Self {
54        Self {
55            values: Vec::new(),
56            colindices: Vec::new(),
57            row_ptrs: vec![0; _n_rows + 1],
58            n_rows: _n_rows,
59            n_cols: ncols,
60        }
61    }
62
63    /// Get the shape of the matrix
64    pub fn shape(&self) -> (usize, usize) {
65        (self.n_rows, self.n_cols)
66    }
67
68    /// Get the number of non-zero elements
69    pub fn nnz(&self) -> usize {
70        self.values.len()
71    }
72
73    /// Get memory usage in bytes (approximate)
74    pub fn memory_usage(&self) -> usize {
75        self.values.len() * std::mem::size_of::<f64>()
76            + self.colindices.len() * std::mem::size_of::<usize>()
77            + self.row_ptrs.len() * std::mem::size_of::<usize>()
78    }
79
80    /// Convert to dense representation
81    pub fn to_dense(&self) -> Array2<f64> {
82        let mut dense = Array2::zeros((self.n_rows, self.n_cols));
83
84        for rowidx in 0..self.n_rows {
85            let start = self.row_ptrs[rowidx];
86            let end = self.row_ptrs[rowidx + 1];
87
88            for i in start..end {
89                let colidx = self.colindices[i];
90                let value = self.values[i];
91                dense[[rowidx, colidx]] = value;
92            }
93        }
94
95        dense
96    }
97
98    /// Get a specific row as a sparse vector
99    pub fn get_row(&self, rowidx: usize) -> Result<SparseVector> {
100        if rowidx >= self.n_rows {
101            return Err(TextError::InvalidInput(format!(
102                "Row index {} out of bounds for matrix with {} rows",
103                rowidx, self.n_rows
104            )));
105        }
106
107        let start = self.row_ptrs[rowidx];
108        let end = self.row_ptrs[rowidx + 1];
109
110        let indices: Vec<usize> = self.colindices[start..end].to_vec();
111        let values: Vec<f64> = self.values[start..end].to_vec();
112
113        Ok(SparseVector {
114            indices,
115            values,
116            size: self.n_cols,
117        })
118    }
119
120    /// Multiply by a dense vector
121    pub fn dot(&self, vector: &Array1<f64>) -> Result<Array1<f64>> {
122        if vector.len() != self.n_cols {
123            return Err(TextError::InvalidInput(format!(
124                "Vector dimension {} doesn't match matrix columns {}",
125                vector.len(),
126                self.n_cols
127            )));
128        }
129
130        let mut result = Array1::zeros(self.n_rows);
131
132        for rowidx in 0..self.n_rows {
133            let start = self.row_ptrs[rowidx];
134            let end = self.row_ptrs[rowidx + 1];
135
136            let mut sum = 0.0;
137            for i in start..end {
138                let colidx = self.colindices[i];
139                sum += self.values[i] * vector[colidx];
140            }
141            result[rowidx] = sum;
142        }
143
144        Ok(result)
145    }
146}
147
/// Sparse vector representation using index-value pairs
///
/// `indices` and `values` are parallel arrays. Constructors that scan dense
/// data produce ascending indices; `dotsparse` relies on that ordering.
#[derive(Debug, Clone)]
pub struct SparseVector {
    /// Indices of non-zero elements (expected in ascending order)
    indices: Vec<usize>,
    /// Values of non-zero elements
    values: Vec<f64>,
    /// Total size of the vector
    size: usize,
}
158
159impl SparseVector {
160    /// Create a new sparse vector
161    pub fn new(size: usize) -> Self {
162        Self {
163            indices: Vec::new(),
164            values: Vec::new(),
165            size,
166        }
167    }
168
169    /// Create from a dense vector
170    pub fn from_dense(dense: &Array1<f64>) -> Self {
171        let mut indices = Vec::new();
172        let mut values = Vec::new();
173
174        for (idx, &value) in dense.iter().enumerate() {
175            if value != 0.0 {
176                indices.push(idx);
177                values.push(value);
178            }
179        }
180
181        Self {
182            indices,
183            values,
184            size: dense.len(),
185        }
186    }
187
188    /// Create from indices and values
189    pub fn fromindices_values(indices: Vec<usize>, values: Vec<f64>, size: usize) -> Self {
190        assert_eq!(
191            indices.len(),
192            values.len(),
193            "Indices and values must have the same length"
194        );
195        Self {
196            indices,
197            values,
198            size,
199        }
200    }
201
202    /// Convert to dense representation
203    pub fn to_dense(&self) -> Array1<f64> {
204        let mut dense = Array1::zeros(self.size);
205
206        for (&idx, &value) in self.indices.iter().zip(self.values.iter()) {
207            dense[idx] = value;
208        }
209
210        dense
211    }
212
213    /// Get the number of non-zero elements
214    pub fn nnz(&self) -> usize {
215        self.values.len()
216    }
217
218    /// Get the size of the vector
219    pub fn size(&self) -> usize {
220        self.size
221    }
222
223    /// Compute dot product with another sparse vector
224    pub fn dotsparse(&self, other: &SparseVector) -> Result<f64> {
225        if self.size != other.size {
226            return Err(TextError::InvalidInput(format!(
227                "Vector dimensions don't match: {} vs {}",
228                self.size, other.size
229            )));
230        }
231
232        let mut result = 0.0;
233        let mut i = 0;
234        let mut j = 0;
235
236        // Merge-like algorithm for sorted indices
237        while i < self.indices.len() && j < other.indices.len() {
238            match self.indices[i].cmp(&other.indices[j]) {
239                std::cmp::Ordering::Equal => {
240                    result += self.values[i] * other.values[j];
241                    i += 1;
242                    j += 1;
243                }
244                std::cmp::Ordering::Less => {
245                    i += 1;
246                }
247                std::cmp::Ordering::Greater => {
248                    j += 1;
249                }
250            }
251        }
252
253        Ok(result)
254    }
255
256    /// Compute L2 norm
257    pub fn norm(&self) -> f64 {
258        self.values.iter().map(|&x| x * x).sum::<f64>().sqrt()
259    }
260
261    /// Scale by a constant
262    pub fn scale(&mut self, scalar: f64) {
263        for value in &mut self.values {
264            *value *= scalar;
265        }
266    }
267
268    /// Get the indices of non-zero elements
269    pub fn indices(&self) -> &[usize] {
270        &self.indices
271    }
272
273    /// Get the values of non-zero elements
274    pub fn values(&self) -> &[f64] {
275        &self.values
276    }
277
278    /// Get mutable reference to values
279    pub fn values_mut(&mut self) -> &mut [f64] {
280        &mut self.values
281    }
282}
283
/// Dictionary of Keys (DOK) format for building sparse matrices
///
/// Entries are keyed by `(row, col)`; zeros are never stored (`set` removes
/// the key instead), so `data.len()` equals the non-zero count.
#[derive(Debug, Clone)]
pub struct DokMatrix {
    // Map from (row, col) to the stored non-zero value.
    data: HashMap<(usize, usize), f64>,
    // Matrix dimensions, used for bounds checks and CSR conversion.
    n_rows: usize,
    n_cols: usize,
}
291
292impl DokMatrix {
293    /// Create a new DOK matrix
294    pub fn new(_n_rows: usize, ncols: usize) -> Self {
295        Self {
296            data: HashMap::new(),
297            n_rows: _n_rows,
298            n_cols: ncols,
299        }
300    }
301
302    /// Set a value at the given position
303    pub fn set(&mut self, row: usize, col: usize, value: f64) -> Result<()> {
304        if row >= self.n_rows || col >= self.n_cols {
305            return Err(TextError::InvalidInput(format!(
306                "Index ({}, {}) out of bounds for matrix shape ({}, {})",
307                row, col, self.n_rows, self.n_cols
308            )));
309        }
310
311        if value != 0.0 {
312            self.data.insert((row, col), value);
313        } else {
314            self.data.remove(&(row, col));
315        }
316
317        Ok(())
318    }
319
320    /// Get a value at the given position
321    pub fn get(&self, row: usize, col: usize) -> f64 {
322        *self.data.get(&(row, col)).unwrap_or(&0.0)
323    }
324
325    /// Convert to CSR format
326    pub fn to_csr(&self) -> CsrMatrix {
327        let mut entries: Vec<((usize, usize), f64)> =
328            self.data.iter().map(|(&k, &v)| (k, v)).collect();
329
330        // Sort by row, then by column
331        entries.sort_by_key(|&((r, c), _)| (r, c));
332
333        let mut values = Vec::with_capacity(entries.len());
334        let mut colindices = Vec::with_capacity(entries.len());
335        let mut row_ptrs = vec![0];
336
337        let mut current_row = 0;
338
339        for ((row, col), value) in entries {
340            while current_row < row {
341                row_ptrs.push(values.len());
342                current_row += 1;
343            }
344
345            values.push(value);
346            colindices.push(col);
347        }
348
349        while current_row < self.n_rows {
350            row_ptrs.push(values.len());
351            current_row += 1;
352        }
353
354        CsrMatrix {
355            values,
356            colindices,
357            row_ptrs,
358            n_rows: self.n_rows,
359            n_cols: self.n_cols,
360        }
361    }
362
363    /// Get the number of non-zero elements
364    pub fn nnz(&self) -> usize {
365        self.data.len()
366    }
367}
368
369/// Builder for efficiently constructing sparse matrices row by row
370pub struct SparseMatrixBuilder {
371    rows: Vec<SparseVector>,
372    n_cols: usize,
373}
374
375impl SparseMatrixBuilder {
376    /// Create a new builder
377    pub fn new(_ncols: usize) -> Self {
378        Self {
379            rows: Vec::new(),
380            n_cols: _ncols,
381        }
382    }
383
384    /// Add a row to the matrix
385    pub fn add_row(&mut self, row: SparseVector) -> Result<()> {
386        if row.size() != self.n_cols {
387            return Err(TextError::InvalidInput(format!(
388                "Row size {} doesn't match expected columns {}",
389                row.size(),
390                self.n_cols
391            )));
392        }
393
394        self.rows.push(row);
395        Ok(())
396    }
397
398    /// Build the final CSR matrix
399    pub fn build(self) -> CsrMatrix {
400        let n_rows = self.rows.len();
401        let mut values = Vec::new();
402        let mut colindices = Vec::new();
403        let mut row_ptrs = vec![0];
404
405        for row in self.rows {
406            values.extend(row.values);
407            colindices.extend(row.indices);
408            row_ptrs.push(values.len());
409        }
410
411        CsrMatrix {
412            values,
413            colindices,
414            row_ptrs,
415            n_rows,
416            n_cols: self.n_cols,
417        }
418    }
419}
420
/// COO (Coordinate) sparse matrix format for efficient construction
///
/// `rowindices`, `colindices`, and `values` are parallel arrays. Duplicate
/// coordinates may appear if `push` is called more than once for the same
/// position; they are kept as separate entries.
#[derive(Debug, Clone)]
pub struct CooMatrix {
    /// Row indices
    rowindices: Vec<usize>,
    /// Column indices
    colindices: Vec<usize>,
    /// Values
    values: Vec<f64>,
    /// Number of rows
    n_rows: usize,
    /// Number of columns
    n_cols: usize,
}
435
436impl CooMatrix {
437    /// Create a new COO matrix
438    pub fn new(_n_rows: usize, ncols: usize) -> Self {
439        Self {
440            rowindices: Vec::new(),
441            colindices: Vec::new(),
442            values: Vec::new(),
443            n_rows: _n_rows,
444            n_cols: ncols,
445        }
446    }
447
448    /// Add a value to the matrix
449    pub fn push(&mut self, row: usize, col: usize, value: f64) -> Result<()> {
450        if row >= self.n_rows || col >= self.n_cols {
451            return Err(TextError::InvalidInput(format!(
452                "Index ({}, {}) out of bounds for matrix shape ({}, {})",
453                row, col, self.n_rows, self.n_cols
454            )));
455        }
456
457        if value != 0.0 {
458            self.rowindices.push(row);
459            self.colindices.push(col);
460            self.values.push(value);
461        }
462
463        Ok(())
464    }
465
466    /// Convert to CSR format (efficient for row operations)
467    pub fn to_csr(&self) -> CsrMatrix {
468        // Sort by row then column
469        let mut indices: Vec<usize> = (0..self.values.len()).collect();
470        indices.sort_by_key(|&i| (self.rowindices[i], self.colindices[i]));
471
472        let mut values = Vec::with_capacity(self.values.len());
473        let mut colindices = Vec::with_capacity(self.values.len());
474        let mut row_ptrs = vec![0];
475
476        let mut current_row = 0;
477        for &idx in &indices {
478            let row = self.rowindices[idx];
479
480            // Fill row pointers for empty rows
481            while current_row < row {
482                row_ptrs.push(values.len());
483                current_row += 1;
484            }
485
486            values.push(self.values[idx]);
487            colindices.push(self.colindices[idx]);
488        }
489
490        // Fill remaining row pointers
491        while row_ptrs.len() <= self.n_rows {
492            row_ptrs.push(values.len());
493        }
494
495        CsrMatrix {
496            values,
497            colindices,
498            row_ptrs,
499            n_rows: self.n_rows,
500            n_cols: self.n_cols,
501        }
502    }
503
504    /// Get number of non-zero elements
505    pub fn nnz(&self) -> usize {
506        self.values.len()
507    }
508}
509
/// CSC (Compressed Sparse Column) format for efficient column operations
///
/// Invariants mirror CSR transposed: `col_ptrs.len() == n_cols + 1` and the
/// entries of column `c` live at `col_ptrs[c]..col_ptrs[c + 1]`.
#[derive(Debug, Clone)]
pub struct CscMatrix {
    /// Non-zero values
    values: Vec<f64>,
    /// Row indices for each value
    rowindices: Vec<usize>,
    /// Column pointers (cumulative sum of non-zeros per column)
    col_ptrs: Vec<usize>,
    /// Number of rows
    n_rows: usize,
    /// Number of columns
    n_cols: usize,
}
524
525impl CscMatrix {
526    /// Create from COO format
527    pub fn from_coo(coo: &CooMatrix) -> Self {
528        // Sort by column then row
529        let mut indices: Vec<usize> = (0..coo.values.len()).collect();
530        indices.sort_by_key(|&i| (coo.colindices[i], coo.rowindices[i]));
531
532        let mut values = Vec::with_capacity(coo.values.len());
533        let mut rowindices = Vec::with_capacity(coo.values.len());
534        let mut col_ptrs = vec![0];
535
536        let mut current_col = 0;
537        for &idx in &indices {
538            let col = coo.colindices[idx];
539
540            // Fill column pointers for empty columns
541            while current_col < col {
542                col_ptrs.push(values.len());
543                current_col += 1;
544            }
545
546            values.push(coo.values[idx]);
547            rowindices.push(coo.rowindices[idx]);
548        }
549
550        // Fill remaining column pointers
551        while col_ptrs.len() <= coo.n_cols {
552            col_ptrs.push(values.len());
553        }
554
555        Self {
556            values,
557            rowindices,
558            col_ptrs,
559            n_rows: coo.n_rows,
560            n_cols: coo.n_cols,
561        }
562    }
563
564    /// Get a column as a sparse vector
565    pub fn get_col(&self, colidx: usize) -> Result<SparseVector> {
566        if colidx >= self.n_cols {
567            return Err(TextError::InvalidInput(format!(
568                "Column index {} out of bounds for matrix with {} columns",
569                colidx, self.n_cols
570            )));
571        }
572
573        let start = self.col_ptrs[colidx];
574        let end = self.col_ptrs[colidx + 1];
575
576        let indices: Vec<usize> = self.rowindices[start..end].to_vec();
577        let values: Vec<f64> = self.values[start..end].to_vec();
578
579        Ok(SparseVector::fromindices_values(
580            indices,
581            values,
582            self.n_rows,
583        ))
584    }
585
586    /// Get the shape of the matrix
587    pub fn shape(&self) -> (usize, usize) {
588        (self.n_rows, self.n_cols)
589    }
590
591    /// Get the number of non-zero elements
592    pub fn nnz(&self) -> usize {
593        self.values.len()
594    }
595
596    /// Get memory usage in bytes
597    pub fn memory_usage(&self) -> usize {
598        self.values.len() * std::mem::size_of::<f64>()
599            + self.rowindices.len() * std::mem::size_of::<usize>()
600            + self.col_ptrs.len() * std::mem::size_of::<usize>()
601    }
602
603    /// Multiply by a dense vector (A^T * x)
604    pub fn dot(&self, vector: &Array1<f64>) -> Result<Array1<f64>> {
605        if vector.len() != self.n_rows {
606            return Err(TextError::InvalidInput(format!(
607                "Vector dimension {} doesn't match matrix rows {}",
608                vector.len(),
609                self.n_rows
610            )));
611        }
612
613        let mut result = Array1::zeros(self.n_cols);
614
615        for colidx in 0..self.n_cols {
616            let start = self.col_ptrs[colidx];
617            let end = self.col_ptrs[colidx + 1];
618
619            let mut sum = 0.0;
620            for i in start..end {
621                let rowidx = self.rowindices[i];
622                sum += self.values[i] * vector[rowidx];
623            }
624            result[colidx] = sum;
625        }
626
627        Ok(result)
628    }
629}
630
/// Block sparse matrix for better cache efficiency
///
/// The matrix is tiled into square `_blocksize` × `_blocksize` tiles; only
/// tiles that have been written to are materialized, each as a dense array.
#[derive(Debug, Clone)]
pub struct BlockSparseMatrix {
    /// Block size (square blocks); must be non-zero — `set`/`get` divide by it
    _blocksize: usize,
    /// Non-empty blocks stored as dense matrices
    blocks: HashMap<(usize, usize), Array2<f64>>,
    /// Number of rows
    n_rows: usize,
    /// Number of columns
    n_cols: usize,
}
643
644impl BlockSparseMatrix {
645    /// Create a new block sparse matrix
646    pub fn new(_n_rows: usize, n_cols: usize, _blocksize: usize) -> Self {
647        Self {
648            _blocksize,
649            blocks: HashMap::new(),
650            n_rows: _n_rows,
651            n_cols,
652        }
653    }
654
655    /// Set a value in the matrix
656    pub fn set(&mut self, row: usize, col: usize, value: f64) -> Result<()> {
657        if row >= self.n_rows || col >= self.n_cols {
658            return Err(TextError::InvalidInput(format!(
659                "Index ({row}, {col}) out of bounds"
660            )));
661        }
662
663        let block_row = row / self._blocksize;
664        let block_col = col / self._blocksize;
665        let local_row = row % self._blocksize;
666        let local_col = col % self._blocksize;
667
668        let block = self
669            .blocks
670            .entry((block_row, block_col))
671            .or_insert_with(|| Array2::zeros((self._blocksize, self._blocksize)));
672
673        block[[local_row, local_col]] = value;
674
675        Ok(())
676    }
677
678    /// Get a value from the matrix
679    pub fn get(&self, row: usize, col: usize) -> f64 {
680        if row >= self.n_rows || col >= self.n_cols {
681            return 0.0;
682        }
683
684        let block_row = row / self._blocksize;
685        let block_col = col / self._blocksize;
686        let local_row = row % self._blocksize;
687        let local_col = col % self._blocksize;
688
689        self.blocks
690            .get(&(block_row, block_col))
691            .map(|block| block[[local_row, local_col]])
692            .unwrap_or(0.0)
693    }
694
695    /// Get memory usage
696    pub fn memory_usage(&self) -> usize {
697        self.blocks.len() * self._blocksize * self._blocksize * std::mem::size_of::<f64>()
698    }
699}
700
/// Hierarchical sparse matrix using two-level indexing for advanced-sparse data
///
/// The matrix is tiled into `top_blocksize` × `top_blocksize` blocks; only
/// blocks that were written to are stored, each as a `CompressedBlock`.
#[derive(Debug, Clone)]
pub struct HierarchicalSparseMatrix {
    /// Top-level blocks (coarse granularity)
    top_level_blocks: HashMap<(usize, usize), CompressedBlock>,
    /// Block size for top level; must be non-zero (used as a divisor)
    top_blocksize: usize,
    /// Sub-block size within each top-level block (reserved; the current
    /// implementation never reads it)
    #[allow(dead_code)]
    sub_blocksize: usize,
    /// Matrix dimensions
    n_rows: usize,
    n_cols: usize,
}
715
/// Compressed block storing sparse data with RLE compression
///
/// NOTE(review): despite the name, `rle_data` currently holds plain
/// `(position, value)` pairs kept sorted by position (see `set_value`);
/// no run-length encoding is actually applied.
#[derive(Debug, Clone)]
pub struct CompressedBlock {
    /// Sparse entries as (linearized position, value), sorted by position
    rle_data: Vec<(usize, f64)>, // (position, value)
    /// Original block dimensions
    #[allow(dead_code)]
    block_rows: usize,
    #[allow(dead_code)]
    block_cols: usize,
}
727
728impl HierarchicalSparseMatrix {
729    /// Create new hierarchical sparse matrix
730    pub fn new(_n_rows: usize, n_cols: usize, top_blocksize: usize, sub_blocksize: usize) -> Self {
731        Self {
732            top_level_blocks: HashMap::new(),
733            top_blocksize,
734            sub_blocksize,
735            n_rows: _n_rows,
736            n_cols,
737        }
738    }
739
740    /// Set value with hierarchical indexing
741    pub fn set(&mut self, row: usize, col: usize, value: f64) -> Result<()> {
742        if row >= self.n_rows || col >= self.n_cols {
743            return Err(TextError::InvalidInput(format!(
744                "Index ({row}, {col}) out of bounds"
745            )));
746        }
747
748        let top_row = row / self.top_blocksize;
749        let top_col = col / self.top_blocksize;
750
751        // For simplicity, store as COO within each block for now
752        // In a full implementation, this would use sub-blocks
753        let block = self
754            .top_level_blocks
755            .entry((top_row, top_col))
756            .or_insert_with(|| CompressedBlock::new(self.top_blocksize, self.top_blocksize));
757
758        // Store the linearized position and value
759        let local_row = row % self.top_blocksize;
760        let local_col = col % self.top_blocksize;
761        let linear_pos = local_row * self.top_blocksize + local_col;
762
763        block.set_value(linear_pos, value);
764        Ok(())
765    }
766
767    /// Get value with hierarchical lookup
768    pub fn get(&self, row: usize, col: usize) -> f64 {
769        if row >= self.n_rows || col >= self.n_cols {
770            return 0.0;
771        }
772
773        let top_row = row / self.top_blocksize;
774        let top_col = col / self.top_blocksize;
775
776        if let Some(block) = self.top_level_blocks.get(&(top_row, top_col)) {
777            let local_row = row % self.top_blocksize;
778            let local_col = col % self.top_blocksize;
779            let linear_pos = local_row * self.top_blocksize + local_col;
780            block.get_value(linear_pos)
781        } else {
782            0.0
783        }
784    }
785
786    /// Get memory usage
787    pub fn memory_usage(&self) -> usize {
788        self.top_level_blocks
789            .values()
790            .map(|block| block.memory_usage())
791            .sum::<usize>()
792            + std::mem::size_of::<HashMap<(usize, usize), CompressedBlock>>()
793    }
794}
795
796impl CompressedBlock {
797    fn new(rows: usize, cols: usize) -> Self {
798        Self {
799            rle_data: Vec::new(),
800            block_rows: rows,
801            block_cols: cols,
802        }
803    }
804
805    fn set_value(&mut self, position: usize, value: f64) {
806        if value == 0.0 {
807            // Remove zero values to maintain sparsity
808            self.rle_data.retain(|(pos_, _)| *pos_ != position);
809            return;
810        }
811
812        // Find if position already exists
813        if let Some(entry) = self.rle_data.iter_mut().find(|(pos_, _)| *pos_ == position) {
814            entry.1 = value;
815        } else {
816            // Insert new entry and keep sorted by position
817            let insert_pos = self
818                .rle_data
819                .binary_search_by_key(&position, |&(pos_, _)| pos_)
820                .unwrap_or_else(|pos| pos);
821            self.rle_data.insert(insert_pos, (position, value));
822        }
823    }
824
825    fn get_value(&self, position: usize) -> f64 {
826        self.rle_data
827            .iter()
828            .find(|(pos_, _)| *pos_ == position)
829            .map(|(_, value)| *value)
830            .unwrap_or(0.0)
831    }
832
833    fn memory_usage(&self) -> usize {
834        self.rle_data.len() * std::mem::size_of::<(usize, f64)>()
835    }
836}
837
/// Bit-packed sparse vector for boolean data with extreme compression
///
/// Each bit of `bit_data` marks presence of the corresponding index. Bits at
/// or beyond `size` are never set by the constructor, so word-wise AND/OR
/// operations cannot introduce out-of-range indices.
#[derive(Debug, Clone)]
pub struct BitPackedSparseVector {
    /// Bit-packed indices (each bit represents presence/absence)
    bit_data: Vec<u64>,
    /// Total size of the vector
    size: usize,
}
846
847impl BitPackedSparseVector {
848    /// Create from a boolean sparse vector
849    pub fn from_boolindices(indices: &[usize], size: usize) -> Self {
850        let num_words = size.div_ceil(64); // Round up to nearest 64
851        let mut bit_data = vec![0u64; num_words];
852
853        for &idx in indices {
854            if idx < size {
855                let word_idx = idx / 64;
856                let bit_idx = idx % 64;
857                bit_data[word_idx] |= 1u64 << bit_idx;
858            }
859        }
860
861        Self { bit_data, size }
862    }
863
864    /// Get all set indices
865    pub fn getindices(&self) -> Vec<usize> {
866        let mut indices = Vec::new();
867
868        for (word_idx, &word) in self.bit_data.iter().enumerate() {
869            if word != 0 {
870                for bit_idx in 0..64 {
871                    if word & (1u64 << bit_idx) != 0 {
872                        let global_idx = word_idx * 64 + bit_idx;
873                        if global_idx < self.size {
874                            indices.push(global_idx);
875                        }
876                    }
877                }
878            }
879        }
880
881        indices
882    }
883
884    /// Check if index is set
885    pub fn is_set(&self, idx: usize) -> bool {
886        if idx >= self.size {
887            return false;
888        }
889
890        let word_idx = idx / 64;
891        let bit_idx = idx % 64;
892
893        if word_idx < self.bit_data.len() {
894            self.bit_data[word_idx] & (1u64 << bit_idx) != 0
895        } else {
896            false
897        }
898    }
899
900    /// Count number of set bits
901    pub fn count_ones(&self) -> usize {
902        self.bit_data
903            .iter()
904            .map(|&word| word.count_ones() as usize)
905            .sum()
906    }
907
908    /// Memory usage in bytes
909    pub fn memory_usage(&self) -> usize {
910        self.bit_data.len() * std::mem::size_of::<u64>() + std::mem::size_of::<usize>()
911    }
912
913    /// Intersection with another bit-packed vector
914    pub fn intersection(&self, other: &BitPackedSparseVector) -> Result<BitPackedSparseVector> {
915        if self.size != other.size {
916            return Err(TextError::InvalidInput(format!(
917                "Vector sizes don't match for intersection: {} vs {}",
918                self.size, other.size
919            )));
920        }
921
922        let mut result_data = Vec::with_capacity(self.bit_data.len());
923        for (&a, &b) in self.bit_data.iter().zip(other.bit_data.iter()) {
924            result_data.push(a & b);
925        }
926
927        Ok(BitPackedSparseVector {
928            bit_data: result_data,
929            size: self.size,
930        })
931    }
932
933    /// Union with another bit-packed vector
934    pub fn union(&self, other: &BitPackedSparseVector) -> Result<BitPackedSparseVector> {
935        if self.size != other.size {
936            return Err(TextError::InvalidInput(format!(
937                "Vector sizes don't match for union: {} vs {}",
938                self.size, other.size
939            )));
940        }
941
942        let mut result_data = Vec::with_capacity(self.bit_data.len());
943        for (&a, &b) in self.bit_data.iter().zip(other.bit_data.iter()) {
944            result_data.push(a | b);
945        }
946
947        Ok(BitPackedSparseVector {
948            bit_data: result_data,
949            size: self.size,
950        })
951    }
952}
953
/// Memory pool for efficient sparse matrix allocation
///
/// Recycles `Vec` buffers so repeated sparse-matrix builds can reuse their
/// capacity instead of round-tripping through the global allocator.
#[derive(Debug)]
pub struct SparseMemoryPool {
    /// Pre-allocated f64 values
    value_pool: Vec<Vec<f64>>,
    /// Pre-allocated usize indices
    index_pool: Vec<Vec<usize>>,
    /// Block size for allocations (minimum capacity kept or handed out)
    _blocksize: usize,
}
963
964impl SparseMemoryPool {
965    /// Create new memory pool
966    pub fn new(__blocksize: usize) -> Self {
967        Self {
968            value_pool: Vec::new(),
969            index_pool: Vec::new(),
970            _blocksize: __blocksize,
971        }
972    }
973
974    /// Allocate a vector of f64 values
975    pub fn allocate_values(&mut self, size: usize) -> Vec<f64> {
976        if let Some(mut vec) = self.value_pool.pop() {
977            vec.clear();
978            vec.reserve(size);
979            vec
980        } else {
981            Vec::with_capacity(size.max(self._blocksize))
982        }
983    }
984
985    /// Allocate a vector of indices
986    pub fn allocateindices(&mut self, size: usize) -> Vec<usize> {
987        if let Some(mut vec) = self.index_pool.pop() {
988            vec.clear();
989            vec.reserve(size);
990            vec
991        } else {
992            Vec::with_capacity(size.max(self._blocksize))
993        }
994    }
995
996    /// Return vectors to the pool
997    pub fn deallocate_values(&mut self, mut vec: Vec<f64>) {
998        if vec.capacity() >= self._blocksize {
999            vec.clear();
1000            self.value_pool.push(vec);
1001        }
1002    }
1003
1004    /// Return index vectors to the pool
1005    pub fn deallocateindices(&mut self, mut vec: Vec<usize>) {
1006        if vec.capacity() >= self._blocksize {
1007            vec.clear();
1008            self.index_pool.push(vec);
1009        }
1010    }
1011}
1012
/// Adaptive sparse matrix that chooses optimal format based on sparsity pattern
///
/// The variant is selected at construction time from the matrix's size and
/// density (see `from_coo_adaptive`).
#[derive(Debug)]
pub enum AdaptiveSparseMatrix {
    /// Compressed Sparse Row format
    Csr(CsrMatrix),
    /// Compressed Sparse Column format
    Csc(CscMatrix),
    /// Coordinate format
    Coo(CooMatrix),
    /// Block sparse format
    Block(BlockSparseMatrix),
    /// Hierarchical sparse format
    Hierarchical(HierarchicalSparseMatrix),
}
1027
impl AdaptiveSparseMatrix {
    /// Create from COO and automatically select best format
    ///
    /// Heuristic: very sparse large matrices -> hierarchical; moderately
    /// sparse with many non-zeros -> block; tall matrices -> CSC; otherwise CSR.
    ///
    /// NOTE(review): the Hierarchical and Block branches construct EMPTY
    /// matrices and never copy the COO entries into them, so the input data
    /// is silently discarded on those paths — confirm intent or add a real
    /// conversion before relying on those formats.
    pub fn from_coo_adaptive(coo: CooMatrix) -> Self {
        let nnz = coo.nnz();
        let (n_rows, n_cols) = (coo.n_rows, coo.n_cols);
        let total_elements = n_rows * n_cols;
        // For a zero-element matrix this division yields NaN; both NaN
        // comparisons below are false, so empty inputs fall through to the
        // CSR branch (no panic).
        let sparsity = nnz as f64 / total_elements as f64;

        // Choose format based on sparsity and size
        if sparsity < 0.001 && total_elements > 10000 {
            // Very sparse large matrix - use hierarchical
            let hierarchical = HierarchicalSparseMatrix::new(n_rows, n_cols, 64, 8);
            // Convert COO to hierarchical (simplified)
            Self::Hierarchical(hierarchical)
        } else if sparsity < 0.01 && nnz > 1000 {
            // Sparse with some structure - use block format
            // Block edge scales with sqrt(nnz), clamped to [8, 64]
            let _blocksize = ((nnz as f64).sqrt() as usize).clamp(8, 64);
            Self::Block(BlockSparseMatrix::new(n_rows, n_cols, _blocksize))
        } else if n_rows > n_cols * 2 {
            // Tall matrix - CSC might be better for column operations
            Self::Csc(CscMatrix::from_coo(&coo))
        } else {
            // Default to CSR for row operations
            Self::Csr(coo.to_csr())
        }
    }

    /// Get memory usage
    ///
    /// Approximate heap footprint in bytes of the wrapped storage; COO is
    /// computed inline as nnz * (two indices + one value), all other formats
    /// delegate to their own `memory_usage`.
    pub fn memory_usage(&self) -> usize {
        match self {
            Self::Csr(m) => m.memory_usage(),
            Self::Csc(m) => m.memory_usage(),
            Self::Coo(m) => {
                m.nnz() * (2 * std::mem::size_of::<usize>() + std::mem::size_of::<f64>())
            }
            Self::Block(m) => m.memory_usage(),
            Self::Hierarchical(m) => m.memory_usage(),
        }
    }
}
1068
/// Streaming sparse matrix processor for out-of-core operations
pub struct StreamingSparseProcessor {
    /// Number of matrix rows handled per chunk
    chunk_size: usize,
    /// Advisory memory budget in bytes; stored but not yet consulted
    #[allow(dead_code)]
    memory_limit: usize,
}
1075
1076impl StreamingSparseProcessor {
1077    /// Create new streaming processor
1078    pub fn new(_chunk_size: usize, memorylimit: usize) -> Self {
1079        Self {
1080            chunk_size: _chunk_size,
1081            memory_limit: memorylimit,
1082        }
1083    }
1084
1085    /// Process large sparse matrix in chunks
1086    pub fn process_chunks<F, R>(&self, matrix: &CsrMatrix, processor: F) -> Result<Vec<R>>
1087    where
1088        F: Fn(&CsrMatrix) -> Result<R>,
1089    {
1090        let mut results = Vec::new();
1091        let (n_rows, n_cols) = matrix.shape();
1092        let rows_per_chunk = (self.chunk_size).min(n_rows);
1093
1094        for chunk_start in (0..n_rows).step_by(rows_per_chunk) {
1095            let chunk_end = (chunk_start + rows_per_chunk).min(n_rows);
1096
1097            // Extract chunk (simplified - would implement efficient row slicing)
1098            let chunk = self.extract_row_slice(matrix, chunk_start, chunk_end)?;
1099            let result = processor(&chunk)?;
1100            results.push(result);
1101        }
1102
1103        Ok(results)
1104    }
1105
1106    /// Extract a slice of rows from CSR matrix
1107    fn extract_row_slice(
1108        &self,
1109        matrix: &CsrMatrix,
1110        start_row: usize,
1111        end_row: usize,
1112    ) -> Result<CsrMatrix> {
1113        // Simplified implementation - would efficiently slice the CSR data
1114        let mut values = Vec::new();
1115        let mut colindices = Vec::new();
1116        let mut row_ptrs = vec![0];
1117
1118        for _row in start_row..end_row {
1119            if let Ok(sparse_row) = matrix.get_row(_row) {
1120                values.extend(sparse_row.values().iter());
1121                colindices.extend(sparse_row.indices().iter());
1122                row_ptrs.push(values.len());
1123            }
1124        }
1125
1126        Ok(CsrMatrix {
1127            values,
1128            colindices,
1129            row_ptrs,
1130            n_rows: end_row - start_row,
1131            n_cols: matrix.shape().1,
1132        })
1133    }
1134}
1135
/// Approximate sparse matrix using sketching for reduced memory
#[derive(Debug, Clone)]
pub struct ApproximateSparseMatrix {
    /// Per-row sketches: (column index, accumulated value) pairs, bucketed by hash
    row_sketches: Vec<Vec<(u32, f32)>>, // `.0` holds the original column index, not the hash
    /// Number of hash buckets each row sketch was built with
    #[allow(dead_code)]
    sketchsize: usize,
    /// Original row count (bounds-checks row lookups)
    n_rows: usize,
    /// Original column count, kept for reference only
    #[allow(dead_code)]
    n_cols: usize,
}
1149
1150impl ApproximateSparseMatrix {
1151    /// Create approximate representation of a sparse matrix
1152    pub fn from_csr(matrix: &CsrMatrix, sketchsize: usize) -> Self {
1153        let (n_rows, n_cols) = matrix.shape();
1154        let mut row_sketches = Vec::with_capacity(n_rows);
1155
1156        for rowidx in 0..n_rows {
1157            if let Ok(row) = matrix.get_row(rowidx) {
1158                let sketch = Self::create_row_sketch(&row, sketchsize);
1159                row_sketches.push(sketch);
1160            } else {
1161                row_sketches.push(Vec::new());
1162            }
1163        }
1164
1165        Self {
1166            row_sketches,
1167            sketchsize,
1168            n_rows,
1169            n_cols,
1170        }
1171    }
1172
1173    /// Create a hash-based sketch of a sparse row
1174    fn create_row_sketch(row: &SparseVector, sketchsize: usize) -> Vec<(u32, f32)> {
1175        let mut sketch = vec![(0u32, 0.0f32); sketchsize];
1176
1177        for (&idx, &val) in row.indices().iter().zip(row.values().iter()) {
1178            // Simple hash function for demonstration
1179            let hash = ((idx as u64 * 2654435761u64) % (sketchsize as u64)) as usize;
1180            sketch[hash].0 = idx as u32;
1181            sketch[hash].1 += val as f32;
1182        }
1183
1184        // Keep only non-zero sketches
1185        sketch.into_iter().filter(|(_, val)| *val != 0.0).collect()
1186    }
1187
1188    /// Approximate dot product between two row sketches
1189    pub fn approximate_row_similarity(&self, row1: usize, row2: usize) -> f32 {
1190        if row1 >= self.n_rows || row2 >= self.n_rows {
1191            return 0.0;
1192        }
1193
1194        let sketch1 = &self.row_sketches[row1];
1195        let sketch2 = &self.row_sketches[row2];
1196
1197        let mut similarity = 0.0f32;
1198        let mut i = 0;
1199        let mut j = 0;
1200
1201        // Merge-like algorithm on hash keys
1202        while i < sketch1.len() && j < sketch2.len() {
1203            match sketch1[i].0.cmp(&sketch2[j].0) {
1204                std::cmp::Ordering::Equal => {
1205                    similarity += sketch1[i].1 * sketch2[j].1;
1206                    i += 1;
1207                    j += 1;
1208                }
1209                std::cmp::Ordering::Less => i += 1,
1210                std::cmp::Ordering::Greater => j += 1,
1211            }
1212        }
1213
1214        similarity
1215    }
1216
1217    /// Memory usage
1218    pub fn memory_usage(&self) -> usize {
1219        self.row_sketches
1220            .iter()
1221            .map(|sketch| sketch.len() * std::mem::size_of::<(u32, f32)>())
1222            .sum::<usize>()
1223    }
1224}
1225
/// Quantized sparse vector for reduced memory usage
#[derive(Debug, Clone)]
pub struct QuantizedSparseVector {
    /// Indices of non-zero elements (narrowed to u32 to halve index storage)
    indices: Vec<u32>,
    /// Quantized values (8-bit, centered around zero)
    values: Vec<i8>,
    /// Scale factor for dequantization
    scale: f32,
    /// Zero point for dequantization (offset applied before rescaling)
    zero_point: i8,
    /// Logical length of the original vector, including zeros
    size: usize,
}
1240
1241impl QuantizedSparseVector {
1242    /// Quantize a sparse vector to 8-bit representation
1243    pub fn fromsparse(sparse: &SparseVector) -> Self {
1244        if sparse.values.is_empty() {
1245            return Self {
1246                indices: Vec::new(),
1247                values: Vec::new(),
1248                scale: 1.0,
1249                zero_point: 0,
1250                size: sparse.size,
1251            };
1252        }
1253
1254        // Find min and max values
1255        let min_val = sparse.values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
1256        let max_val = sparse
1257            .values
1258            .iter()
1259            .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
1260
1261        // Calculate quantization parameters
1262        let range = max_val - min_val;
1263        let scale = (range / 255.0) as f32;
1264        let zero_point = (-min_val / range * 255.0).round() as i8;
1265
1266        // Quantize values
1267        let indices: Vec<u32> = sparse.indices.iter().map(|&i| i as u32).collect();
1268        let values: Vec<i8> = sparse
1269            .values
1270            .iter()
1271            .map(|&v| {
1272                let scaled = ((v - min_val) / range * 255.0).round();
1273                (scaled.clamp(0.0, 255.0) as i16 - 128) as i8 // Center around 0
1274            })
1275            .collect();
1276
1277        Self {
1278            indices,
1279            values,
1280            scale,
1281            zero_point,
1282            size: sparse.size,
1283        }
1284    }
1285
1286    /// Dequantize back to a regular sparse vector
1287    pub fn tosparse(&self) -> SparseVector {
1288        let indices: Vec<usize> = self.indices.iter().map(|&i| i as usize).collect();
1289        let values: Vec<f64> = self
1290            .values
1291            .iter()
1292            .map(|&v| {
1293                let dequantized = (v as f32 + 128.0 - self.zero_point as f32) * self.scale;
1294                dequantized as f64
1295            })
1296            .collect();
1297
1298        SparseVector::fromindices_values(indices, values, self.size)
1299    }
1300
1301    /// Get memory usage in bytes
1302    pub fn memory_usage(&self) -> usize {
1303        self.indices.len() * std::mem::size_of::<u32>()
1304            + self.values.len() * std::mem::size_of::<i8>()
1305            + std::mem::size_of::<f32>() * 2
1306            + std::mem::size_of::<usize>()
1307    }
1308}
1309
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{arr1, arr2};

    // Dense -> CSR -> dense round-trip is lossless and preserves shape/nnz.
    #[test]
    fn test_csr_from_dense() {
        let dense = arr2(&[[1.0, 0.0, 2.0], [0.0, 3.0, 0.0], [4.0, 0.0, 5.0]]);

        let csr = CsrMatrix::from_dense(&dense);

        assert_eq!(csr.shape(), (3, 3));
        assert_eq!(csr.nnz(), 5);

        let reconstructed = csr.to_dense();
        assert_eq!(reconstructed, dense);
    }

    // Dense -> sparse vector round-trip preserves nnz, logical size, and values.
    #[test]
    fn testsparse_vector() {
        let dense = arr1(&[0.0, 1.0, 0.0, 2.0, 0.0]);
        let sparse = SparseVector::from_dense(&dense);

        assert_eq!(sparse.nnz(), 2);
        assert_eq!(sparse.size(), 5);

        let reconstructed = sparse.to_dense();
        assert_eq!(reconstructed, dense);
    }

    // Sparse-sparse dot product only pays for overlapping non-zero positions.
    #[test]
    fn testsparse_dot_product() {
        let v1 = SparseVector::from_dense(&arr1(&[1.0, 0.0, 2.0, 0.0]));
        let v2 = SparseVector::from_dense(&arr1(&[0.0, 3.0, 2.0, 0.0]));

        let dot = v1.dotsparse(&v2).unwrap();
        assert_eq!(dot, 4.0); // 1*0 + 0*3 + 2*2 + 0*0 = 4
    }

    // DOK supports point set/get; missing entries read as 0; converts to CSR.
    #[test]
    fn test_dok_matrix() {
        let mut dok = DokMatrix::new(3, 3);

        dok.set(0, 0, 1.0).unwrap();
        dok.set(1, 1, 2.0).unwrap();
        dok.set(2, 0, 3.0).unwrap();

        assert_eq!(dok.get(0, 0), 1.0);
        assert_eq!(dok.get(0, 1), 0.0);
        assert_eq!(dok.nnz(), 3);

        let csr = dok.to_csr();
        assert_eq!(csr.nnz(), 3);
    }

    // Row-by-row builder produces a matrix with the accumulated shape and nnz.
    #[test]
    fn test_matrix_builder() {
        let mut builder = SparseMatrixBuilder::new(4);

        builder
            .add_row(SparseVector::from_dense(&arr1(&[1.0, 0.0, 2.0, 0.0])))
            .unwrap();
        builder
            .add_row(SparseVector::from_dense(&arr1(&[0.0, 3.0, 0.0, 4.0])))
            .unwrap();

        let matrix = builder.build();
        assert_eq!(matrix.shape(), (2, 4));
        assert_eq!(matrix.nnz(), 4);
    }

    // A ~1%-dense matrix stored as CSR should use well under 10% of the
    // dense representation's memory.
    #[test]
    fn test_memory_efficiency() {
        // Create a large sparse matrix (1000x1000 with 1% density)
        let n = 1000;
        let mut dense = Array2::zeros((n, n));

        // Fill the top-left 100x100 block with deterministic non-zero values
        // (10000 of 1_000_000 entries = 1% density)
        for i in 0..n / 10 {
            for j in 0..n / 10 {
                dense[[i, j]] = i as f64 + j as f64;
            }
        }

        let sparse = CsrMatrix::from_dense(&dense);

        // Calculate memory savings
        let dense_memory = n * n * std::mem::size_of::<f64>();
        let sparse_memory = sparse.memory_usage();

        println!("Dense memory: {dense_memory} bytes");
        println!("Sparse memory: {sparse_memory} bytes");
        println!(
            "Memory savings: {:.1}%",
            (1.0 - sparse_memory as f64 / dense_memory as f64) * 100.0
        );

        assert!(sparse_memory < dense_memory / 10); // Should use less than 10% of dense memory
    }

    // COO accepts arbitrary-order triplets and converts to an equivalent CSR.
    #[test]
    fn test_coo_matrix() {
        let mut coo = CooMatrix::new(3, 3);

        coo.push(0, 0, 1.0).unwrap();
        coo.push(1, 1, 2.0).unwrap();
        coo.push(2, 0, 3.0).unwrap();
        coo.push(1, 2, 4.0).unwrap();

        assert_eq!(coo.nnz(), 4);

        // Convert to CSR
        let csr = coo.to_csr();
        assert_eq!(csr.nnz(), 4);
        assert_eq!(csr.shape(), (3, 3));

        // Verify values
        let dense = csr.to_dense();
        assert_eq!(dense[[0, 0]], 1.0);
        assert_eq!(dense[[1, 1]], 2.0);
        assert_eq!(dense[[2, 0]], 3.0);
        assert_eq!(dense[[1, 2]], 4.0);
    }

    // CSC built from COO returns correct per-column sparse vectors.
    #[test]
    fn test_csc_matrix() {
        let mut coo = CooMatrix::new(3, 3);
        coo.push(0, 0, 1.0).unwrap();
        coo.push(1, 1, 2.0).unwrap();
        coo.push(2, 0, 3.0).unwrap();

        let csc = CscMatrix::from_coo(&coo);

        // Get column 0
        let col0 = csc.get_col(0).unwrap();
        assert_eq!(col0.nnz(), 2);
        let dense_col0 = col0.to_dense();
        assert_eq!(dense_col0[0], 1.0);
        assert_eq!(dense_col0[2], 3.0);

        // Get column 1
        let col1 = csc.get_col(1).unwrap();
        assert_eq!(col1.nnz(), 1);
        let dense_col1 = col1.to_dense();
        assert_eq!(dense_col1[1], 2.0);
    }

    // Block-sparse set/get works across block boundaries; unset cells read 0.
    #[test]
    fn test_blocksparse_matrix() {
        let mut blocksparse = BlockSparseMatrix::new(10, 10, 3);

        // Set some values
        blocksparse.set(0, 0, 1.0).unwrap();
        blocksparse.set(5, 5, 2.0).unwrap();
        blocksparse.set(9, 9, 3.0).unwrap();

        // Get values
        assert_eq!(blocksparse.get(0, 0), 1.0);
        assert_eq!(blocksparse.get(5, 5), 2.0);
        assert_eq!(blocksparse.get(9, 9), 3.0);
        assert_eq!(blocksparse.get(3, 3), 0.0);

        // Check memory usage
        let memory = blocksparse.memory_usage();
        assert!(memory > 0);
    }

    // 8-bit quantization round-trips within 5% relative error and shrinks memory.
    #[test]
    fn test_quantizedsparse_vector() {
        let mut sparse = SparseVector::new(10);
        sparse.indices = vec![1, 3, 7];
        // Use values that should quantize better - avoiding very small values
        sparse.values = vec![100.0, 50.0, 200.0];

        // Quantize
        let quantized = QuantizedSparseVector::fromsparse(&sparse);
        assert_eq!(quantized.indices.len(), 3);
        assert_eq!(quantized.values.len(), 3);

        // Dequantize
        let dequantized = quantized.tosparse();
        assert_eq!(dequantized.nnz(), 3);

        // Check values are approximately preserved
        let orig_values = sparse.values();
        let deq_values = dequantized.values();
        for i in 0..3 {
            let error = (orig_values[i] - deq_values[i]).abs();
            let relative_error = error / orig_values[i].abs();
            // Increase tolerance slightly for quantization
            assert!(relative_error < 0.05); // Less than 5% error
        }

        // Check memory savings
        let orig_memory = sparse.indices.len() * std::mem::size_of::<usize>()
            + sparse.values.len() * std::mem::size_of::<f64>();
        let quantized_memory = quantized.memory_usage();
        assert!(quantized_memory < orig_memory);
    }
}