scirs2_sparse/adaptive_memory_compression/compressed_data.rs

//! Compressed data structures for adaptive memory compression
//!
//! This module contains the data structures used to represent compressed
//! sparse matrices and their component blocks.
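//!
//! # Example
//!
//! A minimal usage sketch (kept as `ignore`, not a compiled doctest). The
//! `CompressionAlgorithm::Lz4` variant is a placeholder for whatever variants
//! `super::config` actually defines; `BlockId::from_u64` is the constructor
//! this module already relies on in `BlockHeader::deserialize`.
//!
//! ```ignore
//! let mut matrix: CompressedMatrix<f64> = CompressedMatrix::new(
//!     1,                         // matrix id
//!     1_000,                     // original rows
//!     1_000,                     // original cols
//!     CompressionAlgorithm::Lz4, // placeholder variant, see super::config
//!     256,                       // block size
//! );
//!
//! // Wrap some already-compressed bytes in a block and register it.
//! let block = CompressedBlock::new(
//!     BlockId::from_u64(0),
//!     BlockType::Data,
//!     vec![0u8; 128], // compressed payload
//!     1_024,          // original (uncompressed) size in bytes
//!     3,              // compression level
//! );
//! matrix.add_block(block);
//!
//! assert_eq!(matrix.block_count(), 1);
//! assert!(matrix.verify_integrity().is_ok());
//! ```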

use super::cache::BlockId;
use super::config::CompressionAlgorithm;
use super::stats::CompressionMetadata;
use std::marker::PhantomData;

/// Compressed sparse matrix representation
#[derive(Debug)]
pub struct CompressedMatrix<T> {
    pub matrixid: u64,
    pub original_rows: usize,
    pub original_cols: usize,
    pub compressed_blocks: Vec<CompressedBlock>,
    pub compression_algorithm: CompressionAlgorithm,
    pub block_size: usize,
    pub metadata: CompressionMetadata,
    _phantom: PhantomData<T>,
}

/// Compressed block of matrix data
#[derive(Debug, Clone)]
pub struct CompressedBlock {
    pub blockid: BlockId,
    pub block_type: BlockType,
    pub compressed_data: Vec<u8>,
    pub original_size: usize,
    pub compression_level: u8,
    pub checksum: Option<u64>,
    pub timestamp: u64,
}

/// Type of data stored in a block
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BlockType {
    IndPtr,
    Indices,
    #[default]
    Data,
    Combined,
    Metadata,
}

/// Block header for disk storage
#[derive(Debug)]
#[allow(dead_code)]
pub(crate) struct BlockHeader {
    pub blockid: BlockId,
    pub block_type: u8,
    pub original_size: usize,
    pub compressed_size: usize,
    pub compression_level: u8,
    pub checksum: u64,
    pub timestamp: u64,
}

/// Serialized block header (fixed size for disk storage)
#[repr(C)]
#[allow(dead_code)]
pub(crate) struct BlockHeaderSerialized {
    pub blockid: u64,
    pub block_type: u8,
    pub original_size: u64,
    pub compressed_size: u64,
    pub compression_level: u8,
    pub checksum: u64,
    pub timestamp: u64,
    pub padding: [u8; 3], // Explicit trailing padding bytes
}

impl<T> CompressedMatrix<T> {
    /// Create a new compressed matrix
    pub fn new(
        matrix_id: u64,
        original_rows: usize,
        original_cols: usize,
        compression_algorithm: CompressionAlgorithm,
        block_size: usize,
    ) -> Self {
        Self {
            matrixid: matrix_id,
            original_rows,
            original_cols,
            compressed_blocks: Vec::new(),
            compression_algorithm,
            block_size,
            metadata: CompressionMetadata::new(0, 0, 0.0),
            _phantom: PhantomData,
        }
    }

    /// Add a compressed block
    pub fn add_block(&mut self, block: CompressedBlock) {
        self.compressed_blocks.push(block);
        self.update_metadata();
    }

    /// Get block by ID
    pub fn get_block(&self, block_id: &BlockId) -> Option<&CompressedBlock> {
        self.compressed_blocks
            .iter()
            .find(|block| &block.blockid == block_id)
    }

    /// Get mutable block by ID
    pub fn get_block_mut(&mut self, block_id: &BlockId) -> Option<&mut CompressedBlock> {
        self.compressed_blocks
            .iter_mut()
            .find(|block| &block.blockid == block_id)
    }

    /// Remove a block
    pub fn remove_block(&mut self, block_id: &BlockId) -> Option<CompressedBlock> {
        if let Some(pos) = self
            .compressed_blocks
            .iter()
            .position(|block| &block.blockid == block_id)
        {
            let removed = self.compressed_blocks.remove(pos);
            self.update_metadata();
            Some(removed)
        } else {
            None
        }
    }

    /// Get blocks of specific type
    pub fn get_blocks_by_type(&self, block_type: BlockType) -> Vec<&CompressedBlock> {
        self.compressed_blocks
            .iter()
            .filter(|block| block.block_type == block_type)
            .collect()
    }

    /// Update metadata based on current blocks
    fn update_metadata(&mut self) {
        let total_original_size: usize = self
            .compressed_blocks
            .iter()
            .map(|block| block.original_size)
            .sum();

        let total_compressed_size: usize = self
            .compressed_blocks
            .iter()
            .map(|block| block.compressed_data.len())
            .sum();

        self.metadata = CompressionMetadata::new(
            total_original_size,
            total_compressed_size,
            0.0, // Compression time would be tracked separately
        );
    }

    /// Get total number of blocks
    pub fn block_count(&self) -> usize {
        self.compressed_blocks.len()
    }

    /// Get total compressed size
    pub fn compressed_size(&self) -> usize {
        self.compressed_blocks
            .iter()
            .map(|block| block.compressed_data.len())
            .sum()
    }

    /// Get total original size
    pub fn original_size(&self) -> usize {
        self.compressed_blocks
            .iter()
            .map(|block| block.original_size)
            .sum()
    }

    /// Get compression ratio
    pub fn compression_ratio(&self) -> f64 {
        self.metadata.compression_ratio
    }

    /// Check data integrity
    pub fn verify_integrity(&self) -> Result<(), String> {
        for block in &self.compressed_blocks {
            if let Some(expected_checksum) = block.checksum {
                let actual_checksum = Self::calculate_checksum(&block.compressed_data);
                if actual_checksum != expected_checksum {
                    return Err(format!("Checksum mismatch for block {}", block.blockid));
                }
            }
        }
        Ok(())
    }

    /// Calculate checksum for data
    fn calculate_checksum(data: &[u8]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        data.hash(&mut hasher);
        hasher.finish()
    }

    /// Get memory footprint
    pub fn memory_footprint(&self) -> usize {
        std::mem::size_of::<Self>()
            + self
                .compressed_blocks
                .iter()
                .map(|block| block.memory_footprint())
                .sum::<usize>()
    }

    /// Optimize block organization
    pub fn optimize_blocks(&mut self) {
        // Sort blocks in row-major order (by block row, then block column)
        self.compressed_blocks.sort_by(|a, b| {
            a.blockid
                .block_row
                .cmp(&b.blockid.block_row)
                .then_with(|| a.blockid.block_col.cmp(&b.blockid.block_col))
        });
    }

    /// Get blocks in row-major order
    pub fn get_blocks_row_major(&self) -> Vec<&CompressedBlock> {
        let mut blocks = self.compressed_blocks.iter().collect::<Vec<_>>();
        blocks.sort_by(|a, b| {
            a.blockid
                .block_row
                .cmp(&b.blockid.block_row)
                .then_with(|| a.blockid.block_col.cmp(&b.blockid.block_col))
        });
        blocks
    }

    /// Export metadata for persistence
    pub fn export_metadata(&self) -> MatrixMetadataExport {
        MatrixMetadataExport {
            matrix_id: self.matrixid,
            original_rows: self.original_rows,
            original_cols: self.original_cols,
            block_count: self.compressed_blocks.len(),
            compression_algorithm: self.compression_algorithm,
            block_size: self.block_size,
            total_original_size: self.original_size(),
            total_compressed_size: self.compressed_size(),
            compression_ratio: self.compression_ratio(),
            block_map: self
                .compressed_blocks
                .iter()
                .map(|block| (block.blockid.clone(), block.block_type))
                .collect(),
        }
    }
}

impl CompressedBlock {
    /// Create a new compressed block
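    ///
    /// A checksum of `compressed_data` is computed and stored automatically.
    /// A brief sketch (kept as `ignore` since `BlockId` lives in the sibling
    /// `cache` module):
    ///
    /// ```ignore
    /// let block = CompressedBlock::new(
    ///     BlockId::from_u64(42),
    ///     BlockType::Indices,
    ///     vec![1, 2, 3, 4], // compressed payload
    ///     64,               // original size in bytes
    ///     5,                // compression level
    /// );
    /// assert!(block.checksum.is_some());
    /// assert!(block.verify_integrity());
    /// ```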
    pub fn new(
        block_id: BlockId,
        block_type: BlockType,
        compressed_data: Vec<u8>,
        original_size: usize,
        compression_level: u8,
    ) -> Self {
        let checksum = Self::calculate_checksum(&compressed_data);

        Self {
            blockid: block_id,
            block_type,
            compressed_data,
            original_size,
            compression_level,
            checksum: Some(checksum),
            timestamp: Self::current_timestamp(),
        }
    }

    /// Create without checksum (for faster creation)
    pub fn new_unchecked(
        block_id: BlockId,
        block_type: BlockType,
        compressed_data: Vec<u8>,
        original_size: usize,
        compression_level: u8,
    ) -> Self {
        Self {
            blockid: block_id,
            block_type,
            compressed_data,
            original_size,
            compression_level,
            checksum: None,
            timestamp: Self::current_timestamp(),
        }
    }

    /// Get compression ratio for this block (compressed size / original size; lower is better)
    pub fn compression_ratio(&self) -> f64 {
        if self.original_size > 0 {
            self.compressed_data.len() as f64 / self.original_size as f64
        } else {
            1.0
        }
    }

    /// Get space savings in bytes
    pub fn space_savings(&self) -> usize {
        self.original_size
            .saturating_sub(self.compressed_data.len())
    }

    /// Verify block integrity
    pub fn verify_integrity(&self) -> bool {
        if let Some(expected_checksum) = self.checksum {
            let actual_checksum = Self::calculate_checksum(&self.compressed_data);
            actual_checksum == expected_checksum
        } else {
            true // No checksum to verify
        }
    }

    /// Update checksum
    pub fn update_checksum(&mut self) {
        self.checksum = Some(Self::calculate_checksum(&self.compressed_data));
    }

    /// Calculate checksum for data
    ///
    /// Note: `DefaultHasher` does not guarantee a stable hash algorithm across
    /// Rust releases, so checksums persisted to disk should be verified with
    /// the same toolchain that produced them.
    fn calculate_checksum(data: &[u8]) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        data.hash(&mut hasher);
        hasher.finish()
    }

    /// Get current timestamp
    fn current_timestamp() -> u64 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs()
    }

    /// Get memory footprint of this block
    pub fn memory_footprint(&self) -> usize {
        std::mem::size_of::<Self>() + self.compressed_data.len()
    }

    /// Get age in seconds
    pub fn age_seconds(&self) -> u64 {
        Self::current_timestamp().saturating_sub(self.timestamp)
    }

    /// Check if block is old
    pub fn is_old(&self, max_age_seconds: u64) -> bool {
        self.age_seconds() > max_age_seconds
    }

    /// Clone data without metadata
    pub fn clone_data(&self) -> Vec<u8> {
        self.compressed_data.clone()
    }

    /// Get size information
    pub fn size_info(&self) -> BlockSizeInfo {
        BlockSizeInfo {
            original_size: self.original_size,
            compressed_size: self.compressed_data.len(),
            compression_ratio: self.compression_ratio(),
            space_savings: self.space_savings(),
        }
    }
}

impl BlockType {
    /// Get string representation
    pub fn as_str(&self) -> &'static str {
        match self {
            BlockType::IndPtr => "indptr",
            BlockType::Indices => "indices",
            BlockType::Data => "data",
            BlockType::Combined => "combined",
            BlockType::Metadata => "metadata",
        }
    }

    /// Parse from string
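    ///
    /// The parse is case-insensitive and round-trips with [`BlockType::as_str`].
    /// A small illustration (kept as `ignore` because the enum's public path
    /// depends on how the parent module re-exports it):
    ///
    /// ```ignore
    /// assert_eq!(BlockType::from_str("Data"), Some(BlockType::Data));
    /// assert_eq!(BlockType::from_str(BlockType::IndPtr.as_str()), Some(BlockType::IndPtr));
    /// assert_eq!(BlockType::from_str("unknown"), None);
    /// ```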
    pub fn from_str(s: &str) -> Option<Self> {
        match s.to_lowercase().as_str() {
            "indptr" => Some(BlockType::IndPtr),
            "indices" => Some(BlockType::Indices),
            "data" => Some(BlockType::Data),
            "combined" => Some(BlockType::Combined),
            "metadata" => Some(BlockType::Metadata),
            _ => None,
        }
    }

    /// Get priority for compression (higher = more important)
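    ///
    /// A sketch of how a caller might use this to order blocks before
    /// compressing them (illustrative only; `matrix` is assumed to be a
    /// `CompressedMatrix` in scope, and this module does not itself schedule
    /// compression):
    ///
    /// ```ignore
    /// let mut blocks: Vec<&CompressedBlock> = matrix.compressed_blocks.iter().collect();
    /// // Highest-priority block types first.
    /// blocks.sort_by_key(|b| std::cmp::Reverse(b.block_type.compression_priority()));
    /// ```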
    pub fn compression_priority(&self) -> u8 {
        match self {
            BlockType::Data => 10,    // Highest priority - usually largest
            BlockType::Indices => 8,  // High priority - often compressible
            BlockType::Combined => 7, // High priority - mixed content
            BlockType::IndPtr => 5,   // Medium priority - usually small
            BlockType::Metadata => 3, // Lower priority - typically small
        }
    }

    /// Check if this block type typically benefits from compression
    pub fn benefits_from_compression(&self) -> bool {
        match self {
            BlockType::Data => true,      // Numerical data often compresses well
            BlockType::Indices => true,   // Sorted indices compress well
            BlockType::Combined => true,  // Mixed content varies
            BlockType::IndPtr => false,   // Usually small and regular
            BlockType::Metadata => false, // Usually small
        }
    }
}

impl BlockHeader {
    /// Create new block header
    pub fn new(
        block_id: BlockId,
        block_type: BlockType,
        original_size: usize,
        compressed_size: usize,
        compression_level: u8,
    ) -> Self {
        Self {
            blockid: block_id,
            block_type: block_type as u8,
            original_size,
            compressed_size,
            compression_level,
            checksum: 0, // Will be calculated separately
            timestamp: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap_or_default()
                .as_secs(),
        }
    }

    /// Serialize header to bytes
    #[allow(dead_code)]
    pub fn serialize(&self) -> Vec<u8> {
        let serialized = BlockHeaderSerialized {
            blockid: self.blockid.to_u64(),
            block_type: self.block_type,
            original_size: self.original_size as u64,
            compressed_size: self.compressed_size as u64,
            compression_level: self.compression_level,
            checksum: self.checksum,
            timestamp: self.timestamp,
            padding: [0; 3],
        };

        // Reinterpret the #[repr(C)] struct as raw bytes; the output always
        // has the fixed size reported by `BlockHeader::size()`.
        unsafe {
            let ptr = &serialized as *const BlockHeaderSerialized as *const u8;
            std::slice::from_raw_parts(ptr, std::mem::size_of::<BlockHeaderSerialized>()).to_vec()
        }
    }

    /// Deserialize header from bytes
    #[allow(dead_code)]
    pub fn deserialize(data: &[u8]) -> Result<Self, String> {
        if data.len() < std::mem::size_of::<BlockHeaderSerialized>() {
            return Err("Invalid header size".to_string());
        }

        // Convert from bytes. The input slice carries no alignment guarantee,
        // so use an unaligned read instead of dereferencing the pointer.
        let serialized: BlockHeaderSerialized = unsafe {
            let ptr = data.as_ptr() as *const BlockHeaderSerialized;
            ptr.read_unaligned()
        };

        Ok(BlockHeader {
            blockid: BlockId::from_u64(serialized.blockid),
            block_type: serialized.block_type,
            original_size: serialized.original_size as usize,
            compressed_size: serialized.compressed_size as usize,
            compression_level: serialized.compression_level,
            checksum: serialized.checksum,
            timestamp: serialized.timestamp,
        })
    }

    /// Get header size in bytes
    pub fn size() -> usize {
        std::mem::size_of::<BlockHeaderSerialized>()
    }
}

/// Matrix metadata for export/import
#[derive(Debug, Clone)]
pub struct MatrixMetadataExport {
    pub matrix_id: u64,
    pub original_rows: usize,
    pub original_cols: usize,
    pub block_count: usize,
    pub compression_algorithm: CompressionAlgorithm,
    pub block_size: usize,
    pub total_original_size: usize,
    pub total_compressed_size: usize,
    pub compression_ratio: f64,
    pub block_map: Vec<(BlockId, BlockType)>,
}

/// Block size information
#[derive(Debug, Clone)]
pub struct BlockSizeInfo {
    pub original_size: usize,
    pub compressed_size: usize,
    pub compression_ratio: f64,
    pub space_savings: usize,
}

impl std::fmt::Display for BlockType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
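
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal smoke tests sketching the intended use of this module's API.
    // The only assumption beyond this file is that `BlockId::from_u64`
    // (already used by `BlockHeader::deserialize` above) accepts an
    // arbitrary u64.

    #[test]
    fn compressed_block_checksum_round_trip() {
        let block = CompressedBlock::new(
            BlockId::from_u64(1),
            BlockType::Data,
            vec![1, 2, 3, 4], // "compressed" payload
            16,               // original size in bytes
            3,                // compression level
        );
        assert!(block.verify_integrity());
        assert_eq!(block.space_savings(), 12);
        assert!((block.compression_ratio() - 0.25).abs() < f64::EPSILON);
    }

    #[test]
    fn block_type_string_round_trip() {
        for ty in [
            BlockType::IndPtr,
            BlockType::Indices,
            BlockType::Data,
            BlockType::Combined,
            BlockType::Metadata,
        ] {
            assert_eq!(BlockType::from_str(ty.as_str()), Some(ty));
        }
    }

    #[test]
    fn block_header_serialization_round_trip() {
        let header = BlockHeader::new(BlockId::from_u64(7), BlockType::Indices, 100, 40, 5);
        let bytes = header.serialize();
        assert_eq!(bytes.len(), BlockHeader::size());

        let decoded = BlockHeader::deserialize(&bytes).expect("header should deserialize");
        assert_eq!(decoded.block_type, BlockType::Indices as u8);
        assert_eq!(decoded.original_size, 100);
        assert_eq!(decoded.compressed_size, 40);
        assert_eq!(decoded.compression_level, 5);
    }
}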