hexz_core/ops/
pack.rs

1//! High-level snapshot packing operations.
2//!
3//! This module implements the core business logic for creating Hexz snapshot files
4//! from raw disk and memory images. It orchestrates a multi-stage pipeline that
5//! transforms raw input data into compressed, indexed, and optionally encrypted
6//! snapshot files optimized for fast random access and deduplication.
7//!
8//! # Core Capabilities
9//!
10//! - **Dictionary Training**: Intelligent sampling and Zstd dictionary optimization
11//! - **Chunking Strategies**: Fixed-size blocks or content-defined (FastCDC) for better deduplication
12//! - **Compression**: LZ4 (fast) or Zstd (high-ratio) with optional dictionary support
13//! - **Encryption**: Per-block AES-256-GCM authenticated encryption
14//! - **Deduplication**: BLAKE3 based content deduplication (disabled for encrypted data)
15//! - **Hierarchical Indexing**: Two-level index structure for efficient random access
16//! - **Progress Reporting**: Optional callback interface for UI integration
17//!
18//! # Architecture
19//!
20//! The packing process follows a carefully orchestrated pipeline. Each stage is designed
21//! to be memory-efficient (streaming) and to minimize write amplification:
22//!
23//! ```text
24//! ┌─────────────────────────────────────────────────────────────────────┐
25//! │ Stage 1: Dictionary Training (Optional, Zstd only)                  │
26//! │                                                                      │
27//! │  Input File → Stratified Sampling → Entropy Filtering → Zstd Train │
28//! │                                                                      │
29//! │  - Samples ~4000 blocks evenly distributed across input             │
30//! │  - Filters out zero blocks and high-entropy data (>6.0 bits/byte)   │
31//! │  - Produces dictionary (max 110 KiB) optimized for dataset          │
32//! │  - Training time: 2-5 seconds for typical VM images                 │
33//! └─────────────────────────────────────────────────────────────────────┘
34//!                                  ↓
35//! ┌─────────────────────────────────────────────────────────────────────┐
36//! │ Stage 2: Stream Processing (Per Input: Disk, Memory)                │
37//! │                                                                      │
38//! │  Raw Input → Chunking → Compression → Encryption → Dedup → Write   │
39//! │                                                                      │
40//! │  Chunking:                                                           │
41//! │   - Fixed-size: Divide into equal blocks (default 64 KiB)           │
42//! │   - FastCDC: Content-defined boundaries for better deduplication    │
43//! │                                                                      │
44//! │  Zero Block Optimization:                                            │
45//! │   - Detect all-zero chunks (common in VM images)                    │
46//! │   - Store as metadata only (offset=0, length=0)                     │
47//! │   - Saves significant space for sparse images                       │
48//! │                                                                      │
49//! │  Deduplication (Unencrypted only):                                  │
50//! │   - Compute BLAKE3 hash of compressed data                           │
51//! │   - Check hash table for existing block                             │
52//! │   - Reuse offset if duplicate found                                 │
53//! │   - Note: Disabled for encrypted data (unique nonces prevent dedup) │
54//! │                                                                      │
55//! │  Index Page Building:                                                │
56//! │   - Accumulate BlockInfo metadata (offset, length, checksum)        │
57//! │   - Flush page when reaching 4096 entries (~256 MB logical data)    │
58//! │   - Write serialized page to output, record PageEntry               │
59//! └─────────────────────────────────────────────────────────────────────┘
60//!                                  ↓
61//! ┌─────────────────────────────────────────────────────────────────────┐
62//! │ Stage 3: Index Finalization                                          │
63//! │                                                                      │
64//! │  MasterIndex (disk_pages[], memory_pages[], sizes) → Serialize      │
65//! │                                                                      │
66//! │  - Collect all PageEntry records from both streams                  │
67//! │  - Write master index at end of file                                │
68//! │  - Record index offset in header                                    │
69//! └─────────────────────────────────────────────────────────────────────┘
70//!                                  ↓
71//! ┌─────────────────────────────────────────────────────────────────────┐
72//! │ Stage 4: Header Writing                                              │
73//! │                                                                      │
74//! │  - Seek to file start (reserved 512 bytes)                          │
75//! │  - Write Header with format metadata                          │
76//! │  - Includes: compression type, encryption params, index offset      │
77//! │  - Flush to ensure atomicity                                        │
78//! └─────────────────────────────────────────────────────────────────────┘
79//! ```
80//!
81//! # Optimization Strategies
82//!
83//! ## Dictionary Training Algorithm
84//!
85//! The dictionary training process improves compression ratios by 10-30% for
86//! structured data (file systems, databases) by building a Zstd shared dictionary:
87//!
88//! 1. **Stratified Sampling**: Sample blocks evenly across input to capture diversity
89//!    - Step size = file_size / target_samples (typically 4000 samples)
90//!    - Ensures coverage of different file system regions
91//!
92//! 2. **Quality Filtering**: Exclude unsuitable blocks
93//!    - Skip all-zero blocks (no compressible patterns)
94//!    - Compute Shannon entropy for each block
95//!    - Reject blocks with entropy > 6.0 bits/byte (likely encrypted/random)
96//!
97//! 3. **Training**: Feed filtered samples to Zstd dictionary builder
98//!    - Target dictionary size: 110 KiB (fits in L2 cache)
99//!    - Uses Zstd's COVER algorithm to extract common patterns
100//!
101//! ## Deduplication Mechanism
102//!
103//! Content-based deduplication eliminates redundant blocks:
104//!
105//! - **Hash Table**: Maps BLAKE3 hash → physical offset for each unique compressed block
106//! - **Collision Handling**: BLAKE3 collisions are astronomically unlikely (2^128 blocks)
107//! - **Memory Usage**: ~48 bytes per unique block (32-byte hash + 8-byte offset + HashMap overhead)
108//! - **Write Behavior**: Only write each unique block once; reuse offset for duplicates
109//! - **Encryption Interaction**: Disabled when encrypting (each block gets unique nonce/ciphertext)
110//!
111//! ## Index Page Management
112//!
113//! The two-level index hierarchy balances random access performance and metadata overhead:
114//!
115//! - **Page Size**: 4096 entries per page
116//!   - With 64 KiB blocks: Each page covers ~256 MB of logical data
117//!   - Serialized page size: ~64 KiB (fits in L2 cache)
118//!
119//! - **Flushing Strategy**: Eager flush when page fills
120//!   - Prevents memory growth during large packs
121//!   - Enables streaming operation (constant memory)
122//!
123//! - **Master Index**: Array of PageEntry records
124//!   - Binary search for O(log N) page lookup
125//!   - Typical overhead: 1 KiB per GB of data
126//!
127//! # Memory Usage Patterns
128//!
129//! The packing operation is designed for constant memory usage regardless of input size:
130//!
131//! - **Chunking Buffer**: 1 block (64 KiB default)
132//! - **Compression Output**: ~1.5× block size (worst case: incompressible data)
133//! - **Current Index Page**: Up to 4096 × 20 bytes = 80 KiB
134//! - **Deduplication Map**: ~48 bytes × unique_blocks
135//!   - Example: 10 GB image with 50% dedup = ~80 MB HashMap
136//! - **Dictionary**: 110 KiB (if trained)
137//!
138//! Total typical memory: 100-200 MB for dedup hash table + ~1 MB working set.
139//!
140//! # Error Recovery
141//!
142//! The packing operation is not atomic. On failure:
143//!
144//! - **Partial File**: Output file is left in incomplete state
145//! - **Header Invalid**: Header is written last, so partial packs have zeroed header
146//! - **Detection**: Readers validate magic bytes and header checksum
147//! - **Recovery**: None; must delete partial file and retry pack operation
148//!
149//! Future enhancement: Two-phase commit with temporary file + atomic rename.
150//!
151//! # Usage Contexts
152//!
153//! This module is designed to be called from multiple contexts:
154//!
155//! - **CLI Commands**: `hexz data pack` (with terminal progress bars)
156//! - **Python Bindings**: `hexz.pack()` (with optional callbacks)
157//! - **Rust Applications**: Direct API usage for embedded scenarios
158//!
159//! By keeping pack operations separate from UI/CLI code, we avoid pulling in
160//! heavy dependencies (`clap`, `indicatif`) into library contexts.
161//!
162//! # Examples
163//!
164//! ## Basic Packing (LZ4, No Encryption)
165//!
166//! ```no_run
167//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
168//! use std::path::PathBuf;
169//!
170//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
171//! let config = PackConfig {
172//!     disk: Some(PathBuf::from("disk.raw")),
173//!     memory: None,
174//!     output: PathBuf::from("snapshot.hxz"),
175//!     compression: "lz4".to_string(),
176//!     ..Default::default()
177//! };
178//!
179//! pack_snapshot::<fn(u64, u64)>(config, None)?;
180//! # Ok(())
181//! # }
182//! ```
183//!
184//! ## Advanced Packing (Zstd with Dictionary, CDC, Encryption)
185//!
186//! ```no_run
187//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
188//! use std::path::PathBuf;
189//!
190//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
191//! let config = PackConfig {
192//!     disk: Some(PathBuf::from("ubuntu.qcow2")),
193//!     output: PathBuf::from("ubuntu.hxz"),
194//!     compression: "zstd".to_string(),
195//!     train_dict: true,         // Train dictionary for better ratio
196//!     cdc_enabled: true,        // Content-defined chunking
197//!     encrypt: true,
198//!     password: Some("secure_passphrase".to_string()),
199//!     min_chunk: 16384,         // 16 KiB minimum chunk
200//!     avg_chunk: 65536,         // 64 KiB average chunk
201//!     max_chunk: 262144,        // 256 KiB maximum chunk
202//!     ..Default::default()
203//! };
204//!
205//! pack_snapshot::<fn(u64, u64)>(config, None)?;
206//! # Ok(())
207//! # }
208//! ```
209//!
210//! ## Progress Reporting
211//!
212//! ```no_run
213//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
214//! use std::path::PathBuf;
215//!
216//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
217//! let config = PackConfig {
218//!     disk: Some(PathBuf::from("disk.raw")),
219//!     output: PathBuf::from("snapshot.hxz"),
220//!     ..Default::default()
221//! };
222//!
223//! // Callback receives (current_logical_pos, total_size)
224//! pack_snapshot(config, Some(|pos, total| {
225//!     let pct = (pos as f64 / total as f64) * 100.0;
226//!     println!("Packing: {:.1}%", pct);
227//! }))?;
228//! # Ok(())
229//! # }
230//! ```
231//!
232//! # Performance Characteristics
233//!
234//! ## Throughput (Single-Threaded, i7-14700K)
235//!
236//! Validated benchmarks (see `docs/project-docs/BENCHMARKS.md` for details):
237//!
238//! - **LZ4 Compression**: 22 GB/s (minimal CPU overhead)
239//! - **LZ4 Decompression**: 31 GB/s
240//! - **Zstd Level 3 Compression**: 8.7 GB/s
241//! - **Zstd Level 3 Decompression**: 12.9 GB/s
242//! - **BLAKE3 Hashing**: 5.3 GB/s (2.2× faster than SHA-256)
243//! - **SHA-256 Hashing**: 2.5 GB/s
244//! - **FastCDC Chunking**: 2.7 GB/s (gear-based rolling hash)
245//! - **AES-256-GCM Encryption**: 2.1 GB/s (hardware AES-NI acceleration)
246//! - **Pack Throughput (LZ4, no CDC)**: 4.9 GB/s (64KB blocks)
247//! - **Pack Throughput (LZ4 + CDC)**: 1.9 GB/s (CDC adds 2.6× overhead)
248//! - **Pack Throughput (Zstd-3)**: 1.6 GB/s
249//! - **Block Size Impact**: 2.3 GB/s (4KB) → 4.7 GB/s (64KB) → 5.1 GB/s (1MB)
250//!
251//! Typical bottleneck: CDC chunking (when enabled) or compression CPU time. SSD I/O rarely limits.
252//!
253//! Run benchmarks: `cargo bench --bench compression`, `cargo bench --bench hashing`, `cargo bench --bench cdc_chunking`, `cargo bench --bench encryption`, `cargo bench --bench write_throughput`, and `cargo bench --bench block_size_tradeoffs`
254//!
255//! ## Compression Ratios (Typical VM Images)
256//!
257//! - **LZ4**: 2-3× (fast but lower ratio)
258//! - **Zstd Level 3**: 3-5× (good balance)
259//! - **Zstd + Dictionary**: 4-7× (+30% improvement from dictionary)
260//! - **CDC Deduplication**: Not validated - need benchmark comparing CDC vs fixed-size chunking
261//!
262//! ## Time Estimates (64 GB VM Image, Single Thread)
263//!
264//! - **LZ4, Fixed Blocks**: ~30-45 seconds
265//! - **Zstd, Fixed Blocks**: ~2-3 minutes
266//! - **Zstd + Dictionary + CDC**: ~3-5 minutes (includes 2-5s training time)
267//!
268//! # Atomicity and Crash Safety
269//!
270//! **WARNING**: Pack operations are NOT atomic. If interrupted:
271//!
272//! - Output file is left in a partially written state
273//! - The header (written last) will be all zeros
274//! - Readers will reject the file due to invalid magic bytes
275//! - Manual cleanup is required (delete partial file)
276//!
277//! For production use cases requiring atomicity, write to a temporary file and
278//! perform an atomic rename after successful completion.
279
280use hexz_common::constants::{DICT_TRAINING_SIZE, ENTROPY_THRESHOLD};
281use hexz_common::crypto::KeyDerivationParams;
282use hexz_common::{Error, Result};
283use std::fs::File;
284use std::io::{Read, Seek, SeekFrom};
285use std::path::{Path, PathBuf};
286
287use crate::algo::compression::{create_compressor_from_str, zstd::ZstdCompressor};
288use crate::algo::dedup::cdc::StreamChunker;
289use crate::algo::dedup::dcam::DedupeParams;
290use crate::algo::encryption::{Encryptor, aes_gcm::AesGcmEncryptor};
291use crate::ops::parallel_pack::{CompressedChunk, RawChunk};
292use crate::ops::snapshot_writer::SnapshotWriter;
293
/// Configuration parameters for snapshot packing.
///
/// Bundles every setting consumed by [`pack_snapshot`]. All fields are public so the
/// struct can be built directly from CLI arguments or programmatic callers, typically
/// with struct-update syntax on top of [`Default::default`].
///
/// The `Debug` implementation is written by hand so that the encryption password is
/// never rendered in plaintext (only whether one is present).
///
/// # Examples
///
/// ```
/// use hexz_core::ops::pack::PackConfig;
/// use std::path::PathBuf;
///
/// // Basic configuration with defaults
/// let config = PackConfig {
///     disk: Some(PathBuf::from("disk.img")),
///     output: PathBuf::from("snapshot.hxz"),
///     ..Default::default()
/// };
///
/// // Advanced configuration with CDC and encryption
/// let advanced = PackConfig {
///     disk: Some(PathBuf::from("disk.img")),
///     output: PathBuf::from("snapshot.hxz"),
///     compression: "zstd".to_string(),
///     encrypt: true,
///     password: Some("secret".to_string()),
///     cdc_enabled: true,
///     min_chunk: 16384,
///     avg_chunk: 65536,
///     max_chunk: 131072,
///     ..Default::default()
/// };
/// ```
#[derive(Clone)]
pub struct PackConfig {
    /// Path to the disk image (optional).
    pub disk: Option<PathBuf>,
    /// Path to the memory image (optional).
    pub memory: Option<PathBuf>,
    /// Output snapshot file path.
    pub output: PathBuf,
    /// Compression algorithm ("lz4" or "zstd").
    pub compression: String,
    /// Enable encryption.
    pub encrypt: bool,
    /// Encryption password (required if encrypt=true).
    pub password: Option<String>,
    /// Train a compression dictionary (zstd only).
    pub train_dict: bool,
    /// Block size in bytes.
    pub block_size: u32,
    /// Enable content-defined chunking (CDC).
    pub cdc_enabled: bool,
    /// Minimum chunk size for CDC.
    pub min_chunk: u32,
    /// Average chunk size for CDC.
    pub avg_chunk: u32,
    /// Maximum chunk size for CDC.
    pub max_chunk: u32,
    /// Enable parallel compression (use multiple CPU cores).
    pub parallel: bool,
    /// Number of worker threads (0 = auto-detect).
    pub num_workers: usize,
    /// Show progress bar (if no callback provided).
    pub show_progress: bool,
}

impl std::fmt::Debug for PackConfig {
    /// Formats every field like the derived implementation would, except that the
    /// password value is replaced by a placeholder so it cannot leak into logs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keep the Some/None distinction visible; hide only the secret itself.
        let password = self.password.as_ref().map(|_| "<redacted>");
        f.debug_struct("PackConfig")
            .field("disk", &self.disk)
            .field("memory", &self.memory)
            .field("output", &self.output)
            .field("compression", &self.compression)
            .field("encrypt", &self.encrypt)
            .field("password", &password)
            .field("train_dict", &self.train_dict)
            .field("block_size", &self.block_size)
            .field("cdc_enabled", &self.cdc_enabled)
            .field("min_chunk", &self.min_chunk)
            .field("avg_chunk", &self.avg_chunk)
            .field("max_chunk", &self.max_chunk)
            .field("parallel", &self.parallel)
            .field("num_workers", &self.num_workers)
            .field("show_progress", &self.show_progress)
            .finish()
    }
}

impl Default for PackConfig {
    /// Returns the baseline configuration: LZ4, 64 KiB fixed blocks, no encryption,
    /// no dictionary, parallel compression with auto-detected worker count, and the
    /// built-in progress bar enabled. No inputs are set; callers must supply at
    /// least `disk` or `memory`.
    fn default() -> Self {
        Self {
            disk: None,
            memory: None,
            output: PathBuf::from("output.hxz"),
            compression: "lz4".to_string(),
            encrypt: false,
            password: None,
            train_dict: false,
            block_size: 65536,
            cdc_enabled: false,
            min_chunk: 16384,
            avg_chunk: 65536,
            max_chunk: 131072,
            parallel: true,      // Enable by default for performance
            num_workers: 0,      // Auto-detect CPU cores
            show_progress: true, // Show progress by default
        }
    }
}
381
/// Computes the Shannon entropy of `data`, in bits per byte.
///
/// The result describes how evenly the 256 possible byte values are distributed:
/// - **0.0**: every byte is the same value (or the slice is empty)
/// - **8.0**: all 256 byte values occur equally often
///
/// # Formula
///
/// ```text
/// H(X) = -Σ p(x) * log2(p(x))
/// ```
///
/// `p(x)` is the relative frequency of byte value `x` within `data`; byte values
/// that never occur contribute nothing to the sum.
///
/// # Parameters
///
/// - `data`: Byte slice to analyze
///
/// # Returns
///
/// Entropy in the range `0.0..=8.0`. An empty slice yields `0.0`.
///
/// # Examples
///
/// ```
/// # use hexz_core::ops::pack::calculate_entropy;
/// // Homogeneous data (low entropy)
/// let zeros = vec![0u8; 1024];
/// let entropy = calculate_entropy(&zeros);
/// assert_eq!(entropy, 0.0);
///
/// // Uniformly distributed byte values (high entropy)
/// let random: Vec<u8> = (0..=255).cycle().take(1024).collect();
/// let entropy = calculate_entropy(&random);
/// assert!(entropy > 7.0);
/// ```
pub fn calculate_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    data.iter()
        .for_each(|&value| histogram[usize::from(value)] += 1);

    let total = data.len() as f64;

    // Accumulate -p * log2(p) over the byte values that actually occur,
    // visiting them in ascending byte order.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |acc, &occurrences| {
            let p = f64::from(occurrences) / total;
            acc - p * p.log2()
        })
}
446
/// Fixed-size block chunker with buffer reuse.
///
/// Splits the bytes produced by `reader` into blocks of `block_size` bytes; only the
/// final block may be shorter. Simpler than content-defined chunking because block
/// boundaries depend solely on position.
///
/// One internal buffer is kept for the lifetime of the chunker and refilled on each
/// call to `next_chunk()`. The [`Iterator`] implementation copies each block into a
/// fresh `Vec<u8>` so that items can outlive the next call.
pub struct FixedChunker<R> {
    /// Source of raw bytes.
    reader: R,
    /// Target length of every block except possibly the last.
    block_size: usize,
    /// Scratch buffer reused across blocks.
    buffer: Vec<u8>,
    /// Set once a read attempt produced no bytes at all; later calls return `None`.
    done: bool,
}

impl<R: Read> FixedChunker<R> {
    /// Creates a new fixed-size chunker reading from `reader`.
    ///
    /// The internal buffer is allocated up front with `block_size` bytes. With a
    /// `block_size` of 0 the chunker yields no blocks and never reads from `reader`.
    pub fn new(reader: R, block_size: usize) -> Self {
        Self {
            reader,
            block_size,
            buffer: vec![0u8; block_size],
            done: false,
        }
    }

    /// Returns the next chunk as a borrowed slice, or `None` at EOF.
    ///
    /// Keeps reading until the block is full or the reader reports end of input
    /// (`Ok(0)`), so short reads from the underlying reader do not produce short
    /// blocks. Reads failing with [`std::io::ErrorKind::Interrupted`] are retried,
    /// as the `Read` contract recommends, instead of aborting and dropping the
    /// partially filled block.
    ///
    /// # Errors
    ///
    /// Any other I/O error from the reader is returned as-is; bytes already read
    /// into the current block are not returned in that case.
    fn next_chunk(&mut self) -> std::io::Result<Option<&[u8]>> {
        if self.done {
            return Ok(None);
        }

        // Restore full length after a previous short (truncated) final block.
        // Capacity is retained by `truncate`, so this does not reallocate.
        self.buffer.resize(self.block_size, 0);

        let mut filled = 0;
        while filled < self.block_size {
            match self.reader.read(&mut self.buffer[filled..]) {
                Ok(0) => break,
                Ok(n) => filled += n,
                // Transient: the read was interrupted before any data moved.
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        }

        if filled == 0 {
            self.done = true;
            return Ok(None);
        }

        self.buffer.truncate(filled);
        Ok(Some(&self.buffer))
    }
}

impl<R: Read> Iterator for FixedChunker<R> {
    type Item = std::io::Result<Vec<u8>>;

    /// Yields the next block as an owned `Vec<u8>`, an I/O error, or `None` at EOF.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_chunk() {
            Ok(Some(block)) => Some(Ok(block.to_vec())),
            Ok(None) => None,
            Err(e) => Some(Err(e)),
        }
    }
}
509
510/// Packs a snapshot file from disk and/or memory images.
511///
512/// This is the main entry point for creating Hexz snapshot files. It orchestrates
513/// the complete packing pipeline: dictionary training, stream processing, index
514/// building, and header finalization.
515///
516/// # Workflow
517///
518/// 1. **Validation**: Ensure at least one input (disk or memory) is provided
519/// 2. **File Creation**: Create output file, reserve 512 bytes for header
520/// 3. **Dictionary Training**: If requested (Zstd only), train dictionary from input samples
521/// 4. **Dictionary Writing**: If trained, write dictionary immediately after header
522/// 5. **Compressor Initialization**: Create LZ4 or Zstd compressor (with optional dictionary)
523/// 6. **Encryptor Initialization**: If requested, derive key from password using PBKDF2
524/// 7. **Stream Processing**: Process disk stream (if provided), then memory stream (if provided)
525///    - Each stream independently chunks, compresses, encrypts, deduplicates, and indexes
526/// 8. **Master Index Writing**: Serialize master index (all PageEntry records) to end of file
527/// 9. **Header Writing**: Seek to start, write complete header with metadata and offsets
528/// 10. **Flush**: Ensure all data is written to disk
529///
530/// # Parameters
531///
532/// - `config`: Packing configuration parameters (see [`PackConfig`])
533/// - `progress_callback`: Optional callback for progress reporting
534///   - Called frequently during stream processing (~once per 64 KiB)
535///   - Signature: `Fn(logical_pos: u64, total_size: u64)`
536///   - Example: `|pos, total| println!("Progress: {:.1}%", (pos as f64 / total as f64) * 100.0)`
537///
538/// # Returns
539///
540/// - `Ok(())`: Snapshot packed successfully
541/// - `Err(Error::Io)`: I/O error (file access, disk full, permission denied)
542/// - `Err(Error::Compression)`: Compression error (unlikely, usually indicates invalid state)
543/// - `Err(Error::Encryption)`: Encryption error (invalid password format, crypto failure)
544///
545/// # Errors
546///
547/// This function can fail for several reasons:
548///
549/// ## I/O Errors
550///
551/// - **Input file not found**: `config.disk` or `config.memory` path doesn't exist
552/// - **Permission denied**: Cannot read input or write output
553/// - **Disk full**: Insufficient space for output file
554/// - **Output exists**: May overwrite existing file without warning
555///
556/// ## Configuration Errors
557///
558/// - **No inputs**: Neither `disk` nor `memory` is provided
559/// - **Missing password**: `encrypt = true` but `password = None`
560/// - **Invalid block size**: Block size too small (<1 KiB) or too large (>16 MiB)
561/// - **Invalid CDC params**: `min_chunk >= avg_chunk >= max_chunk` constraint violated
562///
563/// ## Compression/Encryption Errors
564///
565/// - **Dictionary training failure**: Zstd training fails (rare, usually on corrupted input)
566/// - **Compression failure**: Compressor returns error (rare, usually indicates bug)
567/// - **Encryption failure**: Key derivation or cipher initialization fails
568///
569/// # Examples
570///
571/// ## Basic Usage
572///
573/// ```no_run
574/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
575/// use std::path::PathBuf;
576///
577/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
578/// let config = PackConfig {
579///     disk: Some(PathBuf::from("disk.raw")),
580///     output: PathBuf::from("snapshot.hxz"),
581///     ..Default::default()
582/// };
583///
584/// pack_snapshot::<fn(u64, u64)>(config, None)?;
585/// # Ok(())
586/// # }
587/// ```
588///
589/// ## With Progress Reporting
590///
591/// ```no_run
592/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
593/// use std::path::PathBuf;
594///
595/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
596/// let config = PackConfig {
597///     disk: Some(PathBuf::from("ubuntu.qcow2")),
598///     output: PathBuf::from("ubuntu.hxz"),
599///     compression: "zstd".to_string(),
600///     train_dict: true,
601///     ..Default::default()
602/// };
603///
604/// pack_snapshot(config, Some(|pos, total| {
605///     eprint!("\rPacking: {:.1}%", (pos as f64 / total as f64) * 100.0);
606/// }))?;
607/// eprintln!("\nDone!");
608/// # Ok(())
609/// # }
610/// ```
611///
612/// ## Encrypted Snapshot
613///
614/// ```no_run
615/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
616/// use std::path::PathBuf;
617///
618/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
619/// let config = PackConfig {
620///     disk: Some(PathBuf::from("sensitive.raw")),
621///     output: PathBuf::from("sensitive.hxz"),
622///     encrypt: true,
623///     password: Some("strong_passphrase".to_string()),
624///     ..Default::default()
625/// };
626///
627/// pack_snapshot::<fn(u64, u64)>(config, None)?;
628/// println!("Encrypted snapshot created");
629/// # Ok(())
630/// # }
631/// ```
632///
633/// ## Content-Defined Chunking for Deduplication
634///
635/// ```no_run
636/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
637/// use std::path::PathBuf;
638///
639/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
640/// let config = PackConfig {
641///     disk: Some(PathBuf::from("incremental-backup.raw")),
642///     output: PathBuf::from("backup.hxz"),
643///     cdc_enabled: true,
644///     min_chunk: 16384,   // 16 KiB
645///     avg_chunk: 65536,   // 64 KiB
646///     max_chunk: 262144,  // 256 KiB
647///     ..Default::default()
648/// };
649///
650/// pack_snapshot::<fn(u64, u64)>(config, None)?;
651/// # Ok(())
652/// # }
653/// ```
654///
655/// # Performance
656///
657/// See module-level documentation for detailed performance characteristics.
658///
659/// Typical throughput for a 64 GB VM image on modern hardware (Intel i7, NVMe SSD):
660///
661/// - **LZ4, no encryption**: ~2 GB/s (~30 seconds total)
662/// - **Zstd level 3, no encryption**: ~500 MB/s (~2 minutes total)
663/// - **Zstd + dictionary + CDC**: ~400 MB/s (~3 minutes including training)
664///
665/// # Atomicity
666///
667/// This operation is NOT atomic. On failure, the output file will be left in a
668/// partially written state. The file header is written last, so incomplete files
669/// will have an all-zero header and will be rejected by readers.
670///
671/// For atomic pack operations, write to a temporary file and perform an atomic
672/// rename after success:
673///
674/// ```no_run
675/// # use hexz_core::ops::pack::{pack_snapshot, PackConfig};
676/// # use std::path::PathBuf;
677/// # use std::fs;
678/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
679/// let mut config = PackConfig {
680///     disk: Some(PathBuf::from("disk.raw")),
681///     output: PathBuf::from("snapshot.st.tmp"),
682///     ..Default::default()
683/// };
684///
685/// pack_snapshot::<fn(u64, u64)>(config.clone(), None)?;
686/// fs::rename("snapshot.st.tmp", "snapshot.hxz")?;
687/// # Ok(())
688/// # }
689/// ```
690///
691/// # Thread Safety
692///
693/// This function is not thread-safe with respect to the output file. Do not call
694/// `pack_snapshot` concurrently with the same output path. Concurrent packing to
695/// different output files is safe.
696///
697/// The progress callback must be `Send + Sync` if you want to call this function
698/// from a non-main thread.
699pub fn pack_snapshot<F>(config: PackConfig, progress_callback: Option<F>) -> Result<()>
700where
701    F: Fn(u64, u64) + Send + Sync,
702{
703    // Validate inputs
704    if config.disk.is_none() && config.memory.is_none() {
705        return Err(Error::Io(std::io::Error::new(
706            std::io::ErrorKind::InvalidInput,
707            "At least one input (disk or memory) must be provided",
708        )));
709    }
710
711    // Train compression dictionary if requested
712    let dictionary = if config.compression == "zstd" && config.train_dict {
713        Some(train_dictionary(
714            config
715                .disk
716                .as_ref()
717                .or(config.memory.as_ref())
718                .ok_or_else(|| {
719                    Error::Io(std::io::Error::new(
720                        std::io::ErrorKind::InvalidInput,
721                        "No input file available for dictionary training",
722                    ))
723                })?,
724            config.block_size,
725        )?)
726    } else {
727        None
728    };
729
730    // Initialize compressor
731    let (compressor, compression_type) =
732        create_compressor_from_str(&config.compression, None, dictionary.clone())?;
733
734    // Initialize encryptor if requested
735    let (encryptor, enc_params): (Option<Box<dyn Encryptor>>, _) = if config.encrypt {
736        let password = config.password.clone().ok_or_else(|| {
737            Error::Io(std::io::Error::new(
738                std::io::ErrorKind::InvalidInput,
739                "Password required for encryption",
740            ))
741        })?;
742        let params = KeyDerivationParams::default();
743        let enc = AesGcmEncryptor::new(password.as_bytes(), &params.salt, params.iterations)?;
744        (Some(Box::new(enc) as Box<dyn Encryptor>), Some(params))
745    } else {
746        (None, None)
747    };
748
749    // Build the snapshot writer with optional encryption
750    let mut builder = SnapshotWriter::builder(&config.output, compressor, compression_type)
751        .block_size(config.block_size)
752        .variable_blocks(config.cdc_enabled);
753
754    if let (Some(enc), Some(params)) = (encryptor, enc_params) {
755        builder = builder.encryption(enc, params);
756    }
757
758    let mut writer = builder.build()?;
759
760    // Write dictionary to file
761    if let Some(d) = &dictionary {
762        writer.write_dictionary(d)?;
763    }
764
765    // Set up progress bar if show_progress is enabled and no user callback given
766    let disk_size = config
767        .disk
768        .as_ref()
769        .and_then(|p| std::fs::metadata(p).ok())
770        .map(|m| m.len())
771        .unwrap_or(0);
772    let memory_size = config
773        .memory
774        .as_ref()
775        .and_then(|p| std::fs::metadata(p).ok())
776        .map(|m| m.len())
777        .unwrap_or(0);
778    let total_size = disk_size + memory_size;
779
780    let progress_bar = if config.show_progress && progress_callback.is_none() && total_size > 0 {
781        Some(crate::ops::progress::PackProgress::new(
782            total_size, "Packing",
783        ))
784    } else {
785        None
786    };
787
788    // Process disk stream
789    if let Some(ref path) = config.disk {
790        let cb = |pos: u64, total: u64| {
791            if let Some(ref pb) = progress_bar {
792                pb.set_position(pos);
793            }
794            if let Some(ref cb) = progress_callback {
795                cb(pos, total);
796            }
797        };
798        process_stream(path.clone(), true, &mut writer, &config, Some(&cb))?;
799    }
800
801    // Process memory stream
802    if let Some(ref path) = config.memory {
803        let cb = |pos: u64, total: u64| {
804            if let Some(ref pb) = progress_bar {
805                pb.set_position(disk_size + pos);
806            }
807            if let Some(ref cb) = progress_callback {
808                cb(pos, total);
809            }
810        };
811        process_stream(path.clone(), false, &mut writer, &config, Some(&cb))?;
812    }
813
814    if let Some(ref pb) = progress_bar {
815        pb.finish();
816    }
817
818    writer.finalize(None, None)?;
819
820    Ok(())
821}
822
823/// Trains a Zstd compression dictionary from stratified samples.
824///
825/// Dictionary training analyzes a representative sample of input blocks to build
826/// a shared dictionary that improves compression ratios for structured data
827/// (file systems, databases, logs) by capturing common patterns.
828///
829/// # Algorithm
830///
831/// 1. **Stratified Sampling**: Sample blocks evenly across the file
832///    - Compute step size: `file_size / target_samples`
833///    - Read one block at each sample point
834///    - Ensures coverage of different regions (boot sector, metadata, data)
835///
836/// 2. **Quality Filtering**: Exclude unsuitable blocks
837///    - Skip all-zero blocks (no compressible patterns)
838///    - Compute Shannon entropy (0-8 bits per byte)
839///    - Reject blocks with entropy > `ENTROPY_THRESHOLD` (6.0)
840///    - Rationale: High-entropy data (encrypted, random) doesn't benefit from dictionaries
841///
842/// 3. **Dictionary Training**: Feed filtered samples to Zstd
843///    - Uses Zstd's COVER algorithm (fast_cover variant)
844///    - Analyzes n-grams to find common subsequences
845///    - Outputs dictionary up to `DICT_TRAINING_SIZE` (110 KiB)
846///
847/// # Parameters
848///
849/// - `input_path`: Path to the input file to sample from
850/// - `block_size`: Size of each sample block in bytes
851///
852/// # Returns
853///
854/// - `Ok(Vec<u8>)`: Trained dictionary bytes (empty if training fails or no suitable samples)
855/// - `Err(Error)`: I/O error reading input file
856///
857/// # Performance
858///
859/// - **Sampling time**: ~100-500 ms (depends on file size and disk speed)
860/// - **Training time**: ~2-5 seconds for 4000 samples
861/// - **Memory usage**: ~256 MB (sample corpus in RAM)
862///
863/// # Compression Improvement
864///
865/// - **Typical**: 10-30% better ratio vs. no dictionary
866/// - **Best case**: 50%+ improvement for highly structured data (databases)
867/// - **Worst case**: No improvement or slight regression (already compressed data)
868///
869/// # Edge Cases
870///
871/// - **Empty file**: Returns empty dictionary with warning
872/// - **All high-entropy data**: Returns empty dictionary with warning
873/// - **Small files**: May not reach target sample count (trains on available data)
874///
875/// # Examples
876///
877/// Called internally by `pack_snapshot` when `train_dict` is enabled:
878///
879/// ```text
880/// let dict = train_dictionary(Path::new("disk.raw"), 65536)?;
881/// // dict: Vec<u8> containing the trained zstd dictionary
882/// ```
883fn train_dictionary(input_path: &Path, block_size: u32) -> Result<Vec<u8>> {
884    let mut f = File::open(input_path)?;
885    let file_len = f.metadata()?.len();
886
887    let mut samples = Vec::new();
888    let mut buffer = vec![0u8; block_size as usize];
889    let target_samples = DICT_TRAINING_SIZE;
890
891    let step = if file_len > 0 {
892        (file_len / target_samples as u64).max(block_size as u64)
893    } else {
894        0
895    };
896
897    let mut attempts = 0;
898    while samples.len() < target_samples && attempts < target_samples * 2 {
899        let offset = attempts as u64 * step;
900        if offset >= file_len {
901            break;
902        }
903
904        f.seek(SeekFrom::Start(offset))?;
905        let n = f.read(&mut buffer)?;
906        if n == 0 {
907            break;
908        }
909        let chunk = &buffer[..n];
910        let is_zeros = chunk.iter().all(|&b| b == 0);
911
912        if !is_zeros {
913            let entropy = calculate_entropy(chunk);
914            if entropy < ENTROPY_THRESHOLD {
915                samples.push(chunk.to_vec());
916            }
917        }
918        attempts += 1;
919    }
920
921    if samples.is_empty() {
922        tracing::warn!("Input seems to be empty or high entropy. Dictionary will be empty.");
923        Ok(Vec::new())
924    } else {
925        let dict_bytes = ZstdCompressor::train(&samples, DICT_TRAINING_SIZE)?;
926        tracing::info!("Dictionary trained: {} bytes", dict_bytes.len());
927        Ok(dict_bytes)
928    }
929}
930
931/// Processes a single input stream (disk or memory) via the [`SnapshotWriter`].
932fn process_stream<F>(
933    path: PathBuf,
934    is_disk: bool,
935    writer: &mut SnapshotWriter,
936    config: &PackConfig,
937    progress_callback: Option<&F>,
938) -> Result<()>
939where
940    F: Fn(u64, u64),
941{
942    let f = File::open(&path)?;
943    let len = f.metadata()?.len();
944
945    writer.begin_stream(is_disk, len);
946
947    // Use parallel path when enabled and not encrypting (encryption needs sequential nonces)
948    if config.parallel && !config.encrypt {
949        process_stream_parallel(f, len, writer, config, progress_callback)?;
950    } else {
951        process_stream_serial(f, len, writer, config, progress_callback)?;
952    }
953
954    writer.end_stream()?;
955    Ok(())
956}
957
958/// Serial (original) stream processing path.
959fn process_stream_serial<F>(
960    f: File,
961    len: u64,
962    writer: &mut SnapshotWriter,
963    config: &PackConfig,
964    progress_callback: Option<&F>,
965) -> Result<()>
966where
967    F: Fn(u64, u64),
968{
969    let mut logical_pos = 0u64;
970
971    if config.cdc_enabled {
972        let params = DedupeParams {
973            f: (config.avg_chunk as f64).log2() as u32,
974            m: config.min_chunk,
975            z: config.max_chunk,
976            w: 48,
977            v: 8,
978        };
979        let chunker = StreamChunker::new(f, params);
980        for chunk_res in chunker {
981            let chunk = chunk_res?;
982            logical_pos += chunk.len() as u64;
983            writer.write_data_block(&chunk)?;
984            if let Some(callback) = progress_callback {
985                callback(logical_pos, len);
986            }
987        }
988    } else {
989        let mut chunker = FixedChunker::new(f, config.block_size as usize);
990        loop {
991            match chunker.next_chunk() {
992                Ok(Some(chunk)) => {
993                    logical_pos += chunk.len() as u64;
994                    writer.write_data_block(chunk)?;
995                    if let Some(callback) = progress_callback {
996                        callback(logical_pos, len);
997                    }
998                }
999                Ok(None) => break,
1000                Err(e) => return Err(Error::Io(e)),
1001            }
1002        }
1003    }
1004
1005    Ok(())
1006}
1007
1008/// Parallel stream processing: single persistent pipeline for the entire stream.
1009///
1010/// Architecture:
1011/// - Reader thread: reads input file, chunks it, sends to workers
1012/// - N worker threads: compress + BLAKE3 hash chunks in parallel
1013/// - Main thread: receives compressed chunks, reorders via BTreeMap, writes sequentially
1014///
1015/// This avoids per-batch thread pool creation overhead (the old approach created
1016/// ~2800 thread pools for a 180GB file).
1017fn process_stream_parallel<F>(
1018    f: File,
1019    len: u64,
1020    writer: &mut SnapshotWriter,
1021    config: &PackConfig,
1022    progress_callback: Option<&F>,
1023) -> Result<()>
1024where
1025    F: Fn(u64, u64),
1026{
1027    use crate::algo::compression::Compressor;
1028    use crossbeam::channel::bounded;
1029    use std::collections::BTreeMap;
1030    use std::sync::Arc;
1031
1032    let num_workers = if config.num_workers > 0 {
1033        config.num_workers
1034    } else {
1035        num_cpus::get()
1036    };
1037
1038    // Create shared compressor for all workers
1039    let (compressor, _) = create_compressor_from_str(&config.compression, None, None)?;
1040    let compressor: Arc<Box<dyn Compressor + Send + Sync>> = Arc::new(compressor);
1041
1042    // Bounded channels for backpressure: enough to keep workers busy without excessive memory.
1043    // Each in-flight chunk is ~64KB, so num_workers*4 chunks ≈ num_workers*256KB.
1044    let channel_size = num_workers * 4;
1045    let (tx_raw, rx_raw) = bounded::<(u64, RawChunk)>(channel_size);
1046    let (tx_compressed, rx_compressed) = bounded::<(u64, CompressedChunk)>(channel_size);
1047
1048    // Spawn persistent compression workers
1049    let mut workers = Vec::with_capacity(num_workers);
1050    for _ in 0..num_workers {
1051        let rx = rx_raw.clone();
1052        let tx = tx_compressed.clone();
1053        let comp = compressor.clone();
1054        workers.push(std::thread::spawn(move || -> Result<()> {
1055            for (seq, chunk) in rx {
1056                let compressed_data = comp.compress(&chunk.data)?;
1057                let hash = blake3::hash(&chunk.data);
1058                if tx
1059                    .send((
1060                        seq,
1061                        CompressedChunk {
1062                            compressed: compressed_data,
1063                            hash: hash.into(),
1064                            logical_offset: chunk.logical_offset,
1065                            original_size: chunk.data.len(),
1066                        },
1067                    ))
1068                    .is_err()
1069                {
1070                    break; // Receiver dropped, pipeline shutting down
1071                }
1072            }
1073            Ok(())
1074        }));
1075    }
1076
1077    // Drop our copies so channels close when all real holders finish
1078    drop(rx_raw);
1079    drop(tx_compressed);
1080
1081    // Spawn reader thread: reads input, chunks it, feeds workers
1082    let reader_config = config.clone();
1083    let reader = std::thread::spawn(move || -> Result<()> {
1084        let mut seq = 0u64;
1085        let mut logical_pos = 0u64;
1086
1087        if reader_config.cdc_enabled {
1088            let params = DedupeParams {
1089                f: (reader_config.avg_chunk as f64).log2() as u32,
1090                m: reader_config.min_chunk,
1091                z: reader_config.max_chunk,
1092                w: 48,
1093                v: 8,
1094            };
1095            let chunker = StreamChunker::new(f, params);
1096            for chunk_res in chunker {
1097                let chunk = chunk_res?;
1098                let chunk_len = chunk.len();
1099                if tx_raw
1100                    .send((
1101                        seq,
1102                        RawChunk {
1103                            data: chunk,
1104                            logical_offset: logical_pos,
1105                        },
1106                    ))
1107                    .is_err()
1108                {
1109                    break; // Workers shut down
1110                }
1111                logical_pos += chunk_len as u64;
1112                seq += 1;
1113            }
1114        } else {
1115            let mut chunker = FixedChunker::new(f, reader_config.block_size as usize);
1116            loop {
1117                match chunker.next_chunk() {
1118                    Ok(Some(chunk)) => {
1119                        let chunk_len = chunk.len();
1120                        if tx_raw
1121                            .send((
1122                                seq,
1123                                RawChunk {
1124                                    data: chunk.to_vec(),
1125                                    logical_offset: logical_pos,
1126                                },
1127                            ))
1128                            .is_err()
1129                        {
1130                            break; // Workers shut down
1131                        }
1132                        logical_pos += chunk_len as u64;
1133                        seq += 1;
1134                    }
1135                    Ok(None) => break,
1136                    Err(e) => return Err(Error::Io(e)),
1137                }
1138            }
1139        }
1140        Ok(())
1141    });
1142
1143    // Main thread: receive compressed chunks, reorder, write sequentially.
1144    // Workers return chunks out-of-order; BTreeMap restores logical order.
1145    let mut next_seq = 0u64;
1146    let mut reorder_buf: BTreeMap<u64, CompressedChunk> = BTreeMap::new();
1147    let mut write_error: Option<Error> = None;
1148
1149    for (seq, compressed) in rx_compressed.iter() {
1150        reorder_buf.insert(seq, compressed);
1151
1152        // Drain all consecutive chunks ready to write
1153        while let Some(chunk) = reorder_buf.remove(&next_seq) {
1154            match writer.write_precompressed_block(
1155                &chunk.compressed,
1156                &chunk.hash,
1157                chunk.original_size as u32,
1158            ) {
1159                Ok(()) => {
1160                    if let Some(callback) = progress_callback {
1161                        callback(chunk.logical_offset + chunk.original_size as u64, len);
1162                    }
1163                    next_seq += 1;
1164                }
1165                Err(e) => {
1166                    write_error = Some(e);
1167                    break;
1168                }
1169            }
1170        }
1171        if write_error.is_some() {
1172            break;
1173        }
1174    }
1175
1176    // Drop receiver to unblock workers/reader if we exited early due to write error.
1177    // This causes workers' send() to fail → workers exit → reader's send() fails → reader exits.
1178    drop(rx_compressed);
1179
1180    // Wait for all threads to finish
1181    let reader_result = reader
1182        .join()
1183        .map_err(|_| Error::Io(std::io::Error::other("Reader thread panicked")))?;
1184
1185    for worker in workers {
1186        worker
1187            .join()
1188            .map_err(|_| Error::Io(std::io::Error::other("Worker thread panicked")))?
1189            .ok(); // Ignore worker errors if we already have a write error
1190    }
1191
1192    // Propagate errors (write errors take priority)
1193    if let Some(e) = write_error {
1194        return Err(e);
1195    }
1196    reader_result?;
1197
1198    Ok(())
1199}
1200
#[cfg(test)]
mod tests {
    //! Unit tests for the entropy helper, `FixedChunker`, and `PackConfig`.

    use super::*;
    use std::io::Cursor;

    /// Empty input yields zero entropy.
    #[test]
    fn test_calculate_entropy_empty() {
        assert_eq!(calculate_entropy(&[]), 0.0);
    }

    /// A buffer holding a single repeated byte value has (near) zero entropy.
    #[test]
    fn test_calculate_entropy_uniform() {
        // All same byte - lowest entropy
        let data = vec![0x42; 1000];
        let entropy = calculate_entropy(&data);
        assert!(
            entropy < 0.01,
            "Entropy should be near 0.0 for uniform data"
        );
    }

    /// Two equally frequent byte values give about one bit per byte.
    #[test]
    fn test_calculate_entropy_binary() {
        // Two values - low entropy
        let mut data = vec![0u8; 500];
        data.extend(vec![1u8; 500]);
        let entropy = calculate_entropy(&data);
        assert!(
            entropy > 0.9 && entropy < 1.1,
            "Entropy should be ~1.0 for binary data"
        );
    }

    /// All 256 byte values, equally frequent, give high entropy.
    #[test]
    fn test_calculate_entropy_random() {
        // All 256 values - high entropy
        let data: Vec<u8> = (0..=255).cycle().take(256 * 4).collect();
        let entropy = calculate_entropy(&data);
        assert!(
            entropy > 7.5,
            "Entropy should be high for all byte values: got {}",
            entropy
        );
    }

    /// A single byte carries no entropy.
    #[test]
    fn test_calculate_entropy_single_byte() {
        assert_eq!(calculate_entropy(&[42]), 0.0);
    }

    /// Two distinct bytes, one occurrence each, give about one bit per byte.
    #[test]
    fn test_calculate_entropy_two_different_bytes() {
        let data = vec![0, 255];
        let entropy = calculate_entropy(&data);
        assert!(entropy > 0.9 && entropy < 1.1, "Entropy should be ~1.0");
    }

    /// Input length is an exact multiple of the block size.
    #[test]
    fn test_fixed_chunker_exact_blocks() {
        let data = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 4);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], vec![1, 2, 3, 4]);
        assert_eq!(chunks[1], vec![5, 6, 7, 8]);
    }

    /// The trailing chunk is shorter when the input does not divide evenly.
    #[test]
    fn test_fixed_chunker_partial_last_block() {
        let data = vec![1, 2, 3, 4, 5];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 3);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], vec![1, 2, 3]);
        assert_eq!(chunks[1], vec![4, 5]);
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_fixed_chunker_empty_input() {
        let data = Vec::<u8>::new();
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 1024);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 0);
    }

    /// A block size of one yields one chunk per input byte.
    #[test]
    fn test_fixed_chunker_single_byte_blocks() {
        let data = vec![1, 2, 3];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 1);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], vec![1]);
        assert_eq!(chunks[1], vec![2]);
        assert_eq!(chunks[2], vec![3]);
    }

    /// A block size larger than the input yields the whole input as one chunk.
    #[test]
    fn test_fixed_chunker_large_block_size() {
        let data = vec![1, 2, 3, 4, 5];
        let cursor = Cursor::new(data.clone());
        let chunker = FixedChunker::new(cursor, 10000);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], data);
    }

    /// Pins the documented defaults of `PackConfig`.
    #[test]
    fn test_pack_config_default() {
        let config = PackConfig::default();

        assert_eq!(config.compression, "lz4");
        assert!(!config.encrypt);
        assert_eq!(config.password, None);
        assert!(!config.train_dict);
        assert_eq!(config.block_size, 65536);
        assert!(!config.cdc_enabled);
        assert_eq!(config.min_chunk, 16384);
        assert_eq!(config.avg_chunk, 65536);
        assert_eq!(config.max_chunk, 131072);
    }

    /// A cloned config carries the same field values as its source.
    #[test]
    fn test_pack_config_clone() {
        let config1 = PackConfig {
            disk: Some(PathBuf::from("/dev/sda")),
            output: PathBuf::from("output.hxz"),
            compression: "zstd".to_string(),
            encrypt: true,
            password: Some("secret".to_string()),
            ..Default::default()
        };

        let config2 = config1.clone();

        assert_eq!(config2.disk, config1.disk);
        assert_eq!(config2.output, config1.output);
        assert_eq!(config2.compression, config1.compression);
        assert_eq!(config2.encrypt, config1.encrypt);
        assert_eq!(config2.password, config1.password);
    }

    /// The `Debug` output names the type and shows the default compression.
    #[test]
    fn test_pack_config_debug() {
        let config = PackConfig::default();
        let debug_str = format!("{:?}", config);

        assert!(debug_str.contains("PackConfig"));
        assert!(debug_str.contains("lz4"));
    }

    /// Checks both sides of `ENTROPY_THRESHOLD`.
    #[test]
    fn test_entropy_threshold_filtering() {
        // Below the threshold: a single repeated byte value.
        let low_entropy_data = vec![0u8; 1024];
        assert!(calculate_entropy(&low_entropy_data) < ENTROPY_THRESHOLD);

        // Above the threshold: 7 is coprime to 256, so `(i * 7) % 256` over
        // 0..1024 hits every byte value exactly four times. That is the same
        // uniform distribution `test_calculate_entropy_random` uses, so the
        // result is deterministic rather than "probably high".
        let high_entropy_data: Vec<u8> = (0..1024).map(|i| ((i * 7) % 256) as u8).collect();
        let entropy = calculate_entropy(&high_entropy_data);
        assert!((0.0..=8.0).contains(&entropy));
        assert!(
            entropy > ENTROPY_THRESHOLD,
            "Uniform byte distribution should exceed the entropy threshold: got {}",
            entropy
        );
    }

    /// Entropy grows as the number of equally frequent distinct values grows.
    #[test]
    fn test_entropy_calculation_properties() {
        // 1, 2 and 10 distinct values respectively, each equally frequent.
        let data1 = vec![0u8; 100];
        let data2 = [0u8, 1u8].repeat(50);
        let data3: Vec<u8> = (0..100).map(|i| (i % 10) as u8).collect();

        let entropy1 = calculate_entropy(&data1);
        let entropy2 = calculate_entropy(&data2);
        let entropy3 = calculate_entropy(&data3);

        assert!(
            entropy1 < entropy2,
            "More unique values should increase entropy"
        );
        assert!(
            entropy2 < entropy3,
            "Even more unique values should further increase entropy"
        );
    }

    /// Chunking never loses or invents bytes, whatever the block size.
    #[test]
    fn test_fixed_chunker_with_different_sizes() {
        let data = vec![0u8; 10000];

        // Test with various chunk sizes
        for chunk_size in [64, 256, 1024, 4096, 65536] {
            let cursor = Cursor::new(data.clone());
            let chunker = FixedChunker::new(cursor, chunk_size);

            let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

            // Verify total data matches
            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
            assert_eq!(
                total_len,
                data.len(),
                "Total chunked data should match original for chunk_size={}",
                chunk_size
            );

            // Verify all except possibly last chunk have correct size
            for (i, chunk) in chunks.iter().enumerate() {
                if i < chunks.len() - 1 {
                    assert_eq!(
                        chunk.len(),
                        chunk_size,
                        "Non-final chunks should be exactly chunk_size"
                    );
                } else {
                    assert!(
                        chunk.len() <= chunk_size,
                        "Final chunk should be <= chunk_size"
                    );
                }
            }
        }
    }
}
hexz_core/ops/pack.rs

hexz_core/ops/
pack.rs