hexz_core/ops/
pack.rs

1//! High-level snapshot packing operations.
2//!
3//! This module implements the core business logic for creating Hexz snapshot files
4//! from raw disk and memory images. It orchestrates a multi-stage pipeline that
5//! transforms raw input data into compressed, indexed, and optionally encrypted
6//! snapshot files optimized for fast random access and deduplication.
7//!
8//! # Core Capabilities
9//!
10//! - **Dictionary Training**: Intelligent sampling and Zstd dictionary optimization
11//! - **Chunking Strategies**: Fixed-size blocks or content-defined (FastCDC) for better deduplication
12//! - **Compression**: LZ4 (fast) or Zstd (high-ratio) with optional dictionary support
13//! - **Encryption**: Per-block AES-256-GCM authenticated encryption
14//! - **Deduplication**: BLAKE3 based content deduplication (disabled for encrypted data)
15//! - **Hierarchical Indexing**: Two-level index structure for efficient random access
16//! - **Progress Reporting**: Optional callback interface for UI integration
17//!
18//! # Architecture
19//!
20//! The packing process follows a carefully orchestrated pipeline. Each stage is designed
21//! to be memory-efficient (streaming) and to minimize write amplification:
22//!
23//! ```text
24//! ┌─────────────────────────────────────────────────────────────────────┐
25//! │ Stage 1: Dictionary Training (Optional, Zstd only)                  │
26//! │                                                                      │
27//! │  Input File → Stratified Sampling → Entropy Filtering → Zstd Train │
28//! │                                                                      │
29//! │  - Samples ~4000 blocks evenly distributed across input             │
30//! │  - Filters out zero blocks and high-entropy data (>6.0 bits/byte)   │
31//! │  - Produces dictionary (max 110 KiB) optimized for dataset          │
32//! │  - Training time: 2-5 seconds for typical VM images                 │
33//! └─────────────────────────────────────────────────────────────────────┘
34//!                                  ↓
35//! ┌─────────────────────────────────────────────────────────────────────┐
36//! │ Stage 2: Stream Processing (Per Input: Disk, Memory)                │
37//! │                                                                      │
38//! │  Raw Input → Chunking → Compression → Encryption → Dedup → Write   │
39//! │                                                                      │
40//! │  Chunking:                                                           │
41//! │   - Fixed-size: Divide into equal blocks (default 64 KiB)           │
42//! │   - FastCDC: Content-defined boundaries for better deduplication    │
43//! │                                                                      │
44//! │  Zero Block Optimization:                                            │
45//! │   - Detect all-zero chunks (common in VM images)                    │
46//! │   - Store as metadata only (offset=0, length=0)                     │
47//! │   - Saves significant space for sparse images                       │
48//! │                                                                      │
49//! │  Deduplication (Unencrypted only):                                  │
50//! │   - Compute BLAKE3 hash of compressed data                           │
51//! │   - Check hash table for existing block                             │
52//! │   - Reuse offset if duplicate found                                 │
53//! │   - Note: Disabled for encrypted data (unique nonces prevent dedup) │
54//! │                                                                      │
55//! │  Index Page Building:                                                │
56//! │   - Accumulate BlockInfo metadata (offset, length, checksum)        │
//! │   - Flush page when reaching 4096 entries (~256 MB logical data)    │
58//! │   - Write serialized page to output, record PageEntry               │
59//! └─────────────────────────────────────────────────────────────────────┘
60//!                                  ↓
61//! ┌─────────────────────────────────────────────────────────────────────┐
62//! │ Stage 3: Index Finalization                                          │
63//! │                                                                      │
64//! │  MasterIndex (disk_pages[], memory_pages[], sizes) → Serialize      │
65//! │                                                                      │
66//! │  - Collect all PageEntry records from both streams                  │
67//! │  - Write master index at end of file                                │
68//! │  - Record index offset in header                                    │
69//! └─────────────────────────────────────────────────────────────────────┘
70//!                                  ↓
71//! ┌─────────────────────────────────────────────────────────────────────┐
72//! │ Stage 4: Header Writing                                              │
73//! │                                                                      │
74//! │  - Seek to file start (reserved 512 bytes)                          │
75//! │  - Write Header with format metadata                          │
76//! │  - Includes: compression type, encryption params, index offset      │
77//! │  - Flush to ensure atomicity                                        │
78//! └─────────────────────────────────────────────────────────────────────┘
79//! ```
80//!
81//! # Optimization Strategies
82//!
83//! ## Dictionary Training Algorithm
84//!
85//! The dictionary training process improves compression ratios by 10-30% for
86//! structured data (file systems, databases) by building a Zstd shared dictionary:
87//!
88//! 1. **Stratified Sampling**: Sample blocks evenly across input to capture diversity
89//!    - Step size = file_size / target_samples (typically 4000 samples)
90//!    - Ensures coverage of different file system regions
91//!
92//! 2. **Quality Filtering**: Exclude unsuitable blocks
93//!    - Skip all-zero blocks (no compressible patterns)
94//!    - Compute Shannon entropy for each block
95//!    - Reject blocks with entropy > 6.0 bits/byte (likely encrypted/random)
96//!
97//! 3. **Training**: Feed filtered samples to Zstd dictionary builder
98//!    - Target dictionary size: 110 KiB (fits in L2 cache)
99//!    - Uses Zstd's COVER algorithm to extract common patterns
100//!
101//! ## Deduplication Mechanism
102//!
103//! Content-based deduplication eliminates redundant blocks:
104//!
105//! - **Hash Table**: Maps BLAKE3 hash → physical offset for each unique compressed block
106//! - **Collision Handling**: BLAKE3 collisions are astronomically unlikely (2^128 blocks)
107//! - **Memory Usage**: ~48 bytes per unique block (32-byte hash + 8-byte offset + HashMap overhead)
108//! - **Write Behavior**: Only write each unique block once; reuse offset for duplicates
109//! - **Encryption Interaction**: Disabled when encrypting (each block gets unique nonce/ciphertext)
110//!
111//! ## Index Page Management
112//!
113//! The two-level index hierarchy balances random access performance and metadata overhead:
114//!
115//! - **Page Size**: 4096 entries per page
116//!   - With 64 KiB blocks: Each page covers ~256 MB of logical data
117//!   - Serialized page size: ~64 KiB (fits in L2 cache)
118//!
119//! - **Flushing Strategy**: Eager flush when page fills
120//!   - Prevents memory growth during large packs
121//!   - Enables streaming operation (constant memory)
122//!
123//! - **Master Index**: Array of PageEntry records
124//!   - Binary search for O(log N) page lookup
125//!   - Typical overhead: 1 KiB per GB of data
126//!
127//! # Memory Usage Patterns
128//!
129//! The packing operation is designed for constant memory usage regardless of input size:
130//!
131//! - **Chunking Buffer**: 1 block (64 KiB default)
132//! - **Compression Output**: ~1.5× block size (worst case: incompressible data)
133//! - **Current Index Page**: Up to 4096 × 20 bytes = 80 KiB
134//! - **Deduplication Map**: ~48 bytes × unique_blocks
//!   - Example: 10 GB image (64 KiB blocks) with 50% dedup ≈ 82K unique blocks ≈ 4 MB HashMap
136//! - **Dictionary**: 110 KiB (if trained)
137//!
138//! Total typical memory: 100-200 MB for dedup hash table + ~1 MB working set.
139//!
140//! # Error Recovery
141//!
142//! The packing operation is not atomic. On failure:
143//!
144//! - **Partial File**: Output file is left in incomplete state
145//! - **Header Invalid**: Header is written last, so partial packs have zeroed header
146//! - **Detection**: Readers validate magic bytes and header checksum
147//! - **Recovery**: None; must delete partial file and retry pack operation
148//!
149//! Future enhancement: Two-phase commit with temporary file + atomic rename.
150//!
151//! # Usage Contexts
152//!
153//! This module is designed to be called from multiple contexts:
154//!
155//! - **CLI Commands**: `hexz data pack` (with terminal progress bars)
156//! - **Python Bindings**: `hexz.pack()` (with optional callbacks)
157//! - **Rust Applications**: Direct API usage for embedded scenarios
158//!
159//! By keeping pack operations separate from UI/CLI code, we avoid pulling in
160//! heavy dependencies (`clap`, `indicatif`) into library contexts.
161//!
162//! # Examples
163//!
164//! ## Basic Packing (LZ4, No Encryption)
165//!
166//! ```no_run
167//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
168//! use std::path::PathBuf;
169//!
170//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
171//! let config = PackConfig {
172//!     disk: Some(PathBuf::from("disk.raw")),
173//!     memory: None,
174//!     output: PathBuf::from("snapshot.hxz"),
175//!     compression: "lz4".to_string(),
176//!     ..Default::default()
177//! };
178//!
179//! pack_snapshot::<fn(u64, u64)>(config, None)?;
180//! # Ok(())
181//! # }
182//! ```
183//!
184//! ## Advanced Packing (Zstd with Dictionary, CDC, Encryption)
185//!
186//! ```no_run
187//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
188//! use std::path::PathBuf;
189//!
190//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
191//! let config = PackConfig {
192//!     disk: Some(PathBuf::from("ubuntu.qcow2")),
193//!     output: PathBuf::from("ubuntu.hxz"),
194//!     compression: "zstd".to_string(),
195//!     train_dict: true,         // Train dictionary for better ratio
196//!     cdc_enabled: true,        // Content-defined chunking
197//!     encrypt: true,
198//!     password: Some("secure_passphrase".to_string()),
199//!     min_chunk: 16384,         // 16 KiB minimum chunk
200//!     avg_chunk: 65536,         // 64 KiB average chunk
201//!     max_chunk: 262144,        // 256 KiB maximum chunk
202//!     ..Default::default()
203//! };
204//!
205//! pack_snapshot::<fn(u64, u64)>(config, None)?;
206//! # Ok(())
207//! # }
208//! ```
209//!
210//! ## Progress Reporting
211//!
212//! ```no_run
213//! use hexz_core::ops::pack::{pack_snapshot, PackConfig};
214//! use std::path::PathBuf;
215//!
216//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
217//! let config = PackConfig {
218//!     disk: Some(PathBuf::from("disk.raw")),
219//!     output: PathBuf::from("snapshot.hxz"),
220//!     ..Default::default()
221//! };
222//!
223//! // Callback receives (current_logical_pos, total_size)
224//! pack_snapshot(config, Some(|pos, total| {
225//!     let pct = (pos as f64 / total as f64) * 100.0;
226//!     println!("Packing: {:.1}%", pct);
227//! }))?;
228//! # Ok(())
229//! # }
230//! ```
231//!
232//! # Performance Characteristics
233//!
234//! ## Throughput (Single-Threaded, i7-14700K)
235//!
236//! Validated benchmarks (see `docs/project-docs/BENCHMARKS.md` for details):
237//!
238//! - **LZ4 Compression**: 22 GB/s (minimal CPU overhead)
239//! - **LZ4 Decompression**: 31 GB/s
240//! - **Zstd Level 3 Compression**: 8.7 GB/s
241//! - **Zstd Level 3 Decompression**: 12.9 GB/s
242//! - **BLAKE3 Hashing**: 5.3 GB/s (2.2× faster than SHA-256)
243//! - **SHA-256 Hashing**: 2.5 GB/s
244//! - **FastCDC Chunking**: 2.7 GB/s (gear-based rolling hash)
245//! - **AES-256-GCM Encryption**: 2.1 GB/s (hardware AES-NI acceleration)
246//! - **Pack Throughput (LZ4, no CDC)**: 4.9 GB/s (64KB blocks)
247//! - **Pack Throughput (LZ4 + CDC)**: 1.9 GB/s (CDC adds 2.6× overhead)
248//! - **Pack Throughput (Zstd-3)**: 1.6 GB/s
249//! - **Block Size Impact**: 2.3 GB/s (4KB) → 4.7 GB/s (64KB) → 5.1 GB/s (1MB)
250//!
251//! Typical bottleneck: CDC chunking (when enabled) or compression CPU time. SSD I/O rarely limits.
252//!
253//! Run benchmarks: `cargo bench --bench compression`, `cargo bench --bench hashing`, `cargo bench --bench cdc_chunking`, `cargo bench --bench encryption`, `cargo bench --bench write_throughput`, and `cargo bench --bench block_size_tradeoffs`
254//!
255//! ## Compression Ratios (Typical VM Images)
256//!
257//! - **LZ4**: 2-3× (fast but lower ratio)
258//! - **Zstd Level 3**: 3-5× (good balance)
259//! - **Zstd + Dictionary**: 4-7× (+30% improvement from dictionary)
260//! - **CDC Deduplication**: Not validated - need benchmark comparing CDC vs fixed-size chunking
261//!
262//! ## Time Estimates (64 GB VM Image, Single Thread)
263//!
264//! - **LZ4, Fixed Blocks**: ~30-45 seconds
265//! - **Zstd, Fixed Blocks**: ~2-3 minutes
266//! - **Zstd + Dictionary + CDC**: ~3-5 minutes (includes 2-5s training time)
267//!
268//! # Atomicity and Crash Safety
269//!
270//! **WARNING**: Pack operations are NOT atomic. If interrupted:
271//!
272//! - Output file is left in a partially written state
273//! - The header (written last) will be all zeros
274//! - Readers will reject the file due to invalid magic bytes
275//! - Manual cleanup is required (delete partial file)
276//!
277//! For production use cases requiring atomicity, write to a temporary file and
278//! perform an atomic rename after successful completion.
279
280use hexz_common::constants::{DICT_TRAINING_SIZE, ENTROPY_THRESHOLD};
281use hexz_common::crypto::KeyDerivationParams;
282use hexz_common::{Error, Result};
283use std::fs::File;
284use std::io::{Read, Seek, SeekFrom};
285use std::path::{Path, PathBuf};
286
287use crate::algo::compression::{create_compressor_from_str, zstd::ZstdCompressor};
288use crate::algo::dedup::cdc::StreamChunker;
289use crate::algo::dedup::dcam::DedupeParams;
290use crate::algo::encryption::{Encryptor, aes_gcm::AesGcmEncryptor};
291use crate::ops::snapshot_writer::SnapshotWriter;
292
293/// Configuration parameters for snapshot packing.
294///
295/// This struct encapsulates all settings for the packing process. It's designed
296/// to be easily constructed from CLI arguments or programmatic APIs.
297///
298/// # Examples
299///
300/// ```
301/// use hexz_core::ops::pack::PackConfig;
302/// use std::path::PathBuf;
303///
304/// // Basic configuration with defaults
305/// let config = PackConfig {
306///     disk: Some(PathBuf::from("disk.img")),
307///     output: PathBuf::from("snapshot.hxz"),
308///     ..Default::default()
309/// };
310///
311/// // Advanced configuration with CDC and encryption
312/// let advanced = PackConfig {
313///     disk: Some(PathBuf::from("disk.img")),
314///     output: PathBuf::from("snapshot.hxz"),
315///     compression: "zstd".to_string(),
316///     encrypt: true,
317///     password: Some("secret".to_string()),
318///     cdc_enabled: true,
319///     min_chunk: 16384,
320///     avg_chunk: 65536,
321///     max_chunk: 131072,
322///     ..Default::default()
323/// };
324/// ```
#[derive(Debug, Clone)]
pub struct PackConfig {
    /// Disk image to pack, if any.
    pub disk: Option<PathBuf>,
    /// Memory image to pack, if any.
    pub memory: Option<PathBuf>,
    /// Destination path of the snapshot file.
    pub output: PathBuf,
    /// Name of the compression algorithm (`"lz4"` or `"zstd"`).
    pub compression: String,
    /// Whether the snapshot is encrypted.
    pub encrypt: bool,
    /// Encryption password; must be `Some` when `encrypt` is `true`.
    pub password: Option<String>,
    /// Whether to train a compression dictionary (only honoured for `"zstd"`).
    pub train_dict: bool,
    /// Block size in bytes.
    pub block_size: u32,
    /// Whether content-defined chunking (CDC) is enabled.
    pub cdc_enabled: bool,
    /// Minimum CDC chunk size in bytes.
    pub min_chunk: u32,
    /// Average CDC chunk size in bytes.
    pub avg_chunk: u32,
    /// Maximum CDC chunk size in bytes.
    pub max_chunk: u32,
}

impl Default for PackConfig {
    /// Returns the baseline configuration: no inputs, `output.hxz` as the
    /// destination, LZ4 compression, no encryption, no dictionary training,
    /// 64 KiB blocks, CDC disabled with 16/64/128 KiB chunk bounds.
    fn default() -> Self {
        const KIB: u32 = 1024;

        PackConfig {
            // Inputs and output.
            disk: None,
            memory: None,
            output: PathBuf::from("output.hxz"),

            // Compression.
            compression: String::from("lz4"),
            train_dict: false,

            // Encryption.
            encrypt: false,
            password: None,

            // Chunking.
            block_size: 64 * KIB,
            cdc_enabled: false,
            min_chunk: 16 * KIB,
            avg_chunk: 64 * KIB,
            max_chunk: 128 * KIB,
        }
    }
}
371
372/// Calculates Shannon entropy of a byte slice.
373///
374/// Shannon entropy measures the "randomness" or information content of data:
375/// - **0.0**: All bytes are identical (highly compressible)
376/// - **8.0**: Maximum entropy, random data (incompressible)
377///
378/// # Formula
379///
380/// ```text
381/// H(X) = -Σ p(x) * log2(p(x))
382/// ```
383///
384/// Where `p(x)` is the frequency of each byte value.
385///
386/// # Usage
387///
388/// Used during dictionary training to filter out high-entropy (random) blocks
389/// that wouldn't benefit from compression. Only blocks with entropy below
390/// `ENTROPY_THRESHOLD` are included in the training set.
391///
392/// # Parameters
393///
394/// - `data`: Byte slice to analyze
395///
396/// # Returns
397///
398/// Entropy value from 0.0 (homogeneous) to 8.0 (random).
399///
400/// # Examples
401///
402/// ```
403/// # use hexz_core::ops::pack::calculate_entropy;
404/// // Homogeneous data (low entropy)
405/// let zeros = vec![0u8; 1024];
406/// let entropy = calculate_entropy(&zeros);
407/// assert_eq!(entropy, 0.0);
408///
409/// // Random data (high entropy)
410/// let random: Vec<u8> = (0..=255).cycle().take(1024).collect();
411/// let entropy = calculate_entropy(&random);
412/// assert!(entropy > 7.0);
413/// ```
pub fn calculate_entropy(data: &[u8]) -> f64 {
    // An empty slice carries no information; also avoids dividing by zero below.
    if data.is_empty() {
        return 0.0;
    }

    // Histogram of byte values. `usize` counters cannot overflow: a bucket
    // never holds more than `data.len()` occurrences, which itself fits in
    // `usize`. (Narrower counters would wrap in release builds on huge inputs.)
    let mut histogram = [0usize; 256];
    for &byte in data {
        histogram[usize::from(byte)] += 1;
    }

    let total = data.len() as f64;

    // H(X) = -Σ p(x) * log2(p(x)), skipping byte values that never occur
    // (their contribution is defined as zero and log2(0) is -inf).
    histogram
        .iter()
        .filter(|&&count| count > 0)
        .fold(0.0, |entropy, &count| {
            let p = count as f64 / total;
            entropy - p * p.log2()
        })
}
436
/// Common abstraction over chunk sources (fixed-size or content-defined).
///
/// A chunker is simply an iterator whose items are `std::io::Result<Vec<u8>>`:
/// each successful item is one chunk of input bytes. The trait has no methods
/// of its own; it exists so packing code can name "any chunk iterator" without
/// caring which chunking strategy produced it.
trait Chunker: Iterator<Item = std::io::Result<Vec<u8>>> {}

/// Blanket implementation: every iterator with the right item type is a `Chunker`.
impl<I> Chunker for I where I: Iterator<Item = std::io::Result<Vec<u8>>> {}
443
/// Fixed-size block chunker.
///
/// Splits input into equal-sized blocks (except possibly the last one).
/// Simpler and faster than CDC, but less effective for deduplication.
///
/// Uses a read loop to guarantee full blocks, avoiding short reads from
/// pipes, network streams, or OS buffering. Reads that fail with
/// [`std::io::ErrorKind::Interrupted`] are retried rather than reported,
/// so an interrupted read never drops an already partially filled block.
///
/// A `block_size` of zero produces an iterator that yields no chunks.
pub struct FixedChunker<R> {
    /// Underlying byte source.
    reader: R,
    /// Number of bytes in every emitted block (the last one may be shorter).
    block_size: usize,
}

impl<R: Read> FixedChunker<R> {
    /// Creates a new fixed-size chunker that reads `block_size`-byte blocks
    /// from `reader`.
    pub fn new(reader: R, block_size: usize) -> Self {
        Self { reader, block_size }
    }
}

impl<R: Read> Iterator for FixedChunker<R> {
    type Item = std::io::Result<Vec<u8>>;

    /// Reads the next block.
    ///
    /// Returns `None` once the reader is exhausted, `Some(Ok(block))` with up
    /// to `block_size` bytes otherwise, or `Some(Err(e))` if the reader fails
    /// with anything other than `ErrorKind::Interrupted`.
    fn next(&mut self) -> Option<Self::Item> {
        let mut block = vec![0u8; self.block_size];
        let mut filled = 0;

        // Keep reading until the block is full or the reader reports EOF.
        while filled < block.len() {
            match self.reader.read(&mut block[filled..]) {
                Ok(0) => break,
                Ok(n) => filled += n,
                // `Read::read` documents `Interrupted` as non-fatal: retry.
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(e) => return Some(Err(e)),
            }
        }

        if filled == 0 {
            return None;
        }

        // Final block of the stream may be short; drop the unused tail.
        block.truncate(filled);
        Some(Ok(block))
    }
}
484
485/// Packs a snapshot file from disk and/or memory images.
486///
487/// This is the main entry point for creating Hexz snapshot files. It orchestrates
488/// the complete packing pipeline: dictionary training, stream processing, index
489/// building, and header finalization.
490///
491/// # Workflow
492///
493/// 1. **Validation**: Ensure at least one input (disk or memory) is provided
494/// 2. **File Creation**: Create output file, reserve 512 bytes for header
495/// 3. **Dictionary Training**: If requested (Zstd only), train dictionary from input samples
496/// 4. **Dictionary Writing**: If trained, write dictionary immediately after header
497/// 5. **Compressor Initialization**: Create LZ4 or Zstd compressor (with optional dictionary)
498/// 6. **Encryptor Initialization**: If requested, derive key from password using PBKDF2
499/// 7. **Stream Processing**: Process disk stream (if provided), then memory stream (if provided)
500///    - Each stream independently chunks, compresses, encrypts, deduplicates, and indexes
501/// 8. **Master Index Writing**: Serialize master index (all PageEntry records) to end of file
502/// 9. **Header Writing**: Seek to start, write complete header with metadata and offsets
503/// 10. **Flush**: Ensure all data is written to disk
504///
505/// # Parameters
506///
507/// - `config`: Packing configuration parameters (see [`PackConfig`])
508/// - `progress_callback`: Optional callback for progress reporting
509///   - Called frequently during stream processing (~once per 64 KiB)
510///   - Signature: `Fn(logical_pos: u64, total_size: u64)`
511///   - Example: `|pos, total| println!("Progress: {:.1}%", (pos as f64 / total as f64) * 100.0)`
512///
513/// # Returns
514///
515/// - `Ok(())`: Snapshot packed successfully
516/// - `Err(Error::Io)`: I/O error (file access, disk full, permission denied)
517/// - `Err(Error::Compression)`: Compression error (unlikely, usually indicates invalid state)
518/// - `Err(Error::Encryption)`: Encryption error (invalid password format, crypto failure)
519///
520/// # Errors
521///
522/// This function can fail for several reasons:
523///
524/// ## I/O Errors
525///
526/// - **Input file not found**: `config.disk` or `config.memory` path doesn't exist
527/// - **Permission denied**: Cannot read input or write output
528/// - **Disk full**: Insufficient space for output file
529/// - **Output exists**: May overwrite existing file without warning
530///
531/// ## Configuration Errors
532///
533/// - **No inputs**: Neither `disk` nor `memory` is provided
534/// - **Missing password**: `encrypt = true` but `password = None`
535/// - **Invalid block size**: Block size too small (<1 KiB) or too large (>16 MiB)
/// - **Invalid CDC params**: `min_chunk <= avg_chunk <= max_chunk` constraint violated
537///
538/// ## Compression/Encryption Errors
539///
540/// - **Dictionary training failure**: Zstd training fails (rare, usually on corrupted input)
541/// - **Compression failure**: Compressor returns error (rare, usually indicates bug)
542/// - **Encryption failure**: Key derivation or cipher initialization fails
543///
544/// # Examples
545///
546/// ## Basic Usage
547///
548/// ```no_run
549/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
550/// use std::path::PathBuf;
551///
552/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
553/// let config = PackConfig {
554///     disk: Some(PathBuf::from("disk.raw")),
555///     output: PathBuf::from("snapshot.hxz"),
556///     ..Default::default()
557/// };
558///
559/// pack_snapshot::<fn(u64, u64)>(config, None)?;
560/// # Ok(())
561/// # }
562/// ```
563///
564/// ## With Progress Reporting
565///
566/// ```no_run
567/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
568/// use std::path::PathBuf;
569///
570/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
571/// let config = PackConfig {
572///     disk: Some(PathBuf::from("ubuntu.qcow2")),
573///     output: PathBuf::from("ubuntu.hxz"),
574///     compression: "zstd".to_string(),
575///     train_dict: true,
576///     ..Default::default()
577/// };
578///
579/// pack_snapshot(config, Some(|pos, total| {
580///     eprint!("\rPacking: {:.1}%", (pos as f64 / total as f64) * 100.0);
581/// }))?;
582/// eprintln!("\nDone!");
583/// # Ok(())
584/// # }
585/// ```
586///
587/// ## Encrypted Snapshot
588///
589/// ```no_run
590/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
591/// use std::path::PathBuf;
592///
593/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
594/// let config = PackConfig {
595///     disk: Some(PathBuf::from("sensitive.raw")),
596///     output: PathBuf::from("sensitive.hxz"),
597///     encrypt: true,
598///     password: Some("strong_passphrase".to_string()),
599///     ..Default::default()
600/// };
601///
602/// pack_snapshot::<fn(u64, u64)>(config, None)?;
603/// println!("Encrypted snapshot created");
604/// # Ok(())
605/// # }
606/// ```
607///
608/// ## Content-Defined Chunking for Deduplication
609///
610/// ```no_run
611/// use hexz_core::ops::pack::{pack_snapshot, PackConfig};
612/// use std::path::PathBuf;
613///
614/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
615/// let config = PackConfig {
616///     disk: Some(PathBuf::from("incremental-backup.raw")),
617///     output: PathBuf::from("backup.hxz"),
618///     cdc_enabled: true,
619///     min_chunk: 16384,   // 16 KiB
620///     avg_chunk: 65536,   // 64 KiB
621///     max_chunk: 262144,  // 256 KiB
622///     ..Default::default()
623/// };
624///
625/// pack_snapshot::<fn(u64, u64)>(config, None)?;
626/// # Ok(())
627/// # }
628/// ```
629///
630/// # Performance
631///
632/// See module-level documentation for detailed performance characteristics.
633///
634/// Typical throughput for a 64 GB VM image on modern hardware (Intel i7, NVMe SSD):
635///
636/// - **LZ4, no encryption**: ~2 GB/s (~30 seconds total)
637/// - **Zstd level 3, no encryption**: ~500 MB/s (~2 minutes total)
638/// - **Zstd + dictionary + CDC**: ~400 MB/s (~3 minutes including training)
639///
640/// # Atomicity
641///
642/// This operation is NOT atomic. On failure, the output file will be left in a
643/// partially written state. The file header is written last, so incomplete files
644/// will have an all-zero header and will be rejected by readers.
645///
646/// For atomic pack operations, write to a temporary file and perform an atomic
647/// rename after success:
648///
649/// ```no_run
650/// # use hexz_core::ops::pack::{pack_snapshot, PackConfig};
651/// # use std::path::PathBuf;
652/// # use std::fs;
653/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let config = PackConfig {
///     disk: Some(PathBuf::from("disk.raw")),
///     output: PathBuf::from("snapshot.hxz.tmp"),
///     ..Default::default()
/// };
///
/// pack_snapshot::<fn(u64, u64)>(config, None)?;
/// fs::rename("snapshot.hxz.tmp", "snapshot.hxz")?;
662/// # Ok(())
663/// # }
664/// ```
665///
666/// # Thread Safety
667///
668/// This function is not thread-safe with respect to the output file. Do not call
669/// `pack_snapshot` concurrently with the same output path. Concurrent packing to
670/// different output files is safe.
671///
/// The progress callback must be `Send + Sync`; this bound on `F` is required
/// regardless of which thread calls the function.
674pub fn pack_snapshot<F>(config: PackConfig, progress_callback: Option<F>) -> Result<()>
675where
676    F: Fn(u64, u64) + Send + Sync,
677{
678    // Validate inputs
679    if config.disk.is_none() && config.memory.is_none() {
680        return Err(Error::Io(std::io::Error::new(
681            std::io::ErrorKind::InvalidInput,
682            "At least one input (disk or memory) must be provided",
683        )));
684    }
685
686    // Train compression dictionary if requested
687    let dictionary = if config.compression == "zstd" && config.train_dict {
688        Some(train_dictionary(
689            config
690                .disk
691                .as_ref()
692                .or(config.memory.as_ref())
693                .ok_or_else(|| {
694                    Error::Io(std::io::Error::new(
695                        std::io::ErrorKind::InvalidInput,
696                        "No input file available for dictionary training",
697                    ))
698                })?,
699            config.block_size,
700        )?)
701    } else {
702        None
703    };
704
705    // Initialize compressor
706    let (compressor, compression_type) =
707        create_compressor_from_str(&config.compression, None, dictionary.clone())?;
708
709    // Initialize encryptor if requested
710    let (encryptor, enc_params): (Option<Box<dyn Encryptor>>, _) = if config.encrypt {
711        let password = config.password.clone().ok_or_else(|| {
712            Error::Io(std::io::Error::new(
713                std::io::ErrorKind::InvalidInput,
714                "Password required for encryption",
715            ))
716        })?;
717        let params = KeyDerivationParams::default();
718        let enc = AesGcmEncryptor::new(password.as_bytes(), &params.salt, params.iterations)?;
719        (Some(Box::new(enc) as Box<dyn Encryptor>), Some(params))
720    } else {
721        (None, None)
722    };
723
724    // Build the snapshot writer with optional encryption
725    let mut builder = SnapshotWriter::builder(&config.output, compressor, compression_type)
726        .block_size(config.block_size)
727        .variable_blocks(config.cdc_enabled);
728
729    if let (Some(enc), Some(params)) = (encryptor, enc_params) {
730        builder = builder.encryption(enc, params);
731    }
732
733    let mut writer = builder.build()?;
734
735    // Write dictionary to file
736    if let Some(d) = &dictionary {
737        writer.write_dictionary(d)?;
738    }
739
740    // Process disk stream
741    if let Some(ref path) = config.disk {
742        process_stream(
743            path.clone(),
744            true,
745            &mut writer,
746            &config,
747            progress_callback.as_ref(),
748        )?;
749    }
750
751    // Process memory stream
752    if let Some(ref path) = config.memory {
753        process_stream(
754            path.clone(),
755            false,
756            &mut writer,
757            &config,
758            progress_callback.as_ref(),
759        )?;
760    }
761
762    writer.finalize(None, None)?;
763
764    Ok(())
765}
766
767/// Trains a Zstd compression dictionary from stratified samples.
768///
769/// Dictionary training analyzes a representative sample of input blocks to build
770/// a shared dictionary that improves compression ratios for structured data
771/// (file systems, databases, logs) by capturing common patterns.
772///
773/// # Algorithm
774///
775/// 1. **Stratified Sampling**: Sample blocks evenly across the file
776///    - Compute step size: `file_size / target_samples`
777///    - Read one block at each sample point
778///    - Ensures coverage of different regions (boot sector, metadata, data)
779///
780/// 2. **Quality Filtering**: Exclude unsuitable blocks
781///    - Skip all-zero blocks (no compressible patterns)
782///    - Compute Shannon entropy (0-8 bits per byte)
783///    - Reject blocks with entropy > `ENTROPY_THRESHOLD` (6.0)
784///    - Rationale: High-entropy data (encrypted, random) doesn't benefit from dictionaries
785///
786/// 3. **Dictionary Training**: Feed filtered samples to Zstd
787///    - Uses Zstd's COVER algorithm (fast_cover variant)
788///    - Analyzes n-grams to find common subsequences
789///    - Outputs dictionary up to `DICT_TRAINING_SIZE` (110 KiB)
790///
791/// # Parameters
792///
793/// - `input_path`: Path to the input file to sample from
794/// - `block_size`: Size of each sample block in bytes
795///
796/// # Returns
797///
798/// - `Ok(Vec<u8>)`: Trained dictionary bytes (empty if training fails or no suitable samples)
799/// - `Err(Error)`: I/O error reading input file
800///
801/// # Performance
802///
803/// - **Sampling time**: ~100-500 ms (depends on file size and disk speed)
804/// - **Training time**: ~2-5 seconds for 4000 samples
805/// - **Memory usage**: ~256 MB (sample corpus in RAM)
806///
807/// # Compression Improvement
808///
809/// - **Typical**: 10-30% better ratio vs. no dictionary
810/// - **Best case**: 50%+ improvement for highly structured data (databases)
811/// - **Worst case**: No improvement or slight regression (already compressed data)
812///
813/// # Edge Cases
814///
815/// - **Empty file**: Returns empty dictionary with warning
816/// - **All high-entropy data**: Returns empty dictionary with warning
817/// - **Small files**: May not reach target sample count (trains on available data)
818///
819/// # Examples
820///
821/// Called internally by `pack_snapshot` when `train_dict` is enabled:
822///
823/// ```text
824/// let dict = train_dictionary(Path::new("disk.raw"), 65536)?;
825/// // dict: Vec<u8> containing the trained zstd dictionary
826/// ```
827fn train_dictionary(input_path: &Path, block_size: u32) -> Result<Vec<u8>> {
828    let mut f = File::open(input_path)?;
829    let file_len = f.metadata()?.len();
830
831    let mut samples = Vec::new();
832    let mut buffer = vec![0u8; block_size as usize];
833    let target_samples = DICT_TRAINING_SIZE;
834
835    let step = if file_len > 0 {
836        (file_len / target_samples as u64).max(block_size as u64)
837    } else {
838        0
839    };
840
841    let mut attempts = 0;
842    while samples.len() < target_samples && attempts < target_samples * 2 {
843        let offset = attempts as u64 * step;
844        if offset >= file_len {
845            break;
846        }
847
848        f.seek(SeekFrom::Start(offset))?;
849        let n = f.read(&mut buffer)?;
850        if n == 0 {
851            break;
852        }
853        let chunk = &buffer[..n];
854        let is_zeros = chunk.iter().all(|&b| b == 0);
855
856        if !is_zeros {
857            let entropy = calculate_entropy(chunk);
858            if entropy < ENTROPY_THRESHOLD {
859                samples.push(chunk.to_vec());
860            }
861        }
862        attempts += 1;
863    }
864
865    if samples.is_empty() {
866        tracing::warn!("Input seems to be empty or high entropy. Dictionary will be empty.");
867        Ok(Vec::new())
868    } else {
869        let dict_bytes = ZstdCompressor::train(&samples, DICT_TRAINING_SIZE)?;
870        tracing::info!("Dictionary trained: {} bytes", dict_bytes.len());
871        Ok(dict_bytes)
872    }
873}
874
875/// Processes a single input stream (disk or memory) via the [`SnapshotWriter`].
876fn process_stream<F>(
877    path: PathBuf,
878    is_disk: bool,
879    writer: &mut SnapshotWriter,
880    config: &PackConfig,
881    progress_callback: Option<&F>,
882) -> Result<()>
883where
884    F: Fn(u64, u64),
885{
886    let f = File::open(&path)?;
887    let len = f.metadata()?.len();
888
889    writer.begin_stream(is_disk, len);
890
891    // Choose chunker based on configuration
892    let chunker: Box<dyn Chunker> = if config.cdc_enabled {
893        let params = DedupeParams {
894            f: (config.avg_chunk as f64).log2() as u32,
895            m: config.min_chunk,
896            z: config.max_chunk,
897            w: 48,
898            v: 8,
899        };
900        Box::new(StreamChunker::new(f, params))
901    } else {
902        Box::new(FixedChunker::new(f, config.block_size as usize))
903    };
904
905    let mut logical_pos = 0u64;
906
907    for chunk_res in chunker {
908        let chunk = chunk_res?;
909        logical_pos += chunk.len() as u64;
910
911        writer.write_data_block(&chunk)?;
912
913        if let Some(callback) = progress_callback {
914            callback(logical_pos, len);
915        }
916    }
917
918    writer.end_stream()?;
919    Ok(())
920}
921
/// Unit tests for the entropy helper, the fixed-size chunker and `PackConfig`.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// An empty slice has zero entropy.
    #[test]
    fn test_calculate_entropy_empty() {
        assert_eq!(calculate_entropy(&[]), 0.0);
    }

    /// A buffer made of one repeated byte carries (almost) no information.
    #[test]
    fn test_calculate_entropy_uniform() {
        // All same byte - lowest entropy
        let data = vec![0x42; 1000];
        let entropy = calculate_entropy(&data);
        assert!(
            entropy < 0.01,
            "Entropy should be near 0.0 for uniform data"
        );
    }

    /// Two equally frequent byte values give roughly one bit per byte.
    #[test]
    fn test_calculate_entropy_binary() {
        // Two values - low entropy
        let mut data = vec![0u8; 500];
        data.extend(vec![1u8; 500]);
        let entropy = calculate_entropy(&data);
        assert!(
            entropy > 0.9 && entropy < 1.1,
            "Entropy should be ~1.0 for binary data"
        );
    }

    /// Every byte value occurring equally often gives entropy close to 8 bits.
    #[test]
    fn test_calculate_entropy_random() {
        // All 256 values - high entropy
        let data: Vec<u8> = (0..=255).cycle().take(256 * 4).collect();
        let entropy = calculate_entropy(&data);
        assert!(
            entropy > 7.5,
            "Entropy should be high for all byte values: got {}",
            entropy
        );
    }

    /// A single byte is a degenerate distribution with zero entropy.
    #[test]
    fn test_calculate_entropy_single_byte() {
        assert_eq!(calculate_entropy(&[42]), 0.0);
    }

    /// Two distinct bytes, one occurrence each, give roughly one bit per byte.
    #[test]
    fn test_calculate_entropy_two_different_bytes() {
        let data = vec![0, 255];
        let entropy = calculate_entropy(&data);
        assert!(entropy > 0.9 && entropy < 1.1, "Entropy should be ~1.0");
    }

    /// Input that is an exact multiple of the block size yields only full blocks.
    #[test]
    fn test_fixed_chunker_exact_blocks() {
        let data = vec![1, 2, 3, 4, 5, 6, 7, 8];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 4);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], vec![1, 2, 3, 4]);
        assert_eq!(chunks[1], vec![5, 6, 7, 8]);
    }

    /// A trailing remainder is emitted as a shorter final block.
    #[test]
    fn test_fixed_chunker_partial_last_block() {
        let data = vec![1, 2, 3, 4, 5];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 3);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0], vec![1, 2, 3]);
        assert_eq!(chunks[1], vec![4, 5]);
    }

    /// Empty input produces no chunks at all.
    #[test]
    fn test_fixed_chunker_empty_input() {
        let data = Vec::<u8>::new();
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 1024);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 0);
    }

    /// A block size of one yields one chunk per input byte.
    #[test]
    fn test_fixed_chunker_single_byte_blocks() {
        let data = vec![1, 2, 3];
        let cursor = Cursor::new(data);
        let chunker = FixedChunker::new(cursor, 1);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], vec![1]);
        assert_eq!(chunks[1], vec![2]);
        assert_eq!(chunks[2], vec![3]);
    }

    /// A block size larger than the input yields a single short chunk.
    #[test]
    fn test_fixed_chunker_large_block_size() {
        let data = vec![1, 2, 3, 4, 5];
        let cursor = Cursor::new(data.clone());
        let chunker = FixedChunker::new(cursor, 10000);

        let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], data);
    }

    /// Pins the documented defaults of `PackConfig`.
    #[test]
    fn test_pack_config_default() {
        let config = PackConfig::default();

        assert_eq!(config.compression, "lz4");
        assert!(!config.encrypt);
        assert_eq!(config.password, None);
        assert!(!config.train_dict);
        assert_eq!(config.block_size, 65536);
        assert!(!config.cdc_enabled);
        assert_eq!(config.min_chunk, 16384);
        assert_eq!(config.avg_chunk, 65536);
        assert_eq!(config.max_chunk, 131072);
    }

    /// A cloned configuration carries the same field values as its source.
    #[test]
    fn test_pack_config_clone() {
        let config1 = PackConfig {
            disk: Some(PathBuf::from("/dev/sda")),
            output: PathBuf::from("output.hxz"),
            compression: "zstd".to_string(),
            encrypt: true,
            password: Some("secret".to_string()),
            ..Default::default()
        };

        let config2 = config1.clone();

        assert_eq!(config2.disk, config1.disk);
        assert_eq!(config2.output, config1.output);
        assert_eq!(config2.compression, config1.compression);
        assert_eq!(config2.encrypt, config1.encrypt);
        assert_eq!(config2.password, config1.password);
    }

    /// The `Debug` output names the type and shows the default compression.
    #[test]
    fn test_pack_config_debug() {
        let config = PackConfig::default();
        let debug_str = format!("{:?}", config);

        assert!(debug_str.contains("PackConfig"));
        assert!(debug_str.contains("lz4"));
    }

    /// Checks both sides of the dictionary-sampling entropy filter.
    #[test]
    fn test_entropy_threshold_filtering() {
        // Test data with entropy below threshold (compressible)
        let low_entropy_data = vec![0u8; 1024];
        assert!(calculate_entropy(&low_entropy_data) < ENTROPY_THRESHOLD);

        // Test data with entropy above threshold. 7 is coprime to 256, so
        // `(i * 7) % 256` over 1024 consecutive `i` visits every byte value
        // exactly four times: a perfectly flat histogram (8 bits per byte).
        let high_entropy_data: Vec<u8> = (0..1024).map(|i| ((i * 7) % 256) as u8).collect();
        let entropy = calculate_entropy(&high_entropy_data);
        assert!((0.0..=8.0).contains(&entropy));
        assert!(
            entropy > ENTROPY_THRESHOLD,
            "Flat byte histogram should exceed the entropy threshold: got {}",
            entropy
        );
    }

    /// Entropy grows as the number of equally likely byte values grows.
    #[test]
    fn test_entropy_calculation_properties() {
        // Entropy should increase with more unique values
        let data1 = vec![0u8; 100];
        let data2 = [0u8, 1u8].repeat(50);
        let data3: Vec<u8> = (0..100).map(|i| (i % 10) as u8).collect();

        let entropy1 = calculate_entropy(&data1);
        let entropy2 = calculate_entropy(&data2);
        let entropy3 = calculate_entropy(&data3);

        assert!(
            entropy1 < entropy2,
            "More unique values should increase entropy"
        );
        assert!(
            entropy2 < entropy3,
            "Even more unique values should further increase entropy"
        );
    }

    /// For several block sizes: no bytes are lost, every non-final chunk is
    /// full, and the final chunk is no larger than the block size.
    #[test]
    fn test_fixed_chunker_with_different_sizes() {
        let data = vec![0u8; 10000];

        // Test with various chunk sizes
        for chunk_size in [64, 256, 1024, 4096, 65536] {
            let cursor = Cursor::new(data.clone());
            let chunker = FixedChunker::new(cursor, chunk_size);

            let chunks: Vec<_> = chunker.map(|r| r.unwrap()).collect();

            // Verify total data matches
            let total_len: usize = chunks.iter().map(|c| c.len()).sum();
            assert_eq!(
                total_len,
                data.len(),
                "Total chunked data should match original for chunk_size={}",
                chunk_size
            );

            // `split_last` separates the (possibly short) final chunk from the
            // rest without the `len() - 1` arithmetic that would underflow on
            // an empty chunk list.
            if let Some((last, full_chunks)) = chunks.split_last() {
                for chunk in full_chunks {
                    assert_eq!(
                        chunk.len(),
                        chunk_size,
                        "Non-final chunks should be exactly chunk_size"
                    );
                }
                assert!(
                    last.len() <= chunk_size,
                    "Final chunk should be <= chunk_size"
                );
            }
        }
    }
}
hexz_core/ops/pack.rs

hexz_core/ops/
pack.rs