hexz_cli/cmd/data/analyze.rs
//! Analyze archive structure and optimize CDC parameters using DCAM.
//!
//! This command performs offline analysis of disk images to scientifically
//! determine optimal content-defined chunking (CDC) parameters. It uses DCAM
//! (Deduplication Change-Estimation Analytical Model), a mathematical framework
//! that predicts deduplication effectiveness without performing full chunking,
//! enabling fast parameter optimization.
//!
//! # DCAM Algorithm Overview
//!
//! DCAM estimates deduplication efficiency by:
//!
//! 1. **Baseline Pass**: Chunks a sample with LBFS parameters (8 KiB average)
//! 2. **Change Probability**: Calculates `c` (fraction of unique data)
//! 3. **Greedy Search**: Tests parameter combinations to minimize deduped size
//! 4. **Prediction**: Uses analytical model to estimate full-file deduplication
//!
//! The key insight is that deduplication effectiveness depends on:
//! - `f`: Fingerprint bits (determines average chunk size = 2^f)
//! - `m`: Minimum chunk size (prevents pathologically small chunks)
//! - `c`: Change probability (intrinsic data characteristic)
//!
//! DCAM predicts the deduplication ratio without actually deduplicating the
//! entire file, making optimization practical for large disk images.
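//!
//! As an illustrative sketch (not a doctest), the quantities above map onto the
//! `dcam` items imported in this file as follows; the sample figures are taken
//! from the example output documented on [`run`]:
//!
//! ```ignore
//! use hexz_core::algo::dedup::dcam::{self, DedupeParams};
//!
//! let baseline = DedupeParams::lbfs_baseline(); // LBFS baseline: f = 13, m = 256
//! let avg_chunk = 1u64 << baseline.f;           // average chunk size = 2^f = 8 KiB
//! let max_chunk = 1u64 << (baseline.f + 3);     // max chunk size z = 2^(f+3) = 64 KiB
//!
//! // `c` comes from a baseline chunking pass: the fraction of unique data.
//! let (unique_bytes, total_bytes) = (384u64 << 20, 512u64 << 20);
//! let c = dcam::calculate_c(unique_bytes, total_bytes, &baseline);
//!
//! // The analytical model predicts a deduplication ratio for any parameter set
//! // without re-chunking the data.
//! let predicted_ratio = dcam::predict_ratio(total_bytes, c, &baseline);
//! ```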
//!
//! # Greedy Search Algorithm
//!
//! The `find_optimal_parameters` function implements a hill-climbing search
//! (a concrete Rust sketch of the per-neighbor step follows this section):
//!
//! **Algorithm:**
//! ```text
//! current = baseline parameters (f=13, m=256)
//! best_ratio = predict_ratio(current)
//! improved = true
//!
//! while improved:
//!     improved = false
//!     for each neighbor of current (f±1, m×2, m÷2):
//!         ratio = predict_ratio(neighbor)
//!         if ratio < best_ratio:
//!             current = neighbor
//!             best_ratio = ratio
//!             improved = true
//!     endfor
//! endwhile
//!
//! return current
//! ```
//!
//! **Search Space:**
//! - `f`: [8, 20] → average chunk size [256 B, 1 MiB]
//! - `m`: [64, 16384] → minimum chunk size [64 B, 16 KiB]
//! - Constraint: `m < z` where `z = 2^(f+3)` (max chunk size)
//!
//! **Termination:**
//! - Converges when no neighbor improves the ratio
//! - Maximum 100 iterations (typically converges in 5-15)
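//!
//! The per-neighbor evaluation step, sketched in Rust (this mirrors the
//! `find_optimal_parameters` implementation at the bottom of this file; `f`,
//! `m`, and `z` are `DedupeParams` fields):
//!
//! ```ignore
//! // Evaluate the f+1 neighbor of the current parameter set.
//! let mut candidate = current;
//! candidate.f = current.f + 1;
//! candidate.z = 1 << (candidate.f + 3); // keep the max chunk size tied to f
//!
//! let ratio = dcam::predict_ratio(file_size, c, &candidate);
//! if ratio < best_ratio {
//!     best_ratio = ratio;
//!     best_neighbor = candidate;
//!     improved = true;
//! }
//! ```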
//!
//! # Sampling Strategy
//!
//! To reduce analysis time, at most 512 MiB is sampled:
//! - For files larger than 513 MiB: skip the first 1 MiB (to avoid partition
//!   tables/headers), then read a 512 MiB sample
//! - For smaller files: read up to 512 MiB from the start (the entire file if
//!   it is 512 MiB or less)
//!
//! This sampling is sufficient because deduplication characteristics are
//! typically uniform across a disk image (same filesystem, similar files).
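//!
//! In code, the sampling decision reduces to the following sketch, using the
//! `ANALYSIS_SAMPLE_SIZE` constant defined below; the named `SKIP` constant and
//! the `file_len` variable stand in for the 1 MiB literal and file size used in
//! `run`:
//!
//! ```ignore
//! const SKIP: u64 = 1024 * 1024; // 1 MiB header/partition-table skip
//!
//! let read_len = file_len.min(ANALYSIS_SAMPLE_SIZE);
//! let offset = if file_len > ANALYSIS_SAMPLE_SIZE + SKIP { SKIP } else { 0 };
//! // Seek to `offset`, then read `read_len` bytes into the analysis buffer.
//! ```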
//!
//! # Use Cases
//!
//! - **Pre-Snapshot Optimization**: Determine optimal CDC parameters before packing
//! - **Workload Characterization**: Understand data redundancy patterns
//! - **Compression Tuning**: Compare fixed vs. variable block effectiveness
//! - **Research**: Validate DCAM model predictions on real-world data
//!
//! # Recommended Workflow
//!
//! ```bash
//! # 1. Analyze disk image
//! hexz analyze disk.img
//! # Output: Recommends f=14 (16 KiB avg), m=1024 (1 KiB min)
//!
//! # 2. Pack with recommended parameters
//! hexz pack --disk disk.img --output snapshot.st --cdc \
//!     --min-chunk 1024 --avg-chunk 16384 --max-chunk 32768
//!
//! # 3. Verify compression ratio
//! hexz info snapshot.st
//! # Output: Compression ratio should match DCAM prediction
//! ```
//!
//! # Performance Characteristics
//!
//! - **Sampling Time**: ~2-5 seconds for a 512 MiB sample
//! - **Baseline Pass**: Single CDC chunking at ~200 MB/s
//! - **Greedy Search**: Typically 5-15 iterations, each with at most four DCAM predictions (~1 μs each)
//! - **Total Time**: Typically 5-10 seconds for large disk images

use anyhow::{Context, Result};
use hexz_core::algo::dedup::cdc;
use hexz_core::algo::dedup::dcam::{self, DedupeParams};
use indicatif::HumanBytes;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::PathBuf;

/// Size of the sample read from disk for analysis (512 MiB).
///
/// **Architectural intent:** Balances analysis accuracy with speed. Larger
/// samples improve accuracy for heterogeneous data but increase runtime.
/// 512 MiB is sufficient to characterize most workload patterns while keeping
/// analysis under 10 seconds on modern hardware.
const ANALYSIS_SAMPLE_SIZE: u64 = 512 * 1024 * 1024;

/// Executes the analyze command to optimize CDC parameters using DCAM.
///
/// Reads a sample of the input file, performs a baseline CDC chunking pass to
/// measure deduplication characteristics, calculates the change probability `c`,
/// and then uses a greedy search algorithm to find optimal chunking parameters
/// (fingerprint bits `f` and minimum chunk size `m`). Displays the baseline,
/// recommended parameters, and predicted deduplication ratio.
///
/// # Arguments
///
/// * `input` - Path to the disk image file (raw, qcow2, or any binary file)
///
/// # Output Format
///
/// ```text
/// Analyzing disk.img using DCAM...
/// Reading 512.0 MB sample for analysis...
/// Running Baseline CDC Pass (Avg Chunk: 8KB)...
///  Processed: 512.0 MB
///  Unique: 384.0 MB (75.0%)
///  Chunks: 65536
///  Estimated Change Prob (c): 0.750000
///
/// Optimizing parameters using DCAM...
///
/// --- Optimization Results ---
/// Parameter                 | Baseline (LBFS) | Recommended
/// --------------------------|-----------------|----------------
/// Fingerprint Bits (f)      | 13              | 14
/// Min Chunk Size (m)        | 256             | 1024
/// Avg Chunk Size            | 8.0 KB          | 16.0 KB
///
/// --- Predictions ---
/// Predicted Ratio: 0.7234
/// Est. Final Size: 7.2 GB
/// Est. Savings: 2.8 GB
/// ```
///
/// # Algorithm Details
///
/// The function implements these steps:
///
/// 1. **File Reading**: Opens input file and reads up to 512 MiB sample
/// 2. **Header Skipping**: For large files, skips first 1 MiB to avoid partition metadata
/// 3. **Baseline Chunking**: Runs FastCDC with LBFS parameters (f=13, m=256)
/// 4. **Statistics Collection**: Counts total bytes, unique bytes, and chunks
/// 5. **Change Probability**: Calculates `c = unique_bytes / total_bytes`
/// 6. **Greedy Optimization**: Calls `find_optimal_parameters` to search parameter space
/// 7. **Prediction**: Uses DCAM model to estimate deduplication ratio
/// 8. **Result Display**: Prints comparison table and predicted savings
///
/// # Errors
///
/// Returns an error if:
/// - Input file cannot be opened (file not found, permission denied)
/// - File metadata cannot be read
/// - File read operations fail (I/O error, disk full)
/// - CDC analysis fails (invalid data, algorithm error)
///
/// Note: Empty files are handled gracefully with an early return.
///
/// # Examples
///
/// ```no_run
/// use std::path::PathBuf;
/// use hexz_cli::cmd::data::analyze;
///
/// // Analyze a disk image
/// analyze::run(PathBuf::from("vm-disk.img"))?;
///
/// // Analyze a large backup file
/// analyze::run(PathBuf::from("/backup/system.tar"))?;
/// # Ok::<(), anyhow::Error>(())
/// ```
pub fn run(input: PathBuf) -> Result<()> {
    println!("Analyzing {} using DCAM...", input.display());

    let mut file = File::open(&input).context("Failed to open input file")?;
    let file_len = file.metadata()?.len();

    if file_len == 0 {
        println!("File is empty.");
        return Ok(());
    }

    // 1. Read Sample
    let read_len = std::cmp::min(file_len, ANALYSIS_SAMPLE_SIZE);
    let mut buffer = vec![0u8; read_len as usize];

    println!("Reading {} sample for analysis...", HumanBytes(read_len));

    // If file is large, skip the first 1MB to avoid headers/partition tables
    if file_len > ANALYSIS_SAMPLE_SIZE + 1024 * 1024 {
        file.seek(SeekFrom::Start(1024 * 1024))?;
    }
    file.read_exact(&mut buffer)?;

    // 2. Baseline Pass (LBFS)
    let baseline = DedupeParams::lbfs_baseline();
    println!("Running Baseline CDC Pass (Avg Chunk: 8KB)...");

    let stats = cdc::analyze_stream(&buffer[..], &baseline)?;

    println!(" Processed: {}", HumanBytes(stats.total_bytes));
    println!(
        " Unique: {} ({:.1}%)",
        HumanBytes(stats.unique_bytes),
        (stats.unique_bytes as f64 / stats.total_bytes as f64) * 100.0
    );
    println!(" Chunks: {}", stats.chunk_count);

    // 3. Calculate c (Change Probability)
    let c = dcam::calculate_c(stats.unique_bytes, stats.total_bytes, &baseline);
    println!(" Estimated Change Prob (c): {:.6}", c);

    // 4. DCAM Greedy Search
    println!("\nOptimizing parameters using DCAM...");
    let best_params = find_optimal_parameters(stats.total_bytes, c);

    // 5. Report
    let predicted_ratio = dcam::predict_ratio(stats.total_bytes, c, &best_params);
    let predicted_size = (file_len as f64 * predicted_ratio) as u64;

    println!("\n--- Optimization Results ---");
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Parameter", "Baseline (LBFS)", "Recommended"
    );
    println!("{:-<26}|{:-<16}|{:-<16}", "", "", "");

    println!(
        "{:<25} | {:<15} | {:<15}",
        "Fingerprint Bits (f)", baseline.f, best_params.f
    );
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Min Chunk Size (m)", baseline.m, best_params.m
    );
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Avg Chunk Size",
        HumanBytes(1 << baseline.f),
        HumanBytes(1 << best_params.f)
    );

    println!("\n--- Predictions ---");
    println!("Predicted Ratio: {:.4}", predicted_ratio);
    println!("Est. Final Size: {}", HumanBytes(predicted_size));
    println!(
        "Est. Savings: {}",
        HumanBytes(file_len.saturating_sub(predicted_size))
    );

    Ok(())
}

/// Greedy search algorithm to find optimal fingerprint bits and minimum chunk size.
///
/// Performs hill-climbing optimization in the (f, m) parameter space to minimize
/// the predicted deduplication ratio. Starts from the LBFS baseline and explores
/// neighbors (f±1, m×2, m÷2) until no improvement is found.
///
/// # Algorithm
///
/// The search uses a simple greedy strategy:
/// 1. Start with the current best parameters (initially the LBFS baseline)
/// 2. Evaluate all valid neighbors in the parameter space
/// 3. Move to the neighbor with the best (lowest) predicted ratio
/// 4. Repeat until no neighbor improves the ratio
///
/// # Search Space Constraints
///
/// - **Fingerprint bits (f)**: [8, 20]
///   - f=8 → 256 B average chunk (too small, high overhead)
///   - f=20 → 1 MiB average chunk (too large, poor deduplication)
///
/// - **Minimum chunk size (m)**: [64, 16384]
///   - m=64 → allows very small chunks (high metadata overhead)
///   - m=16384 → suppresses most chunk boundaries (degrades toward fixed-size chunking)
///
/// - **Constraint**: m < z where z = 2^(f+3) (max chunk size)
///
/// # Convergence
///
/// Typically converges in 5-15 iterations. The maximum iteration limit (100)
/// prevents infinite loops on pathological inputs.
///
/// # Arguments
///
/// * `file_size` - Size in bytes of the data being modeled (in `run`, the size
///   of the analyzed sample is passed here)
/// * `c` - Change probability (fraction of unique data after the baseline pass)
///
/// # Returns
///
/// Optimal `DedupeParams` with fields:
/// - `f`: Fingerprint bits (determines average chunk size)
/// - `m`: Minimum chunk size in bytes
/// - `z`: Maximum chunk size in bytes (derived from f)
///
/// # Performance
///
/// Each iteration evaluates at most four candidate parameter sets (one DCAM
/// prediction each, ~1 μs), so total search time is typically well under 1 ms
/// even for worst-case convergence.
fn find_optimal_parameters(file_size: u64, c: f64) -> DedupeParams {
    let mut current = DedupeParams::lbfs_baseline();
    let mut best_ratio = dcam::predict_ratio(file_size, c, &current);

    // Search bounds
    let min_f = 8; // 256B
    let max_f = 20; // 1MiB
    let min_m = 64;
    let max_m = 16 * 1024;

    let mut improved = true;
    let mut iterations = 0;

    while improved && iterations < 100 {
        improved = false;
        iterations += 1;

        let mut best_neighbor = current;

        // 1. Explore f neighbors (f-1, f+1)
        for f_cand in [current.f.saturating_sub(1), current.f + 1] {
            if f_cand < min_f || f_cand > max_f {
                continue;
            }

            let mut candidate = current;
            candidate.f = f_cand;
            candidate.z = 1 << (f_cand + 3);

            let ratio = dcam::predict_ratio(file_size, c, &candidate);
            if ratio < best_ratio {
                best_ratio = ratio;
                best_neighbor = candidate;
                improved = true;
            }
        }

        // 2. Explore m neighbors (m/2, m*2)
        for m_cand in [current.m / 2, current.m * 2] {
            if m_cand < min_m || m_cand > max_m || m_cand >= current.z {
                continue;
            }

            let mut candidate = current;
            candidate.m = m_cand;

            let ratio = dcam::predict_ratio(file_size, c, &candidate);
            if ratio < best_ratio {
                best_ratio = ratio;
                best_neighbor = candidate;
                improved = true;
            }
        }

        current = best_neighbor;
    }

    current
}