hexz_cli/cmd/data/analyze.rs
//! Analyze archive structure and optimize CDC parameters using DCAM.
//!
//! This command performs offline analysis of disk images to scientifically
//! determine optimal content-defined chunking (CDC) parameters. It uses DCAM
//! (Deduplication Change-Estimation Analytical Model), a mathematical framework
//! that predicts deduplication effectiveness without performing full chunking,
//! enabling fast parameter optimization.
//!
//! # DCAM Algorithm Overview
//!
//! DCAM estimates deduplication efficiency by:
//!
//! 1. **Baseline Pass**: Chunks a sample with LBFS parameters (8 KiB average)
//! 2. **Change Probability**: Calculates `c` (fraction of unique data)
//! 3. **Greedy Search**: Tests parameter combinations to minimize deduped size
//! 4. **Prediction**: Uses analytical model to estimate full-file deduplication
//!
//! The key insight is that deduplication effectiveness depends on:
//! - `f`: Fingerprint bits (determines average chunk size = 2^f)
//! - `m`: Minimum chunk size (prevents pathologically small chunks)
//! - `c`: Change probability (intrinsic data characteristic)
//!
//! DCAM predicts the deduplication ratio without actually deduplicating the
//! entire file, making optimization practical for large disk images.
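//!
//! As an illustrative sketch (not a doctest), the quantities above map onto the
//! `dcam` items imported in this file as follows; the sample figures are taken
//! from the example output documented on [`run`]:
//!
//! ```ignore
//! use hexz_core::algo::dedup::dcam::{self, DedupeParams};
//!
//! let baseline = DedupeParams::lbfs_baseline(); // LBFS baseline: f = 13, m = 256
//! let avg_chunk = 1u64 << baseline.f;           // average chunk size = 2^f = 8 KiB
//! let max_chunk = 1u64 << (baseline.f + 3);     // max chunk size z = 2^(f+3) = 64 KiB
//!
//! // `c` comes from a baseline chunking pass: the fraction of unique data.
//! let (unique_bytes, total_bytes) = (384u64 << 20, 512u64 << 20);
//! let c = dcam::calculate_c(unique_bytes, total_bytes, &baseline);
//!
//! // The analytical model predicts a deduplication ratio for any parameter set
//! // without re-chunking the data.
//! let predicted_ratio = dcam::predict_ratio(total_bytes, c, &baseline);
//! ```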
//!
//! # Greedy Search Algorithm
//!
//! The `find_optimal_parameters` function implements a hill-climbing search
//! (a concrete Rust sketch of the per-neighbor step follows this section):
//!
//! **Algorithm:**
//! ```text
//! current = baseline parameters (f=13, m=256)
//! best_ratio = predict_ratio(current)
//! improved = true
//!
//! while improved:
//!     improved = false
//!     for each neighbor of current (f±1, m×2, m÷2):
//!         ratio = predict_ratio(neighbor)
//!         if ratio < best_ratio:
//!             current = neighbor
//!             best_ratio = ratio
//!             improved = true
//!     endfor
//! endwhile
//!
//! return current
//! ```
//!
//! **Search Space:**
//! - `f`: [8, 20] → average chunk size [256 B, 1 MiB]
//! - `m`: [64, 16384] → minimum chunk size [64 B, 16 KiB]
//! - Constraint: `m < z` where `z = 2^(f+3)` (max chunk size)
//!
//! **Termination:**
//! - Converges when no neighbor improves the ratio
//! - Maximum 100 iterations (typically converges in 5-15)
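//!
//! The per-neighbor evaluation step, sketched in Rust (this mirrors the
//! `find_optimal_parameters` implementation at the bottom of this file; `f`,
//! `m`, and `z` are `DedupeParams` fields):
//!
//! ```ignore
//! // Evaluate the f+1 neighbor of the current parameter set.
//! let mut candidate = current;
//! candidate.f = current.f + 1;
//! candidate.z = 1 << (candidate.f + 3); // keep the max chunk size tied to f
//!
//! let ratio = dcam::predict_ratio(file_size, c, &candidate);
//! if ratio < best_ratio {
//!     best_ratio = ratio;
//!     best_neighbor = candidate;
//!     improved = true;
//! }
//! ```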
//!
//! # Sampling Strategy
//!
//! To reduce analysis time, at most 512 MiB is sampled:
//! - For files larger than 513 MiB: skip the first 1 MiB (to avoid partition
//!   tables/headers), then read a 512 MiB sample
//! - For smaller files: read up to 512 MiB from the start (the entire file if
//!   it is 512 MiB or less)
//!
//! This sampling is sufficient because deduplication characteristics are
//! typically uniform across a disk image (same filesystem, similar files).
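//!
//! In code, the sampling decision reduces to the following sketch, using the
//! `ANALYSIS_SAMPLE_SIZE` constant defined below; the named `SKIP` constant and
//! the `file_len` variable stand in for the 1 MiB literal and file size used in
//! `run`:
//!
//! ```ignore
//! const SKIP: u64 = 1024 * 1024; // 1 MiB header/partition-table skip
//!
//! let read_len = file_len.min(ANALYSIS_SAMPLE_SIZE);
//! let offset = if file_len > ANALYSIS_SAMPLE_SIZE + SKIP { SKIP } else { 0 };
//! // Seek to `offset`, then read `read_len` bytes into the analysis buffer.
//! ```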
//!
//! # Use Cases
//!
//! - **Pre-Snapshot Optimization**: Determine optimal CDC parameters before packing
//! - **Workload Characterization**: Understand data redundancy patterns
//! - **Compression Tuning**: Compare fixed vs. variable block effectiveness
//! - **Research**: Validate DCAM model predictions on real-world data
//!
//! # Recommended Workflow
//!
//! ```bash
//! # 1. Analyze disk image
//! hexz analyze disk.img
//! # Output: Recommends f=14 (16 KiB avg), m=1024 (1 KiB min)
//!
//! # 2. Pack with recommended parameters
//! hexz pack --disk disk.img --output snapshot.st --cdc \
//!     --min-chunk 1024 --avg-chunk 16384 --max-chunk 32768
//!
//! # 3. Verify compression ratio
//! hexz info snapshot.st
//! # Output: Compression ratio should match DCAM prediction
//! ```
//!
//! # Performance Characteristics
//!
//! - **Sampling Time**: ~2-5 seconds for a 512 MiB sample
//! - **Baseline Pass**: Single CDC chunking at ~200 MB/s
//! - **Greedy Search**: Typically 5-15 iterations, each with at most four DCAM predictions (~1 μs each)
//! - **Total Time**: Typically 5-10 seconds for large disk images

use anyhow::{Context, Result};
use hexz_core::algo::dedup::cdc;
use hexz_core::algo::dedup::dcam::{self, DedupeParams};
use indicatif::HumanBytes;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::PathBuf;

/// Size of the sample read from disk for analysis (512 MiB).
///
/// **Architectural intent:** Balances analysis accuracy with speed. Larger
/// samples improve accuracy for heterogeneous data but increase runtime.
/// 512 MiB is sufficient to characterize most workload patterns while keeping
/// analysis under 10 seconds on modern hardware.
const ANALYSIS_SAMPLE_SIZE: u64 = 512 * 1024 * 1024;

/// Executes the analyze command to optimize CDC parameters using DCAM.
///
/// Reads a sample of the input file, performs a baseline CDC chunking pass to
/// measure deduplication characteristics, calculates the change probability `c`,
/// and then uses a greedy search algorithm to find optimal chunking parameters
/// (fingerprint bits `f` and minimum chunk size `m`). Displays the baseline,
/// recommended parameters, and predicted deduplication ratio.
///
/// # Arguments
///
/// * `input` - Path to the disk image file (raw, qcow2, or any binary file)
///
/// # Output Format
///
/// ```text
/// Analyzing disk.img using DCAM...
/// Reading 512.0 MB sample for analysis...
/// Running Baseline CDC Pass (Avg Chunk: 8KB)...
///  Processed: 512.0 MB
///  Unique: 384.0 MB (75.0%)
///  Chunks: 65536
///  Estimated Change Prob (c): 0.750000
///
/// Optimizing parameters using DCAM...
///
/// --- Optimization Results ---
/// Parameter                 | Baseline (LBFS) | Recommended
/// --------------------------|-----------------|----------------
/// Fingerprint Bits (f)      | 13              | 14
/// Min Chunk Size (m)        | 256             | 1024
/// Avg Chunk Size            | 8.0 KB          | 16.0 KB
///
/// --- Predictions ---
/// Predicted Ratio: 0.7234
/// Est. Final Size: 7.2 GB
/// Est. Savings: 2.8 GB
/// ```
///
/// # Algorithm Details
///
/// The function implements these steps:
///
/// 1. **File Reading**: Opens input file and reads up to 512 MiB sample
/// 2. **Header Skipping**: For large files, skips first 1 MiB to avoid partition metadata
/// 3. **Baseline Chunking**: Runs FastCDC with LBFS parameters (f=13, m=256)
/// 4. **Statistics Collection**: Counts total bytes, unique bytes, and chunks
/// 5. **Change Probability**: Calculates `c = unique_bytes / total_bytes`
/// 6. **Greedy Optimization**: Calls `find_optimal_parameters` to search parameter space
/// 7. **Prediction**: Uses DCAM model to estimate deduplication ratio
/// 8. **Result Display**: Prints comparison table and predicted savings
///
/// # Errors
///
/// Returns an error if:
/// - Input file cannot be opened (file not found, permission denied)
/// - File metadata cannot be read
/// - File read operations fail (I/O error, disk full)
/// - CDC analysis fails (invalid data, algorithm error)
///
/// Note: Empty files are handled gracefully with an early return.
///
/// # Examples
///
/// ```no_run
/// use std::path::PathBuf;
/// use hexz_cli::cmd::data::analyze;
///
/// // Analyze a disk image
/// analyze::run(PathBuf::from("vm-disk.img"))?;
///
/// // Analyze a large backup file
/// analyze::run(PathBuf::from("/backup/system.tar"))?;
/// # Ok::<(), anyhow::Error>(())
/// ```
pub fn run(input: PathBuf) -> Result<()> {
    println!("Analyzing {} using DCAM...", input.display());

    let mut file = File::open(&input).context("Failed to open input file")?;
    let file_len = file.metadata()?.len();

    if file_len == 0 {
        println!("File is empty.");
        return Ok(());
    }

    // 1. Read Sample
    let read_len = std::cmp::min(file_len, ANALYSIS_SAMPLE_SIZE);
    let mut buffer = vec![0u8; read_len as usize];

    println!("Reading {} sample for analysis...", HumanBytes(read_len));

    // If file is large, skip the first 1MB to avoid headers/partition tables
    if file_len > ANALYSIS_SAMPLE_SIZE + 1024 * 1024 {
        file.seek(SeekFrom::Start(1024 * 1024))?;
    }
    file.read_exact(&mut buffer)?;

    // 2. Baseline Pass (LBFS)
    let baseline = DedupeParams::lbfs_baseline();
    println!("Running Baseline CDC Pass (Avg Chunk: 8KB)...");

    let stats = cdc::analyze_stream(&buffer[..], &baseline)?;

    println!(" Processed: {}", HumanBytes(stats.total_bytes));
    println!(
        " Unique: {} ({:.1}%)",
        HumanBytes(stats.unique_bytes),
        (stats.unique_bytes as f64 / stats.total_bytes as f64) * 100.0
    );
    println!(" Chunks: {}", stats.chunk_count);

    // 3. Calculate c (Change Probability)
    let c = dcam::calculate_c(stats.unique_bytes, stats.total_bytes, &baseline);
    println!(" Estimated Change Prob (c): {:.6}", c);

    // 4. DCAM Greedy Search
    println!("\nOptimizing parameters using DCAM...");
    let best_params = find_optimal_parameters(stats.total_bytes, c);

    // 5. Report
    let predicted_ratio = dcam::predict_ratio(stats.total_bytes, c, &best_params);
    let predicted_size = (file_len as f64 * predicted_ratio) as u64;

    println!("\n--- Optimization Results ---");
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Parameter", "Baseline (LBFS)", "Recommended"
    );
    println!("{:-<26}|{:-<16}|{:-<16}", "", "", "");

    println!(
        "{:<25} | {:<15} | {:<15}",
        "Fingerprint Bits (f)", baseline.f, best_params.f
    );
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Min Chunk Size (m)", baseline.m, best_params.m
    );
    println!(
        "{:<25} | {:<15} | {:<15}",
        "Avg Chunk Size",
        HumanBytes(1 << baseline.f),
        HumanBytes(1 << best_params.f)
    );

    println!("\n--- Predictions ---");
    println!("Predicted Ratio: {:.4}", predicted_ratio);
    println!("Est. Final Size: {}", HumanBytes(predicted_size));
    println!(
        "Est. Savings: {}",
        HumanBytes(file_len.saturating_sub(predicted_size))
    );

    Ok(())
}

/// Greedy search algorithm to find optimal fingerprint bits and minimum chunk size.
///
/// Performs hill-climbing optimization in the (f, m) parameter space to minimize
/// the predicted deduplication ratio. Starts from the LBFS baseline and explores
/// neighbors (f±1, m×2, m÷2) until no improvement is found.
///
/// # Algorithm
///
/// The search uses a simple greedy strategy:
/// 1. Start with the current best parameters (initially the LBFS baseline)
/// 2. Evaluate all valid neighbors in the parameter space
/// 3. Move to the neighbor with the best (lowest) predicted ratio
/// 4. Repeat until no neighbor improves the ratio
///
/// # Search Space Constraints
///
/// - **Fingerprint bits (f)**: [8, 20]
///   - f=8 → 256 B average chunk (too small, high overhead)
///   - f=20 → 1 MiB average chunk (too large, poor deduplication)
///
/// - **Minimum chunk size (m)**: [64, 16384]
///   - m=64 → allows very small chunks (high metadata overhead)
///   - m=16384 → suppresses most chunk boundaries (degrades toward fixed-size chunking)
///
/// - **Constraint**: m < z where z = 2^(f+3) (max chunk size)
///
/// # Convergence
///
/// Typically converges in 5-15 iterations. The maximum iteration limit (100)
/// prevents infinite loops on pathological inputs.
///
/// # Arguments
///
/// * `file_size` - Size in bytes of the data being modeled (in `run`, the size
///   of the analyzed sample is passed here)
/// * `c` - Change probability (fraction of unique data after the baseline pass)
///
/// # Returns
///
/// Optimal `DedupeParams` with fields:
/// - `f`: Fingerprint bits (determines average chunk size)
/// - `m`: Minimum chunk size in bytes
/// - `z`: Maximum chunk size in bytes (derived from f)
///
/// # Performance
///
/// Each iteration evaluates at most four candidate parameter sets (one DCAM
/// prediction each, ~1 μs), so total search time is typically well under 1 ms
/// even for worst-case convergence.
fn find_optimal_parameters(file_size: u64, c: f64) -> DedupeParams {
    let mut current = DedupeParams::lbfs_baseline();
    let mut best_ratio = dcam::predict_ratio(file_size, c, &current);

    // Search bounds
    let min_f = 8; // 256B
    let max_f = 20; // 1MiB
    let min_m = 64;
    let max_m = 16 * 1024;

    let mut improved = true;
    let mut iterations = 0;

    while improved && iterations < 100 {
        improved = false;
        iterations += 1;

        let mut best_neighbor = current;

        // 1. Explore f neighbors (f-1, f+1)
        for f_cand in [current.f.saturating_sub(1), current.f + 1] {
            if f_cand < min_f || f_cand > max_f {
                continue;
            }

            let mut candidate = current;
            candidate.f = f_cand;
            candidate.z = 1 << (f_cand + 3);

            let ratio = dcam::predict_ratio(file_size, c, &candidate);
            if ratio < best_ratio {
                best_ratio = ratio;
                best_neighbor = candidate;
                improved = true;
            }
        }

        // 2. Explore m neighbors (m/2, m*2)
        for m_cand in [current.m / 2, current.m * 2] {
            if m_cand < min_m || m_cand > max_m || m_cand >= current.z {
                continue;
            }

            let mut candidate = current;
            candidate.m = m_cand;

            let ratio = dcam::predict_ratio(file_size, c, &candidate);
            if ratio < best_ratio {
                best_ratio = ratio;
                best_neighbor = candidate;
                improved = true;
            }
        }

        current = best_neighbor;
    }

    current
}