//! hexz_ops/predict.rs — predict command: analyze a raw data file and
//! estimate hexz packing savings from compression and deduplication.
1use std::collections::HashSet;
2use std::fs::File;
3use std::io::{Read, Seek, SeekFrom};
4use std::path::PathBuf;
5
6use hexz_common::{Error, Result};
7use serde::Serialize;
8
9use hexz_core::algo::compression::Compressor;
10use hexz_core::algo::compression::lz4::Lz4Compressor;
11use hexz_core::algo::compression::zstd::ZstdCompressor;
12use hexz_core::algo::dedup::cdc::analyze_stream;
13use hexz_core::algo::dedup::dcam::{DedupeParams, optimize_params};
14
15use crate::pack::calculate_entropy;
16use crate::write::is_zero_chunk;
17
/// Configuration for the predict command.
///
/// All fields have sensible defaults via [`Default`]; callers typically
/// override only `path`.
#[derive(Debug, Clone)]
pub struct PredictConfig {
    /// Path to the raw data file to analyze.
    pub path: PathBuf,
    /// Block size in bytes for fixed-chunk analysis (default: 64 KiB).
    pub block_size: usize,
    /// CDC minimum chunk size in bytes (auto-detected if None).
    pub min_chunk: Option<u32>,
    /// CDC average chunk size in bytes (auto-detected if None).
    /// Internally converted to a mask width via `log2(avg)`.
    pub avg_chunk: Option<u32>,
    /// CDC maximum chunk size in bytes (auto-detected if None).
    pub max_chunk: Option<u32>,
    /// Number of evenly-spaced blocks to sample for compression estimates
    /// (default: 4000).
    pub sample_count: usize,
    /// Max bytes to feed to `analyze_stream` for CDC analysis
    /// (default: 256 MiB).
    pub dedup_scan_limit: u64,
}
36
37impl Default for PredictConfig {
38    fn default() -> Self {
39        Self {
40            path: PathBuf::new(),
41            block_size: 65536,
42            min_chunk: None,
43            avg_chunk: None,
44            max_chunk: None,
45            sample_count: 4000,
46            dedup_scan_limit: 256 * 1024 * 1024,
47        }
48    }
49}
50
/// Results from analyzing a raw file for hexz packing potential.
///
/// Ratios are `compressed / raw` (or `unique / total`), so lower is better;
/// `*_savings_pct` fields are `(1 - ratio) * 100`.
#[derive(Debug, Serialize)]
pub struct PredictReport {
    /// Path to the analyzed file.
    pub file_path: String,
    /// Total file size in bytes.
    pub file_size: u64,
    /// Block size used for sampling.
    pub block_size: usize,
    /// Number of blocks sampled.
    pub blocks_sampled: usize,

    /// Percentage of sampled blocks that are all-zero.
    pub zero_block_pct: f64,
    /// Mean Shannon entropy of non-zero blocks (bits/byte, 0.0–8.0).
    pub mean_entropy: f64,
    /// Percentage of non-zero blocks with entropy > 6.0.
    pub high_entropy_pct: f64,

    /// LZ4 compression ratio (compressed / raw).
    pub lz4_ratio: f64,
    /// LZ4 savings as a percentage.
    pub lz4_savings_pct: f64,
    /// Zstd compression ratio (compressed / raw).
    pub zstd_ratio: f64,
    /// Zstd savings as a percentage.
    pub zstd_savings_pct: f64,
    /// Estimated file size after LZ4 compression.
    pub estimated_lz4_size: u64,
    /// Estimated file size after Zstd compression.
    pub estimated_zstd_size: u64,

    /// Fixed-block dedup ratio (unique sampled bytes / total sampled bytes).
    pub fixed_dedup_ratio: f64,
    /// Fixed-block dedup savings as a percentage.
    pub fixed_dedup_savings_pct: f64,

    /// Number of bytes scanned for CDC analysis.
    pub cdc_scan_bytes: u64,
    /// CDC minimum chunk size in bytes.
    pub cdc_min_chunk: u32,
    /// CDC average chunk size in bytes (2^f for the resolved mask width f).
    pub cdc_avg_chunk: u32,
    /// CDC maximum chunk size in bytes.
    pub cdc_max_chunk: u32,
    /// Total CDC chunks found.
    pub cdc_chunks_total: u64,
    /// Unique CDC chunks found.
    pub cdc_chunks_unique: u64,
    /// CDC dedup ratio (unique / total).
    pub cdc_dedup_ratio: f64,
    /// CDC dedup savings as a percentage.
    pub cdc_dedup_savings_pct: f64,

    /// Estimated packed size with LZ4 + fixed-block dedup.
    pub estimated_packed_size_lz4_fixed: u64,
    /// Estimated packed size with Zstd + CDC dedup.
    pub estimated_packed_size_zstd_cdc: u64,
    /// Best achievable savings as a percentage.
    pub overall_best_savings_pct: f64,
}
112
113/// Analyze a raw data file and estimate hexz packing savings.
114pub fn predict(config: &PredictConfig) -> Result<PredictReport> {
115    let mut f = File::open(&config.path)?;
116    let file_size = f.metadata()?.len();
117
118    if file_size == 0 {
119        return Err(Error::Format("File is empty".to_string()));
120    }
121
122    // Phase 1: Sample evenly-spaced blocks for compression + entropy estimates
123    let step = (file_size / config.sample_count as u64).max(config.block_size as u64);
124    let mut buf = vec![0u8; config.block_size];
125
126    let lz4 = Lz4Compressor::new();
127    let zstd = ZstdCompressor::new(3, None);
128
129    let mut blocks_sampled: usize = 0;
130    let mut zero_count: usize = 0;
131    let mut entropy_sum: f64 = 0.0;
132    let mut high_entropy_count: usize = 0;
133    let mut lz4_compressed_total: u64 = 0;
134    let mut zstd_compressed_total: u64 = 0;
135    let mut raw_sampled_total: u64 = 0;
136
137    // Fixed dedup tracking via blake3 hash set
138    let mut seen_hashes: HashSet<u64> = HashSet::new();
139    let mut unique_sampled_bytes: u64 = 0;
140
141    let mut attempt: u64 = 0;
142    while blocks_sampled < config.sample_count {
143        let offset = attempt * step;
144        if offset >= file_size {
145            break;
146        }
147
148        _ = f.seek(SeekFrom::Start(offset))?;
149        let n = f.read(&mut buf)?;
150        if n == 0 {
151            break;
152        }
153        let chunk = &buf[..n];
154        blocks_sampled += 1;
155        raw_sampled_total += n as u64;
156
157        // Dedup tracking for all blocks (including zeros)
158        let digest = *blake3::hash(chunk).as_bytes();
159        let hash = u64::from_le_bytes(
160            digest[..8]
161                .try_into()
162                .map_err(|_| Error::Format("hash slice conversion failed".to_string()))?,
163        );
164        if seen_hashes.insert(hash) {
165            unique_sampled_bytes += n as u64;
166        }
167
168        if is_zero_chunk(chunk) {
169            zero_count += 1;
170            attempt += 1;
171            continue;
172        }
173
174        let entropy = calculate_entropy(chunk);
175        entropy_sum += entropy;
176        if entropy > 6.0 {
177            high_entropy_count += 1;
178        }
179
180        // Compression measurement
181        if let Ok(compressed) = lz4.compress(chunk) {
182            lz4_compressed_total += compressed.len() as u64;
183        } else {
184            lz4_compressed_total += n as u64;
185        }
186        if let Ok(compressed) = zstd.compress(chunk) {
187            zstd_compressed_total += compressed.len() as u64;
188        } else {
189            zstd_compressed_total += n as u64;
190        }
191
192        attempt += 1;
193    }
194
195    let non_zero_count = blocks_sampled - zero_count;
196
197    let zero_block_pct = if blocks_sampled > 0 {
198        zero_count as f64 / blocks_sampled as f64
199    } else {
200        0.0
201    };
202
203    let mean_entropy = if non_zero_count > 0 {
204        entropy_sum / non_zero_count as f64
205    } else {
206        0.0
207    };
208
209    let high_entropy_pct = if non_zero_count > 0 {
210        high_entropy_count as f64 / non_zero_count as f64
211    } else {
212        0.0
213    };
214
215    // Account for zero blocks in compression estimates
216    let zero_compressed_approx = zero_count as u64 * 20;
217    let lz4_total_with_zeros = lz4_compressed_total + zero_compressed_approx;
218    let zstd_total_with_zeros = zstd_compressed_total + zero_compressed_approx;
219
220    let lz4_ratio = if raw_sampled_total > 0 {
221        lz4_total_with_zeros as f64 / raw_sampled_total as f64
222    } else {
223        1.0
224    };
225    let zstd_ratio = if raw_sampled_total > 0 {
226        zstd_total_with_zeros as f64 / raw_sampled_total as f64
227    } else {
228        1.0
229    };
230
231    let lz4_savings_pct = (1.0 - lz4_ratio) * 100.0;
232    let zstd_savings_pct = (1.0 - zstd_ratio) * 100.0;
233    let estimated_lz4_size = (file_size as f64 * lz4_ratio) as u64;
234    let estimated_zstd_size = (file_size as f64 * zstd_ratio) as u64;
235
236    // Fixed dedup ratio
237    let fixed_dedup_ratio = if raw_sampled_total > 0 {
238        unique_sampled_bytes as f64 / raw_sampled_total as f64
239    } else {
240        1.0
241    };
242    let fixed_dedup_savings_pct = (1.0 - fixed_dedup_ratio) * 100.0;
243
244    // Phase 2: CDC analysis — auto-detect params via DCAM if not specified
245    _ = f.seek(SeekFrom::Start(0))?;
246
247    // First pass: analyze with LBFS baseline to get change rate
248    let baseline = DedupeParams::lbfs_baseline();
249    let baseline_stats = analyze_stream(f.try_clone()?, &baseline)?;
250
251    let cdc_params = if let (Some(min), Some(avg), Some(max)) =
252        (config.min_chunk, config.avg_chunk, config.max_chunk)
253    {
254        // User provided all three — use them directly
255        let f_bits = (avg as f64).log2().round() as u32;
256        DedupeParams {
257            f: f_bits,
258            m: min,
259            z: max,
260            w: 48,
261            v: 52,
262        }
263    } else {
264        // DCAM auto-detection
265        let optimized = optimize_params(file_size, baseline_stats.unique_bytes, &baseline, false);
266        let mut params = optimized.params;
267        if let Some(min) = config.min_chunk {
268            params.m = min;
269        }
270        if let Some(avg) = config.avg_chunk {
271            params.f = (avg as f64).log2().round() as u32;
272        }
273        if let Some(max) = config.max_chunk {
274            params.z = max;
275        }
276        params
277    };
278
279    // Second pass: analyze with resolved params
280    let scan_limit = config.dedup_scan_limit.min(file_size);
281    _ = f.seek(SeekFrom::Start(0))?;
282    let reader = f.by_ref().take(scan_limit);
283
284    let cdc_stats = analyze_stream(reader, &cdc_params)?;
285
286    // Direct measured dedup ratio from the scan
287    let cdc_dedup_ratio = if cdc_stats.chunk_count > 0 && cdc_stats.unique_chunk_count > 0 {
288        cdc_stats.unique_bytes as f64
289            / (cdc_stats.unique_bytes as f64 * cdc_stats.chunk_count as f64
290                / cdc_stats.unique_chunk_count as f64)
291    } else {
292        1.0
293    };
294    let cdc_dedup_savings_pct = (1.0 - cdc_dedup_ratio) * 100.0;
295
296    // Combined estimates
297    let estimated_packed_size_lz4_fixed = (file_size as f64 * lz4_ratio * fixed_dedup_ratio) as u64;
298    let estimated_packed_size_zstd_cdc = (file_size as f64 * zstd_ratio * cdc_dedup_ratio) as u64;
299
300    let best_packed = estimated_packed_size_lz4_fixed.min(estimated_packed_size_zstd_cdc);
301    let overall_best_savings_pct = (1.0 - best_packed as f64 / file_size as f64) * 100.0;
302
303    Ok(PredictReport {
304        file_path: config.path.display().to_string(),
305        file_size,
306        block_size: config.block_size,
307        blocks_sampled,
308
309        zero_block_pct,
310        mean_entropy,
311        high_entropy_pct,
312
313        lz4_ratio,
314        lz4_savings_pct,
315        zstd_ratio,
316        zstd_savings_pct,
317        estimated_lz4_size,
318        estimated_zstd_size,
319
320        fixed_dedup_ratio,
321        fixed_dedup_savings_pct,
322
323        cdc_scan_bytes: scan_limit,
324        cdc_min_chunk: cdc_params.m,
325        cdc_avg_chunk: 1 << cdc_params.f,
326        cdc_max_chunk: cdc_params.z,
327        cdc_chunks_total: cdc_stats.chunk_count,
328        cdc_chunks_unique: cdc_stats.unique_chunk_count,
329        cdc_dedup_ratio,
330        cdc_dedup_savings_pct,
331
332        estimated_packed_size_lz4_fixed,
333        estimated_packed_size_zstd_cdc,
334        overall_best_savings_pct,
335    })
336}