1use std::collections::HashSet;
2use std::fs::File;
3use std::io::{Read, Seek, SeekFrom};
4use std::path::PathBuf;
5
6use hexz_common::{Error, Result};
7use serde::Serialize;
8
9use hexz_core::algo::compression::Compressor;
10use hexz_core::algo::compression::lz4::Lz4Compressor;
11use hexz_core::algo::compression::zstd::ZstdCompressor;
12use hexz_core::algo::dedup::cdc::analyze_stream;
13use hexz_core::algo::dedup::dcam::{DedupeParams, optimize_params};
14
15use crate::pack::calculate_entropy;
16use crate::write::is_zero_chunk;
17
/// Configuration for [`predict`]: which file to sample plus the sampling
/// and chunking knobs. Obtain sensible values via [`Default`] and override
/// individual fields as needed.
#[derive(Debug, Clone)]
pub struct PredictConfig {
    /// Path of the file to analyze.
    pub path: PathBuf,
    /// Size in bytes of each sampled block; also the unit used for the
    /// fixed-block dedup estimate.
    pub block_size: usize,
    /// Optional override for the minimum CDC chunk size, in bytes.
    pub min_chunk: Option<u32>,
    /// Optional override for the target (average) CDC chunk size, in bytes;
    /// rounded to the nearest power of two when applied.
    pub avg_chunk: Option<u32>,
    /// Optional override for the maximum CDC chunk size, in bytes.
    pub max_chunk: Option<u32>,
    /// Maximum number of blocks to sample from the file.
    pub sample_count: usize,
    /// Upper bound, in bytes, on the CDC measurement scan.
    pub dedup_scan_limit: u64,
}
36
37impl Default for PredictConfig {
38 fn default() -> Self {
39 Self {
40 path: PathBuf::new(),
41 block_size: 65536,
42 min_chunk: None,
43 avg_chunk: None,
44 max_chunk: None,
45 sample_count: 4000,
46 dedup_scan_limit: 256 * 1024 * 1024,
47 }
48 }
49}
50
/// Serializable summary produced by [`predict`].
///
/// Ratios are expressed as `estimated_output / raw_input` (lower is better);
/// `*_savings_pct` fields are true percentages computed as `(1 - ratio) * 100`.
/// NOTE(review): despite their `_pct` suffix, `zero_block_pct` and
/// `high_entropy_pct` are fractions in `[0, 1]`, not percentages — confirm
/// whether consumers expect that before renaming or rescaling.
#[derive(Debug, Serialize)]
pub struct PredictReport {
    /// Display form of the analyzed path.
    pub file_path: String,
    /// Total file size in bytes.
    pub file_size: u64,
    /// Block size used for sampling.
    pub block_size: usize,
    /// Number of blocks actually read.
    pub blocks_sampled: usize,

    /// Fraction (0.0–1.0) of sampled blocks that were entirely zero.
    pub zero_block_pct: f64,
    /// Mean entropy (per `calculate_entropy`) over the non-zero blocks.
    pub mean_entropy: f64,
    /// Fraction (0.0–1.0) of non-zero blocks whose entropy exceeded 6.0.
    pub high_entropy_pct: f64,

    /// LZ4 compressed/raw ratio over the sample (zero blocks approximated).
    pub lz4_ratio: f64,
    /// `(1 - lz4_ratio) * 100`.
    pub lz4_savings_pct: f64,
    /// zstd (level 3) compressed/raw ratio over the sample.
    pub zstd_ratio: f64,
    /// `(1 - zstd_ratio) * 100`.
    pub zstd_savings_pct: f64,
    /// `file_size * lz4_ratio`, truncated to whole bytes.
    pub estimated_lz4_size: u64,
    /// `file_size * zstd_ratio`, truncated to whole bytes.
    pub estimated_zstd_size: u64,

    /// Unique/raw byte ratio of fixed-size sampled blocks (hash-based).
    pub fixed_dedup_ratio: f64,
    /// `(1 - fixed_dedup_ratio) * 100`.
    pub fixed_dedup_savings_pct: f64,

    /// Bytes covered by the CDC measurement scan (scan limit capped at file size).
    pub cdc_scan_bytes: u64,
    /// Minimum chunk size used by the CDC pass.
    pub cdc_min_chunk: u32,
    /// Target (average) chunk size used by the CDC pass, i.e. `2^f`.
    pub cdc_avg_chunk: u32,
    /// Maximum chunk size used by the CDC pass.
    pub cdc_max_chunk: u32,
    /// Total number of chunks the CDC pass produced.
    pub cdc_chunks_total: u64,
    /// Number of distinct chunks among them.
    pub cdc_chunks_unique: u64,
    /// Unique-chunk fraction of the CDC pass (unique / total).
    pub cdc_dedup_ratio: f64,
    /// `(1 - cdc_dedup_ratio) * 100`.
    pub cdc_dedup_savings_pct: f64,

    /// `file_size * lz4_ratio * fixed_dedup_ratio`, truncated.
    pub estimated_packed_size_lz4_fixed: u64,
    /// `file_size * zstd_ratio * cdc_dedup_ratio`, truncated.
    pub estimated_packed_size_zstd_cdc: u64,
    /// Savings percentage of whichever packed estimate is smaller.
    pub overall_best_savings_pct: f64,
}
112
113pub fn predict(config: &PredictConfig) -> Result<PredictReport> {
115 let mut f = File::open(&config.path)?;
116 let file_size = f.metadata()?.len();
117
118 if file_size == 0 {
119 return Err(Error::Format("File is empty".to_string()));
120 }
121
122 let step = (file_size / config.sample_count as u64).max(config.block_size as u64);
124 let mut buf = vec![0u8; config.block_size];
125
126 let lz4 = Lz4Compressor::new();
127 let zstd = ZstdCompressor::new(3, None);
128
129 let mut blocks_sampled: usize = 0;
130 let mut zero_count: usize = 0;
131 let mut entropy_sum: f64 = 0.0;
132 let mut high_entropy_count: usize = 0;
133 let mut lz4_compressed_total: u64 = 0;
134 let mut zstd_compressed_total: u64 = 0;
135 let mut raw_sampled_total: u64 = 0;
136
137 let mut seen_hashes: HashSet<u64> = HashSet::new();
139 let mut unique_sampled_bytes: u64 = 0;
140
141 let mut attempt: u64 = 0;
142 while blocks_sampled < config.sample_count {
143 let offset = attempt * step;
144 if offset >= file_size {
145 break;
146 }
147
148 _ = f.seek(SeekFrom::Start(offset))?;
149 let n = f.read(&mut buf)?;
150 if n == 0 {
151 break;
152 }
153 let chunk = &buf[..n];
154 blocks_sampled += 1;
155 raw_sampled_total += n as u64;
156
157 let digest = *blake3::hash(chunk).as_bytes();
159 let hash = u64::from_le_bytes(
160 digest[..8]
161 .try_into()
162 .map_err(|_| Error::Format("hash slice conversion failed".to_string()))?,
163 );
164 if seen_hashes.insert(hash) {
165 unique_sampled_bytes += n as u64;
166 }
167
168 if is_zero_chunk(chunk) {
169 zero_count += 1;
170 attempt += 1;
171 continue;
172 }
173
174 let entropy = calculate_entropy(chunk);
175 entropy_sum += entropy;
176 if entropy > 6.0 {
177 high_entropy_count += 1;
178 }
179
180 if let Ok(compressed) = lz4.compress(chunk) {
182 lz4_compressed_total += compressed.len() as u64;
183 } else {
184 lz4_compressed_total += n as u64;
185 }
186 if let Ok(compressed) = zstd.compress(chunk) {
187 zstd_compressed_total += compressed.len() as u64;
188 } else {
189 zstd_compressed_total += n as u64;
190 }
191
192 attempt += 1;
193 }
194
195 let non_zero_count = blocks_sampled - zero_count;
196
197 let zero_block_pct = if blocks_sampled > 0 {
198 zero_count as f64 / blocks_sampled as f64
199 } else {
200 0.0
201 };
202
203 let mean_entropy = if non_zero_count > 0 {
204 entropy_sum / non_zero_count as f64
205 } else {
206 0.0
207 };
208
209 let high_entropy_pct = if non_zero_count > 0 {
210 high_entropy_count as f64 / non_zero_count as f64
211 } else {
212 0.0
213 };
214
215 let zero_compressed_approx = zero_count as u64 * 20;
217 let lz4_total_with_zeros = lz4_compressed_total + zero_compressed_approx;
218 let zstd_total_with_zeros = zstd_compressed_total + zero_compressed_approx;
219
220 let lz4_ratio = if raw_sampled_total > 0 {
221 lz4_total_with_zeros as f64 / raw_sampled_total as f64
222 } else {
223 1.0
224 };
225 let zstd_ratio = if raw_sampled_total > 0 {
226 zstd_total_with_zeros as f64 / raw_sampled_total as f64
227 } else {
228 1.0
229 };
230
231 let lz4_savings_pct = (1.0 - lz4_ratio) * 100.0;
232 let zstd_savings_pct = (1.0 - zstd_ratio) * 100.0;
233 let estimated_lz4_size = (file_size as f64 * lz4_ratio) as u64;
234 let estimated_zstd_size = (file_size as f64 * zstd_ratio) as u64;
235
236 let fixed_dedup_ratio = if raw_sampled_total > 0 {
238 unique_sampled_bytes as f64 / raw_sampled_total as f64
239 } else {
240 1.0
241 };
242 let fixed_dedup_savings_pct = (1.0 - fixed_dedup_ratio) * 100.0;
243
244 _ = f.seek(SeekFrom::Start(0))?;
246
247 let baseline = DedupeParams::lbfs_baseline();
249 let baseline_stats = analyze_stream(f.try_clone()?, &baseline)?;
250
251 let cdc_params = if let (Some(min), Some(avg), Some(max)) =
252 (config.min_chunk, config.avg_chunk, config.max_chunk)
253 {
254 let f_bits = (avg as f64).log2().round() as u32;
256 DedupeParams {
257 f: f_bits,
258 m: min,
259 z: max,
260 w: 48,
261 v: 52,
262 }
263 } else {
264 let optimized = optimize_params(file_size, baseline_stats.unique_bytes, &baseline, false);
266 let mut params = optimized.params;
267 if let Some(min) = config.min_chunk {
268 params.m = min;
269 }
270 if let Some(avg) = config.avg_chunk {
271 params.f = (avg as f64).log2().round() as u32;
272 }
273 if let Some(max) = config.max_chunk {
274 params.z = max;
275 }
276 params
277 };
278
279 let scan_limit = config.dedup_scan_limit.min(file_size);
281 _ = f.seek(SeekFrom::Start(0))?;
282 let reader = f.by_ref().take(scan_limit);
283
284 let cdc_stats = analyze_stream(reader, &cdc_params)?;
285
286 let cdc_dedup_ratio = if cdc_stats.chunk_count > 0 && cdc_stats.unique_chunk_count > 0 {
288 cdc_stats.unique_bytes as f64
289 / (cdc_stats.unique_bytes as f64 * cdc_stats.chunk_count as f64
290 / cdc_stats.unique_chunk_count as f64)
291 } else {
292 1.0
293 };
294 let cdc_dedup_savings_pct = (1.0 - cdc_dedup_ratio) * 100.0;
295
296 let estimated_packed_size_lz4_fixed = (file_size as f64 * lz4_ratio * fixed_dedup_ratio) as u64;
298 let estimated_packed_size_zstd_cdc = (file_size as f64 * zstd_ratio * cdc_dedup_ratio) as u64;
299
300 let best_packed = estimated_packed_size_lz4_fixed.min(estimated_packed_size_zstd_cdc);
301 let overall_best_savings_pct = (1.0 - best_packed as f64 / file_size as f64) * 100.0;
302
303 Ok(PredictReport {
304 file_path: config.path.display().to_string(),
305 file_size,
306 block_size: config.block_size,
307 blocks_sampled,
308
309 zero_block_pct,
310 mean_entropy,
311 high_entropy_pct,
312
313 lz4_ratio,
314 lz4_savings_pct,
315 zstd_ratio,
316 zstd_savings_pct,
317 estimated_lz4_size,
318 estimated_zstd_size,
319
320 fixed_dedup_ratio,
321 fixed_dedup_savings_pct,
322
323 cdc_scan_bytes: scan_limit,
324 cdc_min_chunk: cdc_params.m,
325 cdc_avg_chunk: 1 << cdc_params.f,
326 cdc_max_chunk: cdc_params.z,
327 cdc_chunks_total: cdc_stats.chunk_count,
328 cdc_chunks_unique: cdc_stats.unique_chunk_count,
329 cdc_dedup_ratio,
330 cdc_dedup_savings_pct,
331
332 estimated_packed_size_lz4_fixed,
333 estimated_packed_size_zstd_cdc,
334 overall_best_savings_pct,
335 })
336}