// hexz_cli/cmd/data/diff.rs
//! Compare block hashes between two Hexz archives.
//!
//! Reports how much data is shared between two archives at the block-hash
//! level, and — when the archives contain checkpoint manifests — at the
//! logical checkpoint-delta level (XOR delta tensors).
//!
//! # Common Usage
//!
//! ```bash
//! hexz diff base.hxz finetuned.hxz
//! ```

use anyhow::{Context, Result};
use hexz_core::format::header::Header;
use hexz_core::format::index::{IndexPage, MasterIndex};
use hexz_ops::inspect::inspect_archive;
use indicatif::HumanBytes;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::Path;

use crate::ui::color::palette;
24
/// Per-block classification for one archive, derived from a single index scan.
///
/// Produced by [`scan`] and consumed by [`run`] to compute shared/unique byte
/// totals via set operations on the hash map keys.
struct BlockSummary {
    /// Unique data block hashes → uncompressed logical length.
    ///
    /// Using a map (not a set) lets us compute byte totals from set operations
    /// without re-reading the index.
    data: HashMap<[u8; 32], u64>,
    /// Bytes covered by parent-ref blocks (logically shared with the parent).
    parent_ref_bytes: u64,
    /// Number of parent-ref block entries.
    parent_ref_blocks: usize,
}
37
38fn scan(path: &Path) -> Result<BlockSummary> {
39    let mut f = File::open(path)?;
40    let header = Header::read_from(&mut f)?;
41    let master = MasterIndex::read_from(&mut f, header.index_offset)?;
42
43    let mut data: HashMap<[u8; 32], u64> = HashMap::new();
44    let mut parent_ref_bytes = 0u64;
45    let mut parent_ref_blocks = 0usize;
46
47    for page_meta in &master.main_pages {
48        let _ = f.seek(SeekFrom::Start(page_meta.offset))?;
49        let mut buf = vec![0u8; page_meta.length as usize];
50        f.read_exact(&mut buf)?;
51        let page: IndexPage = bincode::deserialize(&buf)?;
52        for block in page.blocks {
53            if block.is_parent_ref() {
54                parent_ref_blocks += 1;
55                parent_ref_bytes += block.logical_len as u64;
56            } else if !block.is_sparse() && block.hash != [0u8; 32] {
57                // or_insert keeps the first logical_len seen for a given hash;
58                // blocks with the same hash always have the same content/size.
59                let _ = data.entry(block.hash).or_insert(block.logical_len as u64);
60            }
61        }
62    }
63
64    Ok(BlockSummary {
65        data,
66        parent_ref_bytes,
67        parent_ref_blocks,
68    })
69}
70
/// XOR-delta checkpoint statistics parsed from archive B's manifest.
///
/// Built by [`parse_checkpoint_delta`]; only exists when at least one tensor
/// in B uses XOR delta encoding.
struct CheckpointDelta {
    /// Number of tensors using XOR delta encoding.
    xor_delta_count: usize,
    /// Sum of `base_length` across all XOR delta tensors (uncompressed base in A).
    xor_base_bytes: u64,
    /// True when B's declared parent matches A by filename.
    parent_is_a: bool,
    /// Filename of B's declared parent (for display).
    parent_name: String,
}
82
83/// Parse checkpoint manifest from B to extract XOR delta tensor statistics.
84///
85/// Returns `None` if the metadata is not a checkpoint, or if no tensors use
86/// XOR delta encoding.
87fn parse_checkpoint_delta(
88    meta_b: &str,
89    name_a: &str,
90    parent_paths_b: &[String],
91) -> Option<CheckpointDelta> {
92    let obj: serde_json::Value = serde_json::from_str(meta_b).ok()?;
93    let _ = obj.get("hexz_checkpoint")?; // must be a checkpoint manifest
94    let tensors = obj.get("tensors")?.as_object()?;
95
96    let mut xor_delta_count = 0usize;
97    let mut xor_base_bytes = 0u64;
98
99    for (_, tensor) in tensors {
100        let storage = tensor
101            .get("storage")
102            .and_then(|v| v.as_str())
103            .unwrap_or("raw");
104        let base_length = tensor
105            .get("base_length")
106            .and_then(serde_json::Value::as_u64)
107            .unwrap_or(0);
108
109        if storage == "xor_delta" {
110            xor_delta_count += 1;
111            xor_base_bytes += base_length;
112        }
113    }
114
115    if xor_delta_count == 0 {
116        return None;
117    }
118
119    let parent_name = parent_paths_b
120        .first()
121        .and_then(|p| Path::new(p).file_name())
122        .map(|f| f.to_string_lossy().into_owned())
123        .unwrap_or_default();
124    let parent_is_a = parent_name == name_a;
125
126    Some(CheckpointDelta {
127        xor_delta_count,
128        xor_base_bytes,
129        parent_is_a,
130        parent_name,
131    })
132}
133
134/// Compare two archives and report shared vs. unique block data.
135pub fn run(a: &Path, b: &Path) -> Result<()> {
136    let info_a = inspect_archive(a).with_context(|| format!("Failed to read {}", a.display()))?;
137    let info_b = inspect_archive(b).with_context(|| format!("Failed to read {}", b.display()))?;
138
139    let summary_a =
140        scan(a).with_context(|| format!("Failed to read blocks from {}", a.display()))?;
141    let summary_b =
142        scan(b).with_context(|| format!("Failed to read blocks from {}", b.display()))?;
143
144    // Set operations on unique hashes — consistent with the header counts.
145    // parent-ref blocks in B are counted as "shared" (they point at A's data).
146    let shared_data_blocks: usize = summary_b
147        .data
148        .keys()
149        .filter(|h| summary_a.data.contains_key(*h))
150        .count();
151    let shared_data_bytes: u64 = summary_b
152        .data
153        .iter()
154        .filter(|(h, _)| summary_a.data.contains_key(*h))
155        .map(|(_, &len)| len)
156        .sum();
157
158    let shared_blocks = shared_data_blocks + summary_b.parent_ref_blocks;
159    let shared_bytes = shared_data_bytes + summary_b.parent_ref_bytes;
160
161    let new_b_blocks: usize = summary_b
162        .data
163        .keys()
164        .filter(|h| !summary_a.data.contains_key(*h))
165        .count();
166    let new_b_bytes: u64 = summary_b
167        .data
168        .iter()
169        .filter(|(h, _)| !summary_a.data.contains_key(*h))
170        .map(|(_, &len)| len)
171        .sum();
172
173    let only_a_blocks: usize = summary_a
174        .data
175        .keys()
176        .filter(|h| !summary_b.data.contains_key(*h))
177        .count();
178    let only_a_bytes: u64 = summary_a
179        .data
180        .iter()
181        .filter(|(h, _)| !summary_b.data.contains_key(*h))
182        .map(|(_, &len)| len)
183        .sum();
184
185    // Optional: checkpoint delta stats from B's manifest.
186    let name_a_str = a
187        .file_name()
188        .unwrap_or(a.as_os_str())
189        .to_string_lossy()
190        .into_owned();
191    let name_b_str = b
192        .file_name()
193        .unwrap_or(b.as_os_str())
194        .to_string_lossy()
195        .into_owned();
196
197    let cp_delta = info_b
198        .metadata
199        .as_deref()
200        .and_then(|m| parse_checkpoint_delta(m, &name_a_str, &info_b.parent_paths));
201
202    // --- Render ---
203    let p = palette();
204
205    let max_name = name_a_str.len().max(name_b_str.len());
206
207    // Pre-format all alignment-sensitive strings as plain text so ANSI codes
208    // don't skew column widths.
209    let name_a_col = format!("{name_a_str:<max_name$}");
210    let name_b_col = format!("{name_b_str:<max_name$}");
211    let size_a_col = format!("{:>10}", HumanBytes(info_a.file_size));
212    let size_b_col = format!("{:>10}", HumanBytes(info_b.file_size));
213    let blk_a_col = format!("{:>6}", summary_a.data.len() + summary_a.parent_ref_blocks);
214    let blk_b_col = format!("{:>6}", summary_b.data.len() + summary_b.parent_ref_blocks);
215
216    // Label column: wide enough for "Only in <longest_name>:"
217    let lbl_w = "Only in ".len() + max_name + 1;
218    let shared_lbl = format!("{:<lbl_w$}", "Shared:");
219    let new_b_lbl = format!("{:<lbl_w$}", format!("New in {name_b_str}:"));
220    let only_a_lbl = format!("{:<lbl_w$}", format!("Only in {name_a_str}:"));
221    let shared_size_col = format!("{:>10}", HumanBytes(shared_bytes));
222    let new_b_size_col = format!("{:>10}", HumanBytes(new_b_bytes));
223    let only_a_size_col = format!("{:>10}", HumanBytes(only_a_bytes));
224    let shared_blk_col = format!("{shared_blocks:>6}");
225    let new_b_blk_col = format!("{new_b_blocks:>6}");
226    let only_a_blk_col = format!("{only_a_blocks:>6}");
227
228    let total_b_bytes = (shared_bytes + new_b_bytes).max(1);
229    let pct = |n: u64| n as f64 / total_b_bytes as f64 * 100.0;
230
231    let is_thin_b = summary_b.parent_ref_blocks > 0;
232    let is_xor_delta = cp_delta.is_some();
233
234    let thin_note = if is_thin_b {
235        format!(
236            "  {}({} via parent refs){}",
237            p.gray, summary_b.parent_ref_blocks, p.reset
238        )
239    } else {
240        String::new()
241    };
242
243    // "Storage saved" means different things depending on archive type:
244    //   • plain / thin: bytes already in A that B doesn't need to re-store
245    //   • XOR delta: how much smaller B is on disk vs A (proxy for delta compression saving)
246    let (saved_label, saved_bytes) = if is_xor_delta {
247        (
248            "Delta saving:",
249            info_a.file_size.saturating_sub(info_b.file_size),
250        )
251    } else {
252        ("Storage saved:", shared_bytes)
253    };
254    let saved_lbl = format!("{saved_label:<lbl_w$}");
255    let saved_size_col = format!("{:>10}", HumanBytes(saved_bytes));
256
257    // B header tag.
258    let b_delta_tag = if is_xor_delta {
259        format!("  {}(XOR delta checkpoint){}", p.dim, p.reset)
260    } else {
261        String::new()
262    };
263
264    // File header
265    println!();
266    println!(
267        "  {}{}{}  {}{}{}  {} blocks",
268        p.bold, name_a_col, p.reset, p.green, size_a_col, p.reset, blk_a_col
269    );
270    println!(
271        "  {}{}{}  {}{}{}  {} blocks{}",
272        p.bold, name_b_col, p.reset, p.green, size_b_col, p.reset, blk_b_col, b_delta_tag
273    );
274    println!();
275
276    if is_xor_delta {
277        // For XOR delta archives the block-hash comparison is always 0% (XOR produces
278        // unique hashes) and always shows the same logical sizes (XOR preserves tensor
279        // size). Suppress those rows — they would be identical noise for every step.
280        // The checkpoint delta section below carries all the meaningful information.
281    } else {
282        // Block-level comparison rows (plain / thin archives only).
283        println!(
284            "  {}{}{}  {}{}{}  {} blocks  {}({:.0}%){}{}",
285            p.cyan,
286            shared_lbl,
287            p.reset,
288            p.green,
289            shared_size_col,
290            p.reset,
291            shared_blk_col,
292            p.bold,
293            pct(shared_bytes),
294            p.reset,
295            thin_note,
296        );
297        println!(
298            "  {}{}{}  {}{}{}  {} blocks",
299            p.cyan, new_b_lbl, p.reset, p.yellow, new_b_size_col, p.reset, new_b_blk_col,
300        );
301        if !is_thin_b {
302            println!(
303                "  {}{}{}  {}{}{}  {} blocks",
304                p.cyan, only_a_lbl, p.reset, p.dim, only_a_size_col, p.reset, only_a_blk_col,
305            );
306        }
307        println!();
308    }
309
310    // Checkpoint delta section (only when B has XOR-delta tensors).
311    if let Some(ref d) = cp_delta {
312        let base_name = if d.parent_is_a {
313            &name_a_str
314        } else {
315            &d.parent_name
316        };
317        let base_tag = if d.parent_is_a {
318            format!("{}{}{}", p.yellow, base_name, p.reset)
319        } else {
320            // B derives from someone else, not A — flag it clearly.
321            format!(
322                "{}{}{} {}(not {}){}",
323                p.yellow, base_name, p.reset, p.gray, name_a_str, p.reset
324            )
325        };
326
327        let compression_ratio = d.xor_base_bytes as f64 / info_b.file_size as f64;
328
329        println!(
330            "  {}Checkpoint delta{}  ({} tensors use XOR delta off {})",
331            p.bold, p.reset, d.xor_delta_count, base_tag,
332        );
333        println!(
334            "    {}{}{} base  →  {}{}{} on disk  {}({:.1}×  compression){}",
335            p.green,
336            HumanBytes(d.xor_base_bytes),
337            p.reset,
338            p.yellow,
339            HumanBytes(info_b.file_size),
340            p.reset,
341            p.dim,
342            compression_ratio,
343            p.reset,
344        );
345        println!(
346            "    {}{}{} {}required for reconstruction{}",
347            p.yellow, base_name, p.reset, p.dim, p.reset,
348        );
349        println!();
350    }
351
352    println!(
353        "  {}{}{}  {}{}{}",
354        p.cyan, saved_lbl, p.reset, p.green, saved_size_col, p.reset
355    );
356    println!();
357
358    Ok(())
359}