Skip to main content

hexz_cli/cmd/data/
diff.rs

1//! Compare block hashes between two Hexz archives.
2//!
3//! Reports how much data is shared between two snapshots, how many blocks are
4//! unique to each, and the implied storage savings from deduplication.
5//!
6//! # Common Usage
7//!
8//! ```bash
9//! hexz diff base.hxz finetuned.hxz
10//! ```
11
12use anyhow::{Context, Result};
13use hexz_core::format::header::Header;
14use hexz_core::format::index::{IndexPage, MasterIndex};
15use hexz_ops::inspect::inspect_snapshot;
16use indicatif::HumanBytes;
17use std::collections::HashSet;
18use std::fs::File;
19use std::io::{Read, Seek, SeekFrom};
20use std::path::{Path, PathBuf};
21
/// Per-block classification for one archive.
///
/// `scan` fills in `hashes` and the parent-ref totals from one pass over the
/// archive's index and leaves the `unique_*` counters at zero; `run` fills
/// those in afterwards, once the other archive's hash set is available.
#[derive(Debug, Default)]
struct BlockSummary {
    /// Distinct hashes of the blocks in this file's index that are neither
    /// parent refs nor sparse and whose hash is not all zeros.
    hashes: HashSet<[u8; 32]>,
    /// Sum of `logical_len` over parent-ref blocks (treated as shared with the parent).
    parent_ref_bytes: u64,
    /// Number of parent-ref blocks.
    parent_ref_blocks: usize,
    /// Sum of `logical_len` over data blocks whose hash is not in `hashes` of the other file.
    unique_bytes: u64,
    /// Number of data blocks whose hash is not in `hashes` of the other file.
    unique_blocks: usize,
}
34
35fn scan(path: &Path) -> Result<BlockSummary> {
36    let mut f = File::open(path)?;
37    let header = Header::read_from(&mut f)?;
38    let master = MasterIndex::read_from(&mut f, header.index_offset)?;
39
40    let mut hashes = HashSet::new();
41    let mut parent_ref_bytes = 0u64;
42    let mut parent_ref_blocks = 0usize;
43
44    for page_meta in &master.primary_pages {
45        f.seek(SeekFrom::Start(page_meta.offset))?;
46        let mut buf = vec![0u8; page_meta.length as usize];
47        f.read_exact(&mut buf)?;
48        let page: IndexPage = bincode::deserialize(&buf)?;
49        for block in page.blocks {
50            if block.is_parent_ref() {
51                parent_ref_blocks += 1;
52                parent_ref_bytes += block.logical_len as u64;
53            } else if !block.is_sparse() && block.hash != [0u8; 32] {
54                hashes.insert(block.hash);
55            }
56        }
57    }
58
59    Ok(BlockSummary {
60        hashes,
61        parent_ref_bytes,
62        parent_ref_blocks,
63        unique_bytes: 0,
64        unique_blocks: 0,
65    })
66}
67
68/// Compare two archives and report shared vs. unique block data.
69pub fn run(a: PathBuf, b: PathBuf) -> Result<()> {
70    let info_a = inspect_snapshot(&a).with_context(|| format!("Failed to read {}", a.display()))?;
71    let info_b = inspect_snapshot(&b).with_context(|| format!("Failed to read {}", b.display()))?;
72
73    let mut summary_a =
74        scan(&a).with_context(|| format!("Failed to read blocks from {}", a.display()))?;
75    let mut summary_b =
76        scan(&b).with_context(|| format!("Failed to read blocks from {}", b.display()))?;
77
78    // Classify each file's data blocks as shared or unique relative to the other.
79    // parent-ref blocks in B are shared with A by definition (they point at the parent).
80    let mut shared_blocks = summary_b.parent_ref_blocks;
81    let mut shared_bytes = summary_b.parent_ref_bytes;
82
83    // Scan B's data blocks against A's hash set.
84    {
85        let mut f = File::open(&b)?;
86        let header = Header::read_from(&mut f)?;
87        let master = MasterIndex::read_from(&mut f, header.index_offset)?;
88
89        for page_meta in &master.primary_pages {
90            f.seek(SeekFrom::Start(page_meta.offset))?;
91            let mut buf = vec![0u8; page_meta.length as usize];
92            f.read_exact(&mut buf)?;
93            let page: IndexPage = bincode::deserialize(&buf)?;
94            for block in page.blocks {
95                if block.is_parent_ref() || block.is_sparse() || block.hash == [0u8; 32] {
96                    continue;
97                }
98                if summary_a.hashes.contains(&block.hash) {
99                    shared_blocks += 1;
100                    shared_bytes += block.logical_len as u64;
101                } else {
102                    summary_b.unique_blocks += 1;
103                    summary_b.unique_bytes += block.logical_len as u64;
104                }
105            }
106        }
107    }
108
109    // Scan A's data blocks against B's hash set for unique-to-A count.
110    {
111        let mut f = File::open(&a)?;
112        let header = Header::read_from(&mut f)?;
113        let master = MasterIndex::read_from(&mut f, header.index_offset)?;
114
115        for page_meta in &master.primary_pages {
116            f.seek(SeekFrom::Start(page_meta.offset))?;
117            let mut buf = vec![0u8; page_meta.length as usize];
118            f.read_exact(&mut buf)?;
119            let page: IndexPage = bincode::deserialize(&buf)?;
120            for block in page.blocks {
121                if block.is_parent_ref() || block.is_sparse() || block.hash == [0u8; 32] {
122                    continue;
123                }
124                if !summary_b.hashes.contains(&block.hash) {
125                    summary_a.unique_blocks += 1;
126                    summary_a.unique_bytes += block.logical_len as u64;
127                }
128            }
129        }
130    }
131
132    // --- Render ---
133    let name_a = a.file_name().unwrap_or(a.as_os_str()).to_string_lossy();
134    let name_b = b.file_name().unwrap_or(b.as_os_str()).to_string_lossy();
135    let max_name = name_a.len().max(name_b.len());
136
137    let total_a_data_blocks = summary_a.hashes.len();
138    let total_b_data_blocks = summary_b.hashes.len() + summary_b.parent_ref_blocks;
139
140    println!();
141    println!(
142        "  {:<width$}  {:>10}  {:>6} blocks",
143        name_a,
144        HumanBytes(info_a.file_size),
145        total_a_data_blocks,
146        width = max_name,
147    );
148    println!(
149        "  {:<width$}  {:>10}  {:>6} blocks",
150        name_b,
151        HumanBytes(info_b.file_size),
152        total_b_data_blocks,
153        width = max_name,
154    );
155    println!();
156
157    let total_b_bytes = (shared_bytes + summary_b.unique_bytes).max(1);
158    let pct = |n: u64| n as f64 / total_b_bytes as f64 * 100.0;
159
160    // When B is a thin snapshot, its parent-ref blocks cover data owned by A.
161    // Those hashes aren't stored in B's index, so A's blocks appear "not found in B"
162    // even though they're shared. Suppress the misleading "only in A" count in that case.
163    let is_thin_b = summary_b.parent_ref_blocks > 0;
164    let thin_note = if is_thin_b {
165        format!("  ({} via parent refs)", summary_b.parent_ref_blocks)
166    } else {
167        String::new()
168    };
169
170    println!(
171        "  Shared:        {:>10}  {:>6} blocks  ({:.0}%){}",
172        HumanBytes(shared_bytes),
173        shared_blocks,
174        pct(shared_bytes),
175        thin_note,
176    );
177    println!(
178        "  New in {:<width$}  {:>10}  {:>6} blocks",
179        format!("{}:", name_b),
180        HumanBytes(summary_b.unique_bytes),
181        summary_b.unique_blocks,
182        width = max_name + 1,
183    );
184    if !is_thin_b {
185        println!(
186            "  Only in {:<width$}  {:>10}  {:>6} blocks",
187            format!("{}:", name_a),
188            HumanBytes(summary_a.unique_bytes),
189            summary_a.unique_blocks,
190            width = max_name + 1,
191        );
192    }
193    println!();
194    println!("  Storage saved: {}", HumanBytes(shared_bytes));
195    println!();
196
197    Ok(())
198}