Skip to main content

hexz_ops/
inspect.rs

1//! Archive inspection and metadata extraction.
2
3use hexz_common::Result;
4use hexz_core::format::header::{CompressionType, Header};
5use hexz_core::format::index::{IndexPage, MasterIndex};
6use std::collections::HashSet;
7use std::fs::File;
8use std::io::{Read, Seek, SeekFrom};
9use std::path::Path;
10
11/// Per-block statistics for an archive.
12#[derive(Debug, Default, serde::Serialize)]
13pub struct BlockStats {
14    /// Number of data blocks.
15    pub data_blocks: usize,
16    /// Total uncompressed bytes in data blocks.
17    pub data_bytes: u64,
18    /// Number of parent-reference blocks.
19    pub parent_ref_blocks: usize,
20    /// Total bytes represented by parent references.
21    pub parent_ref_bytes: u64,
22    /// Number of zero (sparse) blocks.
23    pub zero_blocks: usize,
24    /// Total bytes represented by zero blocks.
25    pub zero_bytes: u64,
26
27    /// Smallest data block size in bytes.
28    pub min_block_size: u32,
29    /// Largest data block size in bytes.
30    pub max_block_size: u32,
31    /// Average data block size in bytes.
32    pub avg_block_size: u32,
33
34    /// Number of unique (non-duplicate) blocks.
35    pub unique_blocks: usize,
36    /// Number of deduplicated blocks.
37    pub dedup_blocks: usize,
38    /// Bytes saved by deduplication.
39    pub dedup_bytes_saved: u64,
40
41    /// Total compressed bytes for data blocks.
42    pub compressed_data_bytes: u64,
43}
44
/// Feature flags describing what an archive contains.
///
/// Every flag is `false` in the `Default` value. The type is cheap to clone
/// and supports equality comparison so callers can diff two archives'
/// feature sets directly.
// The flags are independent facts about one archive, not an encoded state
// machine, so one bool per feature is intentional here.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[allow(clippy::struct_excessive_bools)]
pub struct ArchiveFeatures {
    /// Whether the archive is encrypted.
    pub encrypted: bool,
    /// Whether a main stream is present.
    pub has_main: bool,
    /// Whether an auxiliary stream is present.
    pub has_auxiliary: bool,
    /// Whether variable-size blocks are enabled.
    pub variable_blocks: bool,
    /// Whether a cryptographic signature is present.
    pub signature_present: bool,
    /// Whether a compression dictionary is present.
    pub dictionary_present: bool,
}
62
63/// Metadata extracted from an archive file.
64pub struct ArchiveInfo {
65    /// Archive format version.
66    pub version: u32,
67    /// Block size in bytes.
68    pub block_size: u32,
69    /// Compression algorithm used.
70    pub compression: CompressionType,
71    /// Paths to parent archives for delta dedup.
72    pub parent_paths: Vec<String>,
73    /// Feature flags for this archive.
74    pub features: ArchiveFeatures,
75    /// Total uncompressed size of the main stream.
76    pub main_size: u64,
77    /// Total uncompressed size of the auxiliary stream.
78    pub auxiliary_size: u64,
79    /// On-disk file size in bytes.
80    pub file_size: u64,
81    /// Byte offset of the master index.
82    pub index_offset: u64,
83    /// Number of index pages for the main stream.
84    pub main_pages: usize,
85    /// Number of index pages for the auxiliary stream.
86    pub auxiliary_pages: usize,
87    /// Byte offset of embedded metadata, if any.
88    pub metadata_offset: Option<u64>,
89    /// Length of embedded metadata in bytes.
90    pub metadata_length: Option<u32>,
91    /// Decoded metadata string, if present.
92    pub metadata: Option<String>,
93    /// Detailed block-level statistics.
94    pub block_stats: Option<BlockStats>,
95}
96
97impl std::fmt::Debug for ArchiveInfo {
98    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
99        f.debug_struct("ArchiveInfo")
100            .field("version", &self.version)
101            .field("block_size", &self.block_size)
102            .field("compression", &self.compression)
103            .field("file_size", &self.file_size)
104            .finish_non_exhaustive()
105    }
106}
107
108impl ArchiveInfo {
109    /// Returns the total uncompressed size (main + auxiliary).
110    pub const fn total_uncompressed(&self) -> u64 {
111        self.main_size + self.auxiliary_size
112    }
113
114    /// Returns the compression ratio (uncompressed / file size).
115    pub fn compression_ratio(&self) -> f64 {
116        if self.file_size > 0 {
117            self.total_uncompressed() as f64 / self.file_size as f64
118        } else {
119            0.0
120        }
121    }
122}
123
124/// Reads and returns metadata and block statistics for an archive file.
125pub fn inspect_archive(path: impl AsRef<Path>) -> Result<ArchiveInfo> {
126    let mut f = File::open(path.as_ref())?;
127    let file_size = f.metadata()?.len();
128
129    let header = Header::read_from(&mut f)?;
130    let master = MasterIndex::read_from(&mut f, header.index_offset)?;
131
132    let metadata = if let (Some(off), Some(len)) = (header.metadata_offset, header.metadata_length)
133    {
134        let mut buf = vec![0u8; len as usize];
135        _ = f.seek(SeekFrom::Start(off))?;
136        f.read_exact(&mut buf)?;
137        Some(String::from_utf8_lossy(&buf).to_string())
138    } else {
139        None
140    };
141
142    let mut stats = BlockStats {
143        min_block_size: u32::MAX,
144        ..Default::default()
145    };
146    let mut seen_offsets: HashSet<u64> = HashSet::new();
147
148    for page_meta in &master.main_pages {
149        _ = f.seek(SeekFrom::Start(page_meta.offset))?;
150        let mut page_bytes = vec![0u8; page_meta.length as usize];
151        f.read_exact(&mut page_bytes)?;
152
153        let page: IndexPage = bincode::deserialize(&page_bytes)?;
154
155        for block in page.blocks {
156            if block.is_parent_ref() {
157                stats.parent_ref_blocks += 1;
158                stats.parent_ref_bytes += block.logical_len as u64;
159            } else if block.is_sparse() {
160                stats.zero_blocks += 1;
161                stats.zero_bytes += block.logical_len as u64;
162            } else {
163                stats.data_blocks += 1;
164                stats.data_bytes += block.logical_len as u64;
165                stats.compressed_data_bytes += block.length as u64;
166
167                if block.logical_len < stats.min_block_size {
168                    stats.min_block_size = block.logical_len;
169                }
170                if block.logical_len > stats.max_block_size {
171                    stats.max_block_size = block.logical_len;
172                }
173
174                if seen_offsets.insert(block.offset) {
175                    stats.unique_blocks += 1;
176                } else {
177                    stats.dedup_blocks += 1;
178                    stats.dedup_bytes_saved += block.logical_len as u64;
179                }
180            }
181        }
182    }
183
184    if stats.data_blocks > 0 {
185        stats.avg_block_size = (stats.data_bytes / stats.data_blocks as u64) as u32;
186    } else {
187        stats.min_block_size = 0;
188    }
189
190    Ok(ArchiveInfo {
191        version: header.version,
192        block_size: header.block_size,
193        compression: header.compression,
194        parent_paths: header.parent_paths,
195        features: ArchiveFeatures {
196            encrypted: header.encryption.is_some(),
197            has_main: header.features.has_main,
198            has_auxiliary: header.features.has_auxiliary,
199            variable_blocks: header.features.variable_blocks,
200            signature_present: header.signature_offset.is_some(),
201            dictionary_present: header.dictionary_offset.is_some(),
202        },
203        main_size: master.main_size,
204        auxiliary_size: master.auxiliary_size,
205        file_size,
206        index_offset: header.index_offset,
207        main_pages: master.main_pages.len(),
208        auxiliary_pages: master.auxiliary_pages.len(),
209        metadata_offset: header.metadata_offset,
210        metadata_length: header.metadata_length,
211        metadata,
212        block_stats: Some(stats),
213    })
214}