hexz_cli/cmd/data/build.rs
1//! Build archive from source with profile-based optimization.
2//!
3//! This command provides a high-level interface for creating Hexz snapshots
4//! with domain-specific optimizations. Unlike the low-level `pack` command,
5//! `build` uses predefined profiles that automatically select compression
6//! algorithms, block sizes, and dictionary training settings optimized for
7//! different workload types.
8//!
9//! # Build Profiles
10//!
11//! Profiles configure compression and chunking parameters based on workload characteristics:
12//!
13//! **Generic Profile (Default):**
14//! - Compression: LZ4 (fast, general-purpose)
15//! - Block size: 64 KiB (balanced for most workloads)
16//! - Dictionary training: Disabled (minimal overhead)
17//! - Use case: General operating system images, file servers, development VMs
18//!
19//! **EDA Profile (Electronic Design Automation):**
20//! - Compression: Zstd level 3 (high ratio for large design files)
21//! - Block size: 128 KiB (optimized for large CAD files and netlists)
22//! - Dictionary training: Enabled (learns patterns from design data)
23//! - Use case: ASIC/FPGA design environments with large binary databases
24//!
25//! **Embedded Profile:**
26//! - Compression: LZ4 (minimal CPU overhead for resource-constrained targets)
27//! - Block size: 16 KiB (smaller blocks reduce memory pressure)
28//! - Dictionary training: Disabled (reduces snapshot creation time)
29//! - Use case: IoT devices, embedded Linux systems, edge computing
30//!
31//! **ML Profile (Machine Learning):**
32//! - Compression: Zstd level 3 (handles large model files efficiently)
33//! - Block size: 256 KiB (optimized for model weights and training data)
34//! - Dictionary training: Enabled (learns patterns from tensor data)
35//! - Use case: ML training environments, GPU workstations, Jupyter notebooks
36//!
37//! # Build Profile Effects
38//!
39//! ## Compression Algorithm Selection
40//!
41//! - **LZ4**: Provides 2-3x compression at 500+ MB/s, ideal for fast boot times
42//! - **Zstd**: Provides 3-5x compression at 200+ MB/s, ideal for storage efficiency
43//!
44//! ## Block Size Selection
45//!
46//! Smaller blocks (16-64 KiB):
47//! - Lower memory usage during decompression
48//! - Finer-grained deduplication
49//! - Higher index overhead
50//!
51//! Larger blocks (128-256 KiB):
52//! - Better compression ratios (larger context window)
53//! - Reduced index size
54//! - Higher decompression memory requirements
55//!
56//! ## Dictionary Training
57//!
58//! When enabled, samples ~1000 blocks and trains a Zstd dictionary:
59//! - Improves compression ratio by 10-30% for repetitive data
60//! - Adds 2-5 seconds to snapshot creation time
61//! - Dictionary size: ~110 KiB stored in snapshot header
62//!
63//! # Use Cases
64//!
65//! - **VM Image Creation**: Build bootable snapshots from disk images
66//! - **Reproducible Environments**: Create snapshots with consistent compression settings
67//! - **Workload Optimization**: Select profiles matched to application characteristics
68//! - **CI/CD Pipelines**: Automate snapshot creation with profile presets
69//!
70//! # Common Usage Patterns
71//!
72//! ```bash
73//! # Build generic snapshot from disk image
74//! hexz build --source disk.img --output snapshot.st
75//!
76//! # Build EDA workstation with optimal compression
77//! hexz build --source eda-vm.img --output eda.st --profile eda
78//!
79//! # Build ML environment with encryption
80//! hexz build --source ml.img --output ml.st --profile ml --encrypt
81//!
82//! # Build with content-defined chunking for deduplication
83//! hexz build --source app.img --output app.st --cdc
84//! ```
85
86use anyhow::Result;
87use hexz_common::config::BuildProfile;
88use std::path::PathBuf;
89
90/// Executes the build command to create a snapshot using profile-based settings.
91///
92/// This command maps a high-level build profile to low-level packing parameters
93/// (compression algorithm, block size, dictionary training) and delegates to the
94/// `pack` command for actual snapshot creation. This provides a simplified
95/// interface for users who want optimized settings without manual tuning.
96///
97/// # Arguments
98///
99/// * `source` - Path to the source disk image (raw or qcow2 format)
100/// * `memory` - Optional path to memory dump file to include in snapshot
101/// * `output` - Output path for the generated `.st` snapshot file
102/// * `profile` - Build profile name: "generic", "eda", "embedded", or "ml"
103/// * `encrypt` - Enable AES-256-GCM encryption (prompts for password)
104/// * `cdc` - Enable content-defined chunking for variable-sized blocks
105///
106/// # Profile Parameter Mapping
107///
108/// The function resolves the profile name to a `BuildProfile` enum and extracts:
109/// - Compression algorithm (`compression_algo()`)
110/// - Block size in bytes (`block_size()`)
111/// - Dictionary training recommendation (`recommended_dict_training()`)
112///
113/// These parameters are then passed to `pack::run()` along with CDC settings.
114///
115/// # CDC (Content-Defined Chunking) Parameters
116///
117/// When `cdc` is enabled, variable-sized blocks are used with FastCDC:
118/// - `min_chunk`: 16 KiB minimum chunk size
119/// - `avg_chunk`: 64 KiB average chunk size (default block size)
120/// - `max_chunk`: 128 KiB maximum chunk size
121///
122/// These defaults can be overridden by calling `pack::run()` directly.
123///
124/// # Errors
125///
126/// Returns an error if:
127/// - The source file cannot be opened or read
128/// - The output path is not writable
129/// - The encryption password is invalid (if encryption is enabled)
130/// - Compression or packing operations fail
131/// - Disk I/O errors occur during processing
132///
133/// # Examples
134///
135/// ```no_run
136/// use std::path::PathBuf;
137/// use hexz_cli::cmd::data::build;
138///
139/// // Build generic snapshot without encryption
140/// build::run(
141/// PathBuf::from("disk.img"),
142/// None,
143/// PathBuf::from("snapshot.hxz"),
144/// Some("generic".to_string()),
145/// false, // no encryption
146/// false, // no CDC
147/// )?;
148///
149/// // Build ML profile with encryption and CDC
150/// build::run(
151/// PathBuf::from("ml-vm.img"),
152/// None,
153/// PathBuf::from("ml.hxz"),
154/// Some("ml".to_string()),
155/// true, // encrypt
156/// true, // enable CDC
157/// )?;
158/// # Ok::<(), anyhow::Error>(())
159/// ```
160pub fn run(
161 source: PathBuf,
162 memory: Option<PathBuf>,
163 output: PathBuf,
164 profile: Option<String>,
165 encrypt: bool,
166 cdc: bool,
167) -> Result<()> {
168 // 1. Resolve profile
169 let build_profile = match profile.as_deref() {
170 Some("eda") => BuildProfile::Eda,
171 Some("embedded") => BuildProfile::Embedded,
172 Some("ml") => BuildProfile::Ml,
173 Some("generic") | None => BuildProfile::Generic,
174 Some(other) => {
175 eprintln!(
176 "Warning: Unknown profile '{}', falling back to generic",
177 other
178 );
179 BuildProfile::Generic
180 }
181 };
182
183 println!("Building snapshot with profile: {:?}", build_profile);
184
185 // 2. Map profile to parameters
186 let compression = build_profile.compression_algo().to_string();
187 let block_size = build_profile.block_size();
188 let train_dict = build_profile.recommended_dict_training();
189
190 // 3. Delegate to pack
191 // Note: We currently map `source` directly to `disk`.
192 // Future work: Detect if `source` is a directory and pack it (e.g. tar/squashfs)
193 // or use `virt-make-fs`.
194 super::pack::run(
195 Some(source),
196 memory,
197 output,
198 compression,
199 encrypt,
200 train_dict,
201 block_size,
202 cdc,
203 16384, // min_chunk default
204 65536, // avg_chunk default
205 131072, // max_chunk default
206 false, // silent
207 )
208}