1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
mod cmd_cat;
mod cmd_extract;
mod cmd_info;
mod cmd_list;
mod cmd_verify;
mod cmd_wrap;
mod util;
use std::path::PathBuf;
use anyhow::Result;
use clap::{Parser, Subcommand};
// Top-level command-line definition; `main` calls `Cli::parse()` and then
// dispatches on `command`.
//
// NOTE: the `///` doc comment below is picked up by clap's derive macro and
// shown as the program description in `--help`, so rewording it changes
// user-visible output.
/// Tar archives with random-access zstd and an embedded index.
///
/// tarzan reads and writes `.tar.zst` archives augmented with a table of
/// contents stored as a zstd skippable frame. Standard zstd tools can
/// decompress a tarzan archive normally; tarzan-aware tools can also list
/// contents and extract single files without a full decompression pass.
#[derive(Debug, Parser)]
// Binary name is fixed to `tarzan`; the bare `version` key asks clap to add a
// `--version` flag.
#[command(name = "tarzan", version)]
struct Cli {
// The selected subcommand together with its parsed arguments.
#[command(subcommand)]
command: Commands,
}
// One variant per subcommand. `main` destructures each variant and forwards
// its fields to the `run` function of the matching `cmd_*` module.
//
// NOTE: every `///` comment in this enum is consumed by clap's derive macro as
// help text for the subcommand or argument it precedes; treat those lines as
// user-facing strings, not as ordinary comments.
#[derive(Debug, Subcommand)]
enum Commands {
/// Wrap an existing tar stream into a tarzan `.tar.zst` archive.
///
/// Reads a raw tar stream and writes a tarzan-formatted archive,
/// splitting the body into independently decodable zstd frames and
/// appending a TOC frame. Designed for pipelines such as
/// `tar -cf - ./dir | tarzan wrap -f out.tar.zst`.
Wrap {
// Optional positional. `main` runs it through `resolve_stream`, which maps
// a literal `-` to `None` before handing it to `cmd_wrap::run`.
/// Input tar stream. `-` or omitted reads from stdin.
#[arg(value_name = "TAR")]
input: Option<PathBuf>,
// Same `-`/absent handling as `input` (see `resolve_stream`).
/// Output archive path. `-` or omitted writes to stdout.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: Option<PathBuf>,
// The textual value (including the "4M" default) is converted by
// `parse_size`, so this field always holds a byte count.
/// Chunk boundary size. Accepts plain bytes or K/M/G suffixes.
/// Smaller chunks improve random-access granularity at some cost
/// to compression ratio; larger chunks compress better.
#[arg(long = "chunk-size", default_value = "4M", value_parser = parse_size)]
chunk_size: usize,
// NOTE(review): no range restriction is declared on this argument, so any
// i32 is accepted here; presumably the 1..=22 range from the help text is
// enforced downstream — confirm.
/// Zstd compression level (1 = fastest, 22 = best).
#[arg(long = "level", default_value_t = 3)]
level: i32,
/// List each member to stderr after wrapping (tar's `-cvf`). Only
/// effective when output is a file; for stdout/pipes the listing
/// is suppressed since the archive can't be re-read.
#[arg(short = 'v', long = "verbose")]
verbose: bool,
},
/// List archive contents using only the embedded TOC.
///
/// Reads the TOC skippable frame at the tail of the archive without
/// decompressing any chunk data, so it runs in roughly constant time
/// regardless of archive size.
// Also invocable as `t` or `ls`.
#[command(visible_aliases = ["t", "ls"])]
List {
// Required here (plain `PathBuf`), unlike `Wrap`'s optional `file`.
/// Archive to list.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: PathBuf,
// `conflicts_with = "json"`: `-v` and `--json` cannot be combined.
/// Show mode, owner/group, size, and mtime in addition to the
/// path, like `tar -tvf`. Symlink and hard-link entries show
/// their target as `path -> target`.
#[arg(short = 'v', long = "verbose", conflicts_with = "json")]
verbose: bool,
/// Emit the TOC as a pretty-printed JSON array. Each entry
/// includes path, type, size, mode, uid/gid, mtime, link
/// target, and chunk locations.
#[arg(long = "json")]
json: bool,
/// Show `-v` timestamps in UTC instead of local time, like
/// `tar --utc -tvf`.
#[arg(long = "utc")]
utc: bool,
// Zero or more positionals; an empty vector is passed to `cmd_list::run`
// when none are given.
/// Restrict the listing to these paths or directory prefixes;
/// omit to list everything. Matching is by exact path,
/// directory-prefix, or shell glob.
#[arg(value_name = "PATH")]
paths: Vec<String>,
},
/// Stream a single member from the archive to stdout.
///
/// Uses the TOC to seek directly to the member's chunks; only those
/// chunks are decompressed.
Cat {
/// Archive to read from.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: PathBuf,
// Required positional: exactly one member path.
/// Path of the member within the archive.
#[arg(value_name = "PATH")]
path: String,
},
/// Extract archive members onto the filesystem.
///
/// Decompresses only the chunks needed for the requested members
/// (or all of them, if no positional paths are given). Refuses to
/// extract absolute paths or paths containing `..` so extraction
/// stays inside the destination directory.
// Also invocable as `x`.
#[command(visible_alias = "x")]
Extract {
/// Archive to extract from.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: PathBuf,
/// Destination directory (created if missing). Defaults to the
/// current working directory.
#[arg(
short = 'C',
long = "directory",
value_name = "DIR",
default_value = "."
)]
directory: PathBuf,
/// Drop N leading path components from each archive entry,
/// like `tar --strip-components`.
#[arg(long = "strip-components", value_name = "N", default_value_t = 0)]
strip_components: usize,
// Collected into a vector, one element per `--exclude` occurrence.
/// Skip members matching this shell-glob pattern. Repeatable.
#[arg(long = "exclude", value_name = "PATTERN")]
exclude: Vec<String>,
// Negative flag: `main` passes `!no_mtime` to `cmd_extract::run`.
/// Do not restore recorded modification times; extracted
/// entries keep whatever timestamp the filesystem assigns at
/// creation.
#[arg(long = "no-mtime")]
no_mtime: bool,
/// Print each member to stderr as it is extracted.
#[arg(short = 'v', long = "verbose")]
verbose: bool,
/// Restrict extraction to these paths or directory prefixes;
/// omit to extract everything. Matching is by exact path,
/// directory-prefix, or shell glob.
#[arg(value_name = "PATH")]
paths: Vec<String>,
},
/// Print archive metadata: size, member count, compression ratio,
/// TOC location, identity-frame version.
///
/// Reads only the TOC frame; runs in constant time regardless of
/// archive size.
Info {
/// Archive to inspect.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: PathBuf,
/// Emit metadata as a JSON object instead of the text table.
#[arg(long = "json")]
json: bool,
},
/// Verify SHA-256 checksums recorded in the TOC.
///
/// Decompresses each chunk and compares its SHA-256 against the value
/// recorded at archive creation time. Exits non-zero if any chunk
/// fails to verify.
Verify {
/// Archive to verify.
#[arg(short = 'f', long = "file", value_name = "ARCHIVE")]
file: PathBuf,
// At most one path (an `Option`), unlike the `Vec<String>` path lists
// accepted by `List` and `Extract`.
/// Restrict verification to a single member path; omit to verify
/// every member.
#[arg(value_name = "PATH")]
path: Option<String>,
/// Print an `OK` line for every successfully-verified member.
/// Without this flag, verify is silent on success and only
/// reports mismatches.
#[arg(short = 'v', long = "verbose")]
verbose: bool,
},
}
/// Parse a human-readable chunk size into a byte count.
///
/// The input is a run of ASCII digits optionally followed by a unit suffix.
/// Suffixes are matched case-insensitively and are all binary multiples:
///
/// * none or `B` — bytes
/// * `K`, `KB`, `KiB` — 1024 bytes
/// * `M`, `MB`, `MiB` — 1024² bytes
/// * `G`, `GB`, `GiB` — 1024³ bytes
///
/// Leading/trailing whitespace and whitespace between the number and the
/// suffix (`"4 M"`) are ignored.
///
/// # Errors
///
/// Returns a human-readable message when the input is empty, does not start
/// with digits, has an unknown suffix, is zero, or does not fit in `usize`.
fn parse_size(value: &str) -> Result<usize, String> {
    const KIB: usize = 1024;

    let value = value.trim();
    if value.is_empty() {
        return Err("chunk size cannot be empty".to_owned());
    }

    // Split at the first non-digit; everything after it is the unit suffix.
    let split_idx = value
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(value.len());
    let (digits, suffix) = value.split_at(split_idx);
    if digits.is_empty() {
        return Err("chunk size must start with digits".to_owned());
    }

    let base = digits
        .parse::<usize>()
        .map_err(|error| format!("invalid chunk size number: {error}"))?;

    // Tolerate a space between the number and its unit, e.g. "4 MiB".
    let scale = match suffix.trim_start().to_ascii_lowercase().as_str() {
        "" | "b" => 1usize,
        "k" | "kb" | "kib" => KIB,
        "m" | "mb" | "mib" => KIB * KIB,
        "g" | "gb" | "gib" => KIB * KIB * KIB,
        _ => return Err(format!("invalid chunk size suffix: {suffix}")),
    };

    // A zero-byte chunk cannot describe a usable chunk boundary, so reject it
    // here with a clear message instead of passing it on to the caller.
    if base == 0 {
        return Err("chunk size must be greater than zero".to_owned());
    }

    base.checked_mul(scale)
        .ok_or_else(|| "chunk size is too large".to_owned())
}
/// Normalise an optional stream path: a missing path or a literal `-`
/// both yield `None` (meaning "use stdin/stdout"); any other path is
/// returned unchanged.
fn resolve_stream(path: Option<PathBuf>) -> Option<PathBuf> {
    let candidate = path?;
    if candidate.as_os_str() == "-" {
        None
    } else {
        Some(candidate)
    }
}
/// Program entry point.
///
/// Restores the default SIGPIPE disposition (Unix only), installs the
/// tracing subscriber, parses the command line, and forwards the parsed
/// arguments to the `run` function of the selected subcommand's module,
/// returning whatever that function returns.
fn main() -> Result<()> {
    // The Rust runtime starts with SIGPIPE ignored, which turns a write to a
    // closed pipe into EPIPE and makes `println!` panic. Putting the default
    // handler back lets the OS end the process quietly instead, which is what
    // users expect from something like `tarzan list | head`.
    #[cfg(unix)]
    // SAFETY: runs exactly once, before any thread is spawned; SIG_DFL is a
    // valid handler value.
    unsafe {
        libc::signal(libc::SIGPIPE, libc::SIG_DFL);
    }

    // Use the filter from the default environment variable when it is
    // available; otherwise fall back to the `warn` directive.
    let log_filter =
        tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| "warn".into());
    tracing_subscriber::fmt()
        .with_env_filter(log_filter)
        .with_target(false)
        .compact()
        .init();

    match Cli::parse().command {
        Commands::Wrap {
            input,
            file,
            chunk_size,
            level,
            verbose,
        } => {
            // `None` on either side means "use the standard stream".
            let source = resolve_stream(input);
            let sink = resolve_stream(file);
            cmd_wrap::run(
                source.as_deref(),
                sink.as_deref(),
                chunk_size,
                level,
                verbose,
            )
        }
        Commands::List {
            file: archive,
            verbose,
            json,
            utc,
            paths,
        } => cmd_list::run(&archive, verbose, json, utc, &paths),
        Commands::Cat {
            file: archive,
            path: member,
        } => cmd_cat::run(&archive, &member),
        Commands::Extract {
            file: archive,
            directory: dest_dir,
            strip_components,
            exclude,
            no_mtime,
            verbose,
            paths,
        } => {
            // The CLI exposes the negative flag; the extractor takes the
            // positive "restore mtimes" form.
            let restore_mtime = !no_mtime;
            cmd_extract::run(
                &archive,
                &dest_dir,
                strip_components,
                exclude,
                paths,
                restore_mtime,
                verbose,
            )
        }
        Commands::Info {
            file: archive,
            json,
        } => cmd_info::run(&archive, json),
        Commands::Verify {
            file: archive,
            path: member,
            verbose,
        } => cmd_verify::run(&archive, member.as_deref(), verbose),
    }
}