tarzan 0.4.0

Random-access, seekable .tar.zst archives with an embedded table-of-contents index
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

use anyhow::{Context, Result, anyhow, bail};
use filetime::FileTime;
use glob::Pattern;
use tracing::warn;

use crate::filter::PathFilter;
use crate::format::toc::{EntryType, TocMember};
use crate::reader::TarzanReader;

/// Knobs accepted by [`TarzanReader::extract_to_dir`].
///
/// The [`Default`] value strips nothing, applies no include or exclude
/// patterns, restores mtimes, and treats corrupted chunks as fatal.
#[derive(Debug, Clone)]
pub struct ExtractOptions {
    /// How many leading path components are removed from every member,
    /// the equivalent of `tar --strip-components=N`. A member left with
    /// too few components after stripping is skipped.
    pub strip_components: usize,
    /// Shell-glob patterns; a member matching any of them is skipped.
    pub excludes: Vec<String>,
    /// When non-empty, restricts extraction to members that match at
    /// least one pattern by exact path, directory prefix, or shell glob.
    pub includes: Vec<String>,
    /// Whether each member's recorded mtime is restored. When false,
    /// extracted entries keep whatever timestamp the filesystem assigns
    /// at creation. Defaults to true.
    pub restore_mtime: bool,
    /// When a regular-file member cannot be extracted because of a
    /// corrupted data chunk (zstd decode error, unexpected EOF mid-frame,
    /// …), log a warning and carry on with the remaining members instead
    /// of aborting the whole extraction. Defaults to false.
    pub skip_bad_chunks: bool,
}

impl Default for ExtractOptions {
    /// Builds the conservative baseline described on the struct.
    fn default() -> Self {
        let excludes: Vec<String> = Vec::new();
        let includes: Vec<String> = Vec::new();
        Self {
            skip_bad_chunks: false,
            restore_mtime: true,
            includes,
            excludes,
            strip_components: 0,
        }
    }
}

/// Filesystem actions deferred to a second pass after the main walk:
/// directory mtimes (children must be in place first) and hard links
/// (their target file must be extracted first).
///
/// Both lists are appended to by `extract_one` and consumed by
/// `extract_to_dir` once every member has been visited.
#[derive(Default)]
struct Deferred {
    /// (directory path, atime to apply, mtime to apply)
    dir_times: Vec<(PathBuf, FileTime, FileTime)>,
    /// (member path for diagnostics, link source, link target), where the
    /// link source is the on-disk path of the file being linked to and
    /// the link target is the path at which the new link is created.
    hard_links: Vec<(String, PathBuf, PathBuf)>,
}

impl TarzanReader {
    /// Extracts archive members onto the filesystem under `dest`.
    ///
    /// Creates `dest` (and any missing parent directories) as needed.
    /// Refuses to extract members whose path is absolute or contains a
    /// `..` component, to keep the result inside `dest`.
    ///
    /// Hard links are reconstructed once their target file is on disk.
    /// Character/block devices and FIFOs are skipped with a warning.
    ///
    /// `on_extracted` is invoked after each member is successfully
    /// written, with the member's archive path. Useful for verbose
    /// progress output.
    pub fn extract_to_dir<F>(
        &mut self,
        dest: &Path,
        opts: &ExtractOptions,
        mut on_extracted: F,
    ) -> Result<()>
    where
        F: FnMut(&str),
    {
        let includes = PathFilter::new(&opts.includes).context("invalid include/filter pattern")?;
        let excludes = compile_patterns(&opts.excludes).context("invalid exclude pattern")?;

        fs::create_dir_all(dest)
            .with_context(|| format!("creating destination {}", dest.display()))?;

        let mut deferred = Deferred::default();

        // Clone the member list so the loop can call `&mut self` methods
        // (extraction seeks the source) while iterating.
        let members = self.members().to_vec();
        for member in &members {
            if !includes.matches(&member.path) {
                continue;
            }
            if member_excluded(&member.path, &excludes) {
                continue;
            }
            let rel = match member_relative_path(member, opts.strip_components)? {
                Some(p) if !p.as_os_str().is_empty() => p,
                _ => continue,
            };
            let target = dest.join(&rel);
            self.extract_one(member, &target, dest, opts, &mut deferred)?;
            on_extracted(&member.path);
        }

        // Hard links: every regular file is on disk now, so their targets
        // resolve. Created before directory mtimes are stamped, since
        // adding a link bumps the containing directory's mtime.
        for (member_path, source, target) in deferred.hard_links {
            if let Some(parent) = target.parent() {
                fs::create_dir_all(parent)
                    .with_context(|| format!("creating {}", parent.display()))?;
            }
            if !source.exists() {
                warn!(
                    path = %member_path,
                    source = %source.display(),
                    "hard-link target was not extracted; skipping"
                );
                continue;
            }
            // Replace any existing entry so hard_link does not fail with EEXIST.
            let _ = fs::remove_file(&target);
            fs::hard_link(&source, &target).with_context(|| {
                format!(
                    "creating hard link {} -> {}",
                    target.display(),
                    source.display()
                )
            })?;
        }

        // Directory mtimes last: writing children (files, subdirs, hard
        // links) bumps the parent's mtime back to "now".
        for (path, atime, mtime) in deferred.dir_times {
            filetime::set_file_times(&path, atime, mtime)
                .with_context(|| format!("setting file times on directory {}", path.display()))?;
        }

        Ok(())
    }

    fn extract_one(
        &mut self,
        member: &TocMember,
        target: &Path,
        dest: &Path,
        opts: &ExtractOptions,
        deferred: &mut Deferred,
    ) -> Result<()> {
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).with_context(|| format!("creating {}", parent.display()))?;
        }
        let mtime = member_mtime(member);
        let atime = member_atime(member, mtime);
        match member.entry_type {
            EntryType::Dir => {
                fs::create_dir_all(target)
                    .with_context(|| format!("creating dir {}", target.display()))?;
                set_unix_mode(target, member.mode)?;
                apply_member_xattrs(target, member)?;
                if opts.restore_mtime {
                    deferred
                        .dir_times
                        .push((target.to_path_buf(), atime, mtime));
                }
            }
            EntryType::File => {
                let file = File::create(target)
                    .with_context(|| format!("creating file {}", target.display()))?;
                let mut writer = BufWriter::new(file);
                match self.extract_member(&member.path, &mut writer) {
                    Ok(()) => {
                        writer.flush()?;
                        set_unix_mode(target, member.mode)?;
                        apply_member_xattrs(target, member)?;
                        if opts.restore_mtime {
                            filetime::set_file_times(target, atime, mtime).with_context(|| {
                                format!("setting file times on {}", target.display())
                            })?;
                        }
                    }
                    Err(err) if opts.skip_bad_chunks => {
                        // Drop the writer first so the partial file is closed
                        // before we remove it.
                        drop(writer);
                        let _ = fs::remove_file(target);
                        warn!(
                            path = %member.path,
                            error = format!("{err:#}"),
                            "skipping member with unreadable data (--skip-bad-chunks)"
                        );
                    }
                    Err(err) => return Err(err),
                }
            }
            EntryType::Symlink => {
                create_member_symlink(member, target)?;
                if opts.restore_mtime {
                    filetime::set_symlink_file_times(target, atime, mtime).with_context(|| {
                        format!("setting mtime on symlink {}", target.display())
                    })?;
                }
            }
            EntryType::HardLink => {
                // The link's target is another member, by archive path.
                // Defer creation until that file has been written; no
                // mtime fixup — a hard link shares the target's inode,
                // which already carries the right timestamp.
                match member_link_target_relative_path(member, opts.strip_components)? {
                    Some(src_rel) if !src_rel.as_os_str().is_empty() => {
                        deferred.hard_links.push((
                            member.path.clone(),
                            dest.join(src_rel),
                            target.to_path_buf(),
                        ));
                    }
                    _ => warn!(
                        path = %member.path,
                        "hard-link target stripped away; skipping"
                    ),
                }
            }
            EntryType::CharDevice | EntryType::BlockDevice | EntryType::Fifo | EntryType::Other => {
                if matches!(member.entry_type, EntryType::Other)
                    && let Some(raw) = member.raw_type_byte
                {
                    warn!(
                        path = %member.path,
                        raw_type = format!("{} (0x{raw:02x})", raw as char),
                        "skipping unsupported entry type"
                    );
                } else {
                    warn!(path = %member.path, "skipping unsupported entry type");
                }
            }
        }
        Ok(())
    }
}

/// Compiles every raw exclude pattern into a glob [`Pattern`].
///
/// Each pattern is passed through `normalize_for_match` before it is
/// compiled. The first pattern that fails to compile aborts the whole
/// call with an error naming it.
fn compile_patterns(raw: &[String]) -> Result<Vec<Pattern>> {
    let mut compiled = Vec::with_capacity(raw.len());
    for s in raw {
        match Pattern::new(normalize_for_match(s)) {
            Ok(glob) => compiled.push(glob),
            Err(e) => return Err(anyhow!("invalid pattern `{s}`: {e}")),
        }
    }
    Ok(compiled)
}

/// Canonicalises a path or pattern for glob comparison: every leading
/// `./` and every trailing `/` is removed. The result borrows from `s`.
fn normalize_for_match(s: &str) -> &str {
    let mut rest = s;
    // Leading `./` prefixes first, repeatedly ...
    while let Some(tail) = rest.strip_prefix("./") {
        rest = tail;
    }
    // ... then any number of trailing slashes.
    while let Some(head) = rest.strip_suffix('/') {
        rest = head;
    }
    rest
}

/// Reports whether `path`, once normalised, matches any of the compiled
/// exclude globs.
fn member_excluded(path: &str, compiled: &[Pattern]) -> bool {
    let candidate = normalize_for_match(path);
    for glob in compiled {
        if glob.matches(candidate) {
            return true;
        }
    }
    false
}

/// Computes the destination-relative path of `member` after dropping
/// `strip` leading components.
///
/// On Unix the raw byte path is used when the member carries one;
/// otherwise the string path is used. `Ok(None)` means the strip left
/// nothing to extract.
fn member_relative_path(member: &TocMember, strip: usize) -> Result<Option<PathBuf>> {
    #[cfg(unix)]
    {
        if let Some(raw) = &member.path_bytes {
            return normalize_member_path_bytes(raw, strip);
        }
    }
    normalize_member_path(&member.path, strip)
}

/// Computes the destination-relative path of the file a hard-link member
/// points at, after dropping `strip` leading components.
///
/// On Unix the raw byte link target is used when present; otherwise the
/// string `link_target` is used. Fails when the member has no link
/// target at all.
fn member_link_target_relative_path(member: &TocMember, strip: usize) -> Result<Option<PathBuf>> {
    #[cfg(unix)]
    {
        if let Some(raw) = &member.link_target_bytes {
            return normalize_member_path_bytes(raw, strip);
        }
    }
    match member.link_target.as_deref() {
        Some(link_target) => normalize_member_path(link_target, strip),
        None => Err(anyhow!("hard link {} has no link_target", member.path)),
    }
}

/// Builds the member's modification time; a missing nanosecond part
/// counts as zero.
fn member_mtime(member: &TocMember) -> FileTime {
    let nanos = member.mtime_ns.unwrap_or(0);
    FileTime::from_unix_time(member.mtime, nanos)
}

/// Builds the member's access time, or returns `fallback` when the
/// member records none. A missing nanosecond part counts as zero.
fn member_atime(member: &TocMember, fallback: FileTime) -> FileTime {
    member.atime.map_or(fallback, |sec| {
        FileTime::from_unix_time(sec, member.atime_ns.unwrap_or(0))
    })
}

/// Applies every extended attribute recorded for `member` to `target`.
/// Does nothing when the member carries none; stops at the first
/// attribute that cannot be set.
#[cfg(unix)]
fn apply_member_xattrs(target: &Path, member: &TocMember) -> Result<()> {
    let Some(xattrs) = &member.xattrs else {
        return Ok(());
    };
    for (name, value) in xattrs {
        xattr::set(target, name, value)
            .with_context(|| format!("setting xattr {name} on {}", target.display()))?;
    }
    Ok(())
}

/// Non-Unix builds do not apply extended attributes; always succeeds.
#[cfg(not(unix))]
fn apply_member_xattrs(_target: &Path, _member: &TocMember) -> Result<()> {
    Ok(())
}

fn normalize_member_path(p: &str, strip: usize) -> Result<Option<PathBuf>> {
    if p.starts_with('/') {
        bail!("absolute path in archive (refusing to extract): {p}");
    }
    let mut parts: Vec<&str> = Vec::new();
    for part in p.split('/') {
        match part {
            "" | "." => continue,
            ".." => bail!("path contains `..` (refusing to extract): {p}"),
            s => parts.push(s),
        }
    }
    if parts.len() <= strip {
        return Ok(None);
    }
    Ok(Some(parts[strip..].iter().copied().collect()))
}

/// Byte-oriented twin of `normalize_member_path` for member paths that
/// are not valid UTF-8.
///
/// Splits `raw` on `/`, discards empty and `.` components, rejects a
/// leading `/` and any `..` component, then drops the first `strip`
/// components. Returns `Ok(None)` when nothing is left after the strip.
#[cfg(unix)]
fn normalize_member_path_bytes(raw: &[u8], strip: usize) -> Result<Option<PathBuf>> {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    if raw.first() == Some(&b'/') {
        bail!("absolute path in archive (refusing to extract)");
    }

    let mut kept: Vec<&[u8]> = Vec::new();
    for segment in raw.split(|&byte| byte == b'/') {
        if segment.is_empty() || segment == b"." {
            continue;
        }
        if segment == b".." {
            bail!("path contains `..` (refusing to extract)");
        }
        kept.push(segment);
    }

    if kept.len() <= strip {
        return Ok(None);
    }

    // Reassemble the surviving components verbatim, byte for byte.
    let rel: PathBuf = kept
        .iter()
        .skip(strip)
        .copied()
        .map(|segment| OsStr::from_bytes(segment))
        .collect();
    Ok(Some(rel))
}

/// Sets the permission bits of `target` from the member's recorded mode.
#[cfg(unix)]
fn set_unix_mode(target: &Path, mode: u32) -> Result<()> {
    use std::os::unix::fs::PermissionsExt;

    // Keep only the standard 12 permission bits; anything above may
    // encode the entry type and must not reach chmod.
    const PERMISSION_BITS: u32 = 0o7777;

    let permissions = fs::Permissions::from_mode(mode & PERMISSION_BITS);
    fs::set_permissions(target, permissions)
        .with_context(|| format!("setting mode on {}", target.display()))
}

/// Non-Unix builds do not apply modes; always succeeds.
#[cfg(not(unix))]
fn set_unix_mode(_target: &Path, _mode: u32) -> Result<()> {
    Ok(())
}

#[cfg(unix)]
fn create_symlink(link_target: &str, target: &Path) -> Result<()> {
    std::os::unix::fs::symlink(link_target, target)
        .with_context(|| format!("creating symlink {}", target.display()))?;
    Ok(())
}

#[cfg(unix)]
fn create_member_symlink(member: &TocMember, target: &Path) -> Result<()> {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    if let Some(raw) = &member.link_target_bytes {
        std::os::unix::fs::symlink(OsStr::from_bytes(raw), target)
            .with_context(|| format!("creating symlink {}", target.display()))?;
        return Ok(());
    }
    let link_target = member
        .link_target
        .as_deref()
        .ok_or_else(|| anyhow!("symlink {} has no link_target", member.path))?;
    create_symlink(link_target, target)
}

/// Non-Unix stand-in: symlinks cannot be created, so this always fails
/// with an error naming `target`.
#[cfg(not(unix))]
fn create_symlink(_link_target: &str, target: &Path) -> Result<()> {
    Err(anyhow!(
        "symlinks not supported on this platform ({})",
        target.display()
    ))
}

/// Non-Unix variant: reads the member's string `link_target` and hands
/// it to `create_symlink`. Fails when the member has no link target.
#[cfg(not(unix))]
fn create_member_symlink(member: &TocMember, target: &Path) -> Result<()> {
    match member.link_target.as_deref() {
        Some(link_target) => create_symlink(link_target, target),
        None => Err(anyhow!("symlink {} has no link_target", member.path)),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Normalises `path` and expects a surviving relative path.
    fn kept(path: &str, strip: usize) -> PathBuf {
        normalize_member_path(path, strip).unwrap().unwrap()
    }

    /// Normalises `path` and returns the rendered rejection message.
    fn rejection(path: &str) -> String {
        normalize_member_path(path, 0).unwrap_err().to_string()
    }

    #[test]
    fn normalize_rejects_absolute_path() {
        let msg = rejection("/etc/passwd");
        assert!(msg.contains("absolute"), "{msg}");
    }

    #[test]
    fn normalize_rejects_dotdot_components() {
        // Both a leading `..` and one buried mid-path must be refused.
        for path in ["../escaped.txt", "foo/../../bar"] {
            let msg = rejection(path);
            assert!(msg.contains(".."), "{msg}");
        }
    }

    #[test]
    fn normalize_strips_dot_and_empty_components() {
        assert_eq!(kept("./foo/./bar", 0), PathBuf::from("foo/bar"));
    }

    #[test]
    fn normalize_applies_strip_components() {
        assert_eq!(kept("./a/b/c.txt", 1), PathBuf::from("b/c.txt"));
        assert_eq!(kept("./a/b/c.txt", 2), PathBuf::from("c.txt"));
    }

    #[test]
    fn normalize_skips_when_strip_consumes_all() {
        // Stripping exactly as many, or more, components than exist
        // leaves nothing to extract.
        for (path, strip) in [("./a", 1), ("./a/b", 2), ("./a/b", 5)] {
            assert!(normalize_member_path(path, strip).unwrap().is_none());
        }
    }

    #[test]
    fn excludes_match_glob() {
        let compiled = compile_patterns(&["*.csv".to_owned()]).unwrap();
        assert!(member_excluded("data/numbers.csv", &compiled));
        assert!(!member_excluded("data/blob.bin", &compiled));
    }
}