cindy 0.2.1

Managing infrastructure at breakneck speed.
Documentation
//! Extract an archive into a directory on the remote machine.
//!
//! Supports tar (plain / gzip / xz / zstd), zip and 7z, all through
//! pure-Rust codecs. The source [`Format`] is inferred from the
//! archive's extension unless given explicitly.
//!
//! Extraction is **not** strictly idempotent: it always unpacks and
//! reports [`Return::Changed`] (it can't cheaply prove the destination
//! already holds an identical tree).

use std::path::{Path, PathBuf};

use crate as cindy;
use crate::Context;

use super::Format;

/// Input for [`unarchive`]: which archive to unpack, where to unpack
/// it, and (optionally) which format to treat it as.
#[derive(Clone, Default, PartialEq, Eq)]
#[crate::wire]
pub struct State {
    /// Archive file to extract. Also the path the format is inferred
    /// from when `format` is `None`.
    pub src: PathBuf,
    /// Directory to extract into (created if missing).
    pub dest: PathBuf,
    /// Archive format. `None` ⇒ infer from `src`'s extension via
    /// `Format::from_path`; an unrecognised extension is then an error.
    pub format: Option<Format>,
}

/// How an archive entry compares to what's already at the destination.
///
/// The comparison is deliberately cheap — existence and (for files)
/// byte size — rather than a full content hash. It's a human-facing
/// summary, not a correctness gate, so a same-size-different-content
/// edit reading as `Unchanged` is an acceptable trade for not hashing
/// every entry of a large archive.
///
/// `Debug` is derived so a status can appear in assertion and log
/// output.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum EntryStatus {
    /// Nothing exists at the destination path yet.
    Added,
    /// Something exists at the destination path but doesn't match the
    /// archived entry (for files: not a regular file, or a different
    /// size).
    Changed,
    /// Already present and matching: a regular file of the same size,
    /// or an already-existing path for a directory entry.
    Unchanged,
}

/// One archive entry's metadata, format-agnostic, used for the summary.
///
/// Built by the `list_*` helpers from each format's own entry type; it
/// carries only what the destination comparison needs.
struct Entry {
    /// Path relative to the archive root, exactly as the archive
    /// stores it (it is not normalised here).
    path: PathBuf,
    /// Whether the archive marks this entry as a directory.
    is_dir: bool,
    /// Uncompressed size in bytes (meaningless for directories).
    size: u64,
}

impl Entry {
    /// Classify this entry against the on-disk destination tree.
    ///
    /// * Nothing at `dest/<path>` (or it can't be stat'ed) → `Added`.
    /// * Directory entry: `Unchanged` only when the target already is a
    ///   directory (or a symlink resolving to one); a regular file or
    ///   dangling link sitting where the directory should go is
    ///   `Changed`.
    /// * File entry: `Unchanged` only when the target is a regular file
    ///   of exactly `size` bytes; anything else is `Changed`.
    ///
    /// The check is metadata-only — no file contents are read.
    ///
    /// NOTE(review): `Path::join` discards `dest` when `self.path` is
    /// absolute, and `..` components are not resolved, so a hostile
    /// archive can make this stat a path outside `dest`. This only
    /// affects the informational summary, not what gets extracted.
    fn status(&self, dest: &Path) -> EntryStatus {
        let target = dest.join(&self.path);

        // Don't follow symlinks for the existence probe: a dangling link
        // still occupies the path and must not read as `Added`.
        let meta = match std::fs::symlink_metadata(&target) {
            Ok(meta) => meta,
            Err(_) => return EntryStatus::Added,
        };

        let matches_entry = if self.is_dir {
            // Existence alone isn't enough — a plain file in the way of
            // a directory is a real difference. A symlink that resolves
            // to a directory is accepted, so it isn't reported as a
            // change.
            meta.is_dir()
                || std::fs::metadata(&target)
                    .map(|followed| followed.is_dir())
                    .unwrap_or(false)
        } else {
            meta.is_file() && meta.len() == self.size
        };

        if matches_entry {
            EntryStatus::Unchanged
        } else {
            EntryStatus::Changed
        }
    }
}

/// Maximum number of changed/added entries to name individually before
/// collapsing the rest into a "… and N more" line, so extracting a
/// huge archive doesn't flood stderr.
///
/// Only the per-entry lines are capped; the final added/changed/
/// unchanged totals always cover every entry.
const SUMMARY_LIST_CAP: usize = 20;

/// Render a compact, `path`-style summary of what the extraction will
/// change at `dest`, to stderr.
///
/// Lines are prefixed `+` (added), `~` (changed) and counted; unchanged
/// entries are tallied but not listed. At most [`SUMMARY_LIST_CAP`]
/// entries are named, the remainder being folded into a single
/// "… and N more" line, followed by the overall totals. This is the
/// archive-appropriate granularity: per-entry visibility without a full
/// `path` struct diff for every one of potentially thousands of files.
///
/// The output is purely informational, so write failures are ignored:
/// the lines go through `writeln!` with the result discarded rather
/// than `eprintln!`, which would panic if stderr is closed or
/// otherwise unwritable.
fn render_summary(entries: &[Entry], dest: &Path) {
    use std::io::Write as _;

    let mut added = 0usize;
    let mut changed = 0usize;
    let mut unchanged = 0usize;
    let mut listed = 0usize;

    // Lock once so the summary isn't interleaved with other stderr
    // writers and we don't re-lock for every line.
    let stderr = std::io::stderr();
    let mut err = stderr.lock();

    for entry in entries {
        let mark = match entry.status(dest) {
            EntryStatus::Added => {
                added += 1;
                '+'
            }
            EntryStatus::Changed => {
                changed += 1;
                '~'
            }
            EntryStatus::Unchanged => {
                unchanged += 1;
                continue;
            }
        };
        if listed < SUMMARY_LIST_CAP {
            let _ = writeln!(err, "  {mark} {}", entry.path.display());
            listed += 1;
        }
    }

    // `listed` only ever grows alongside `added`/`changed`, so this
    // subtraction cannot underflow.
    let extra = added + changed - listed;
    if extra > 0 {
        let _ = writeln!(err, "  … and {extra} more");
    }
    let _ = writeln!(
        err,
        "  {added} added, {changed} changed, {unchanged} unchanged"
    );
}

/// Enumerate the entries of an already-decompressed tar stream.
///
/// Only the per-entry path, size and directory flag are collected;
/// nothing is written to disk. Fails on the first unreadable entry or
/// on an entry whose path can't be obtained.
fn list_tar(tar_bytes: &[u8]) -> crate::Result<Vec<Entry>> {
    let reader = std::io::Cursor::new(tar_bytes);
    let mut tarball = tar::Archive::new(reader);
    let records = tarball.entries().context("Couldn't read tar entries")?;

    let listing: crate::Result<Vec<Entry>> = records
        .map(|record| -> crate::Result<Entry> {
            let record = record.context("Couldn't read a tar entry")?;
            let path = record.path().context("tar entry has no path")?.into_owned();
            Ok(Entry {
                path,
                is_dir: record.header().entry_type().is_dir(),
                size: record.size(),
            })
        })
        .collect();
    listing
}

/// Enumerate the entries of the ZIP archive at `src`.
///
/// Each member is looked up by index and only its name, directory flag
/// and uncompressed size are recorded; nothing is written to disk.
fn list_zip(src: &Path) -> crate::Result<Vec<Entry>> {
    let handle = std::fs::File::open(src).context(format!("Couldn't open {}", src.display()))?;
    let mut zip_archive =
        zip::ZipArchive::new(handle).context(format!("Couldn't open zip {}", src.display()))?;

    let total = zip_archive.len();
    let mut listing = Vec::with_capacity(total);
    (0..total).try_for_each(|index| -> crate::Result<()> {
        let member = zip_archive
            .by_index(index)
            .context("Couldn't read a zip entry")?;
        listing.push(Entry {
            path: PathBuf::from(member.name()),
            is_dir: member.is_dir(),
            size: member.size(),
        });
        Ok(())
    })?;
    Ok(listing)
}

/// Enumerate the entries of the 7z archive at `src` from its header
/// (no decompression).
///
/// A failure to open/parse the archive is reported with the underlying
/// error rendered into the message.
fn list_7z(src: &Path) -> crate::Result<Vec<Entry>> {
    let header = sevenz_rust2::Archive::open(src)
        .map_err(|e| anyhow_serde::Error::msg(format!("Couldn't read 7z header: {e}")))?;

    let mut listing = Vec::with_capacity(header.files.len());
    for member in header.files.iter() {
        listing.push(Entry {
            path: PathBuf::from(member.name()),
            is_dir: member.is_directory(),
            size: member.size(),
        });
    }
    Ok(listing)
}

/// Write every entry of an already-decompressed tar stream under
/// `dest`, reporting the destination in the error on failure.
fn extract_tar(tar_bytes: &[u8], dest: &Path) -> crate::Result<()> {
    let reader = std::io::Cursor::new(tar_bytes);
    let outcome = tar::Archive::new(reader).unpack(dest);
    outcome.context(format!("Couldn't unpack tar into {}", dest.display()))
}

/// Open the ZIP archive at `src` and unpack all of it under `dest`.
///
/// Each stage (opening the file, reading the ZIP structure,
/// extracting) carries its own error context.
fn extract_zip(src: &Path, dest: &Path) -> crate::Result<()> {
    let handle = std::fs::File::open(src).context(format!("Couldn't open {}", src.display()))?;
    let mut zip_archive =
        zip::ZipArchive::new(handle).context(format!("Couldn't open zip {}", src.display()))?;

    let outcome = zip_archive.extract(dest);
    outcome.context(format!("Couldn't extract zip into {}", dest.display()))
}

/// Decompress the 7z archive at `src` under `dest`, naming both paths
/// in the error on failure.
fn extract_7z(src: &Path, dest: &Path) -> crate::Result<()> {
    let failure = format!(
        "Couldn't extract 7z {} into {}",
        src.display(),
        dest.display()
    );
    sevenz_rust2::decompress_file(src, dest).context(failure)
}

/// Extract an archive into a directory on the remote machine.
///
/// Steps, in order:
/// 1. Resolve the format — `state.format` if given, otherwise inferred
///    from `state.src` via `Format::from_path`.
/// 2. Establish `state.dest` as a directory through the `path` module.
/// 3. Print a one-line header and a per-entry summary to stderr.
/// 4. Run the format-specific extractor.
///
/// Always returns `Return::Changed` on success; no attempt is made to
/// detect that the destination already matches the archive.
///
/// # Errors
///
/// Propagates any failure from format inference, destination setup,
/// reading/decompressing the archive, listing its entries, or
/// extracting it. A listing failure aborts before anything is
/// extracted.
#[crate::remote]
pub fn unarchive(state: State) -> crate::Result<super::Return> {
    let format = match state.format {
        Some(f) => f,
        None => Format::from_path(&state.src)?,
    };

    // Establish the destination directory via the `path` module, so the
    // dir is created with correct transition handling (replacing a stale
    // file/symlink at `dest`) and the diff is emitted. `path` requires a
    // total spec, so the directory is owned by the worker's identity at
    // a conventional `0o755`. The archive's individual entries are then
    // written by the format extractor — that tree can't be modelled as
    // a single `path::State`, so extraction stays here.
    let (user, group) = super::current_owner_names();
    super::path::directory_raw::inner(state.dest.clone(), user, group, 0o755.into())?;

    eprintln!(
        "unarchive {} -> {} ({:?})",
        state.src.display(),
        state.dest.display(),
        format,
    );

    // List the entries first to render a compact per-entry summary
    // against the current destination tree, then run the bulk extractor
    // (the listing is header/metadata only — for tar it reuses the
    // already-decompressed bytes, so nothing is decompressed twice).
    match format {
        Format::Tar(codec) => {
            // The whole archive is read into memory and decompressed up
            // front; a tar with no codec goes through `Codec::Store`.
            let raw = std::fs::read(&state.src)
                .context(format!("Couldn't read {}", state.src.display()))?;
            let tar_bytes =
                super::decompress::decompress::inner(raw, codec.unwrap_or(super::Codec::Store))?;
            render_summary(&list_tar(&tar_bytes)?, &state.dest);
            extract_tar(&tar_bytes, &state.dest)?;
        }
        Format::Zip => {
            // Listing and extraction each open `src` independently.
            render_summary(&list_zip(&state.src)?, &state.dest);
            extract_zip(&state.src, &state.dest)?;
        }
        Format::SevenZ => {
            // Listing and extraction each open `src` independently.
            render_summary(&list_7z(&state.src)?, &state.dest);
            extract_7z(&state.src, &state.dest)?;
        }
    }

    // Unconditional: see the module docs on (non-)idempotence.
    Ok(super::Return::Changed)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::builtin::{Codec, Format, archive};

    /// Build a two-file source tree, pack it with `archive`, unpack it
    /// with `unarchive`, and check both files come back byte-for-byte.
    /// `ext` is the suffix given to the archive's file name.
    fn round_trip(format: Format, ext: &str) {
        let scratch = tempfile::tempdir().unwrap();
        let root = scratch.path();

        // Source tree: one text file at the top, one binary file nested.
        let payload = root.join("payload");
        std::fs::create_dir_all(payload.join("nested")).unwrap();
        std::fs::write(payload.join("a.txt"), b"hello world").unwrap();
        std::fs::write(payload.join("nested/b.bin"), [0u8, 1, 2, 3, 255]).unwrap();

        // Pack.
        let packed = root.join(format!("out{ext}"));
        let pack_request = archive::State {
            sources: vec![payload],
            dest: packed.clone(),
            format: Some(format),
            // archive hands the write to the `path` module, whose
            // default file mode is `0o000`; ask for a readable mode so
            // unarchive can open the result.
            mode: Some(0o644.into()),
        };
        let packed_outcome = match archive::archive::inner(pack_request) {
            Ok(outcome) => outcome,
            Err(e) => panic!("archive {format:?} failed: {e}"),
        };
        assert!(packed_outcome.changed(), "archive should report Changed");
        assert!(packed.exists(), "archive file should exist");

        // Unpack.
        let unpacked = root.join("extracted");
        let unpack_request = State {
            src: packed,
            dest: unpacked.clone(),
            format: Some(format),
        };
        if let Err(e) = unarchive::inner(unpack_request) {
            panic!("unarchive {format:?} failed: {e}");
        }

        // Verify both files survived intact.
        let expected: [(&str, &str, &[u8]); 2] = [
            ("payload/a.txt", "a.txt", b"hello world"),
            ("payload/nested/b.bin", "b.bin", &[0u8, 1, 2, 3, 255]),
        ];
        for (relative, label, want) in expected {
            let got = std::fs::read(unpacked.join(relative))
                .unwrap_or_else(|e| panic!("missing {label} for {format:?}: {e}"));
            assert_eq!(got, want, "{label} content mismatch for {format:?}");
        }
    }

    #[test]
    fn round_trip_tar() {
        round_trip(Format::Tar(None), ".tar");
    }

    #[test]
    fn round_trip_tar_gz() {
        round_trip(Format::Tar(Some(Codec::Gzip)), ".tar.gz");
    }

    #[test]
    fn round_trip_tar_xz() {
        round_trip(Format::Tar(Some(Codec::Xz)), ".tar.xz");
    }

    #[test]
    fn round_trip_tar_zst() {
        round_trip(Format::Tar(Some(Codec::Zstd)), ".tar.zst");
    }

    #[test]
    fn round_trip_zip() {
        round_trip(Format::Zip, ".zip");
    }

    #[test]
    fn round_trip_7z() {
        round_trip(Format::SevenZ, ".7z");
    }

    #[test]
    fn entry_status_classification() {
        let scratch = tempfile::tempdir().unwrap();
        let dest = scratch.path();
        std::fs::write(dest.join("same.txt"), b"1234").unwrap();
        std::fs::write(dest.join("resized.txt"), b"12").unwrap();
        std::fs::create_dir(dest.join("adir")).unwrap();

        let classify = |path: &str, is_dir: bool, size: u64| {
            Entry {
                path: path.into(),
                is_dir,
                size,
            }
            .status(dest)
        };

        // Absent at destination → Added.
        assert!(classify("new.txt", false, 9) == EntryStatus::Added);
        // Present, same size → Unchanged.
        assert!(classify("same.txt", false, 4) == EntryStatus::Unchanged);
        // Present, different size → Changed.
        assert!(classify("resized.txt", false, 99) == EntryStatus::Changed);
        // Existing directory → Unchanged.
        assert!(classify("adir", true, 0) == EntryStatus::Unchanged);
    }

    #[test]
    fn format_inference() {
        let infer = |name: &str| Format::from_path(std::path::Path::new(name));

        let expectations = [
            ("x.tar", Format::Tar(None)),
            ("x.tar.gz", Format::Tar(Some(Codec::Gzip))),
            ("x.tgz", Format::Tar(Some(Codec::Gzip))),
            ("x.tar.xz", Format::Tar(Some(Codec::Xz))),
            ("x.tar.zst", Format::Tar(Some(Codec::Zstd))),
            ("x.zip", Format::Zip),
            ("x.7z", Format::SevenZ),
        ];
        for (name, want) in expectations {
            assert_eq!(infer(name).unwrap(), want);
        }

        // Unsupported extensions are rejected rather than guessed.
        assert!(infer("x.rar").is_err());
    }
}