tarzan 0.2.0

Random-access, seekable .tar.zst archives with an embedded table-of-contents index
Documentation
use anyhow::{Context, Result, bail};
use serde::{Deserialize, Serialize};

use super::{FRAME_TYPE_TOC, encode_skippable_frame, identity::IDENTITY_MAGIC};

/// TOC payload format version: the byte `encode_toc_frame` writes after the
/// frame-type byte, and the only value `decode_toc_payload` accepts.
pub const TOC_VERSION_V1: u8 = 1;

/// Root of the table of contents.
///
/// Serialized to JSON and zstd-compressed by `encode_toc_frame`; recovered
/// from a frame payload by `decode_toc_payload`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TocFrame {
    /// Version number stored inside the JSON document.
    /// NOTE(review): its relationship to the version byte in the payload
    /// header (`TOC_VERSION_V1`) is not visible in this file — confirm.
    pub tarzan_version: u8,
    /// Entries of the table of contents, presumably one per archive member
    /// (ordering is not established here — confirm against the writer).
    pub members: Vec<TocMember>,
}

/// A single entry of the table of contents, describing one archive member.
///
/// `link_target` and `content_sha256` are left out of the serialized JSON
/// when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TocMember {
    /// Path of the member. NOTE(review): presumably as recorded in the tar
    /// header — confirm whether any normalization is applied by the writer.
    pub path: String,
    /// Kind of entry; stored under the JSON key `type`.
    #[serde(rename = "type")]
    pub entry_type: EntryType,
    /// Size of the member; presumably content bytes, excluding tar headers
    /// and padding — confirm.
    pub size: u64,
    /// Permission/mode bits; presumably mirrors the tar header — confirm.
    pub mode: u32,
    /// Owner user id; presumably mirrors the tar header — confirm.
    pub uid: u64,
    /// Owner group id; presumably mirrors the tar header — confirm.
    pub gid: u64,
    /// Modification time; presumably seconds since the Unix epoch as in the
    /// tar header — confirm.
    pub mtime: i64,
    /// Offset associated with this member in the tar stream.
    /// NOTE(review): presumably the byte offset of the member's header in
    /// the uncompressed tar — confirm against the writer.
    pub tar_offset: u64,
    /// Link destination; presumably set only for symlinks and hard links —
    /// confirm. Omitted from the JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub link_target: Option<String>,
    /// SHA-256 of the member's file content (no headers, no padding), as
    /// 64-character lowercase hex. Populated for regular files; `None` for
    /// directories, symlinks, hard links, and device nodes. Format and value
    /// match `sha256sum`'s output, so users can verify against on-disk files
    /// without invoking tarzan.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_sha256: Option<String>,
    /// Location records for the compressed data backing this member; see
    /// [`ChunkInfo`] for the per-chunk fields.
    pub chunks: Vec<ChunkInfo>,
}

/// Kind of archive entry recorded for a [`TocMember`].
///
/// Variants are serialized as snake_case strings, e.g. `HardLink` becomes
/// `"hard_link"` and `CharDevice` becomes `"char_device"`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EntryType {
    /// Serialized as `"file"`.
    File,
    /// Serialized as `"dir"`.
    Dir,
    /// Serialized as `"symlink"`.
    Symlink,
    /// Serialized as `"hard_link"`.
    HardLink,
    /// Serialized as `"char_device"`.
    CharDevice,
    /// Serialized as `"block_device"`.
    BlockDevice,
    /// Serialized as `"fifo"`.
    Fifo,
    /// Serialized as `"other"`. Presumably the catch-all for entry kinds not
    /// listed above — confirm against the writer.
    Other,
}

/// Location of one piece of a member's data within the archive.
///
/// NOTE(review): the units and reference points of the offset/size fields
/// are inferred from their names; confirm against the format documentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkInfo {
    /// Presumably the byte offset of the compressed frame within the archive
    /// file — confirm.
    pub compressed_offset: u64,
    /// Presumably the compressed frame's length in bytes — confirm.
    pub compressed_size: u64,
    /// Presumably the number of decompressed bytes this chunk contributes —
    /// confirm.
    pub uncompressed_size: u64,
    /// Offset of this member's bytes within the frame's decompressed output.
    /// Zero unless the member shares a frame with other (small) members; see
    /// the grouping notes in the format documentation.
    ///
    /// Omitted from the JSON when zero, and read back as zero when the key
    /// is absent.
    #[serde(default, skip_serializing_if = "is_zero")]
    pub frame_offset: u64,
}

/// Predicate for serde's `skip_serializing_if`: reports whether the value is
/// zero. Takes a reference because serde calls the predicate with `&T`.
fn is_zero(n: &u64) -> bool {
    matches!(*n, 0)
}

/// Serializes a `TocFrame` and wraps it in a tarzan skippable frame that can
/// be appended to an archive.
///
/// The frame payload is, in order: the `TRZN` identity magic, the
/// `FRAME_TYPE_TOC` byte, the `TOC_VERSION_V1` byte, and the TOC JSON
/// compressed with zstd at `level`.
///
/// # Errors
///
/// Fails if the TOC cannot be serialized to JSON or if zstd compression
/// fails.
pub fn encode_toc_frame(toc: &TocFrame, level: i32) -> Result<Vec<u8>> {
    let toc_json = serde_json::to_vec(toc).context("failed to serialize TOC to JSON")?;
    let body = zstd::bulk::compress(&toc_json, level).context("failed to compress TOC JSON")?;

    // Magic + two tag bytes (frame type, version) + compressed body.
    let mut payload = Vec::with_capacity(IDENTITY_MAGIC.len() + 2 + body.len());
    payload.extend_from_slice(IDENTITY_MAGIC.as_slice());
    payload.push(FRAME_TYPE_TOC);
    payload.push(TOC_VERSION_V1);
    payload.extend_from_slice(body.as_slice());

    Ok(encode_skippable_frame(&payload))
}

/// Upper bound (1 GiB) on how many bytes of JSON the TOC may decompress to.
///
/// A zstd skippable frame can legally carry up to ~4 GiB of payload, and that
/// payload can expand to many times its size when decompressed. Refusing
/// anything past this bound protects against both accidentally
/// zip-bomb-shaped TOCs and deliberately malicious input. 1 GiB of JSON is
/// roughly 4 million members at typical entry sizes; raise the cap
/// deliberately if an archive genuinely needs more.
pub const MAX_TOC_DECOMPRESSED_BYTES: u64 = 1 << 30;

/// Parses the payload of a TOC frame, i.e. the bytes that follow the 8-byte
/// skippable-frame header.
///
/// Expected layout: `TRZN` magic (4 bytes), `FRAME_TYPE_TOC` (1 byte), a
/// version byte, then zstd-compressed JSON describing a `TocFrame`.
///
/// # Errors
///
/// Fails when the payload is shorter than the 6-byte prefix, the magic,
/// frame type, or version does not match, decompression fails, the
/// decompressed JSON is larger than `MAX_TOC_DECOMPRESSED_BYTES`, or the JSON
/// does not deserialize into a `TocFrame`.
pub fn decode_toc_payload(payload: &[u8]) -> Result<TocFrame> {
    use std::io::Read;

    // Fixed prefix: 4 magic bytes + frame-type byte + version byte.
    let byte_count = payload.len();
    if byte_count < 6 {
        bail!(
            "TOC payload too short: {} bytes (expected ≥6)",
            byte_count
        );
    }
    if payload[..4] != IDENTITY_MAGIC {
        bail!("TOC payload does not begin with TRZN");
    }

    let frame_type = payload[4];
    if frame_type != FRAME_TYPE_TOC {
        bail!("unexpected frame type in TOC payload: {:#04x}", frame_type);
    }

    let version = payload[5];
    if version != TOC_VERSION_V1 {
        bail!("unsupported TOC version: {version}");
    }

    let compressed = std::io::Cursor::new(&payload[6..]);
    let decoder = zstd::stream::read::Decoder::new(compressed)
        .context("failed to create zstd decoder for TOC")?;

    // Allow one byte beyond the cap: a single bounded read is then enough to
    // tell "exactly at the limit" apart from "over the limit".
    let mut bounded = decoder.take(MAX_TOC_DECOMPRESSED_BYTES + 1);
    let mut json = Vec::new();
    bounded
        .read_to_end(&mut json)
        .context("failed to decompress TOC JSON")?;

    let produced = json.len() as u64;
    if produced > MAX_TOC_DECOMPRESSED_BYTES {
        bail!("decompressed TOC exceeds the {MAX_TOC_DECOMPRESSED_BYTES}-byte safety cap");
    }

    serde_json::from_slice(&json).context("failed to deserialize TOC JSON")
}