forjar 1.6.1

Rust-native Infrastructure as Code — bare-metal first, BLAKE3 state, provenance tracing
Documentation
//! FJ-1346: FAR (Forjar ARchive) binary format — encode and decode.
//!
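//! Layout: magic → manifest_len → zstd(manifest_yaml) → chunk_count
//!       → chunk_table(hash+offset+len) → zstd(chunks) → sig_len → sig
//!
//! All length, count, and offset fields are little-endian `u64`. Each chunk is
//! zstd-compressed on its own and the compressed chunks are concatenated in
//! chunk-table order. The encoder in this file always writes `sig_len = 0`
//! (unsigned archive) and therefore no `sig` bytes.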

use serde::{Deserialize, Serialize};
use std::io::{Read, Write};

/// 12-byte magic identifying a FAR archive.
///
/// The ASCII text `FORJAR-FAR` followed by the bytes `0x00 0x01`. It is the
/// first thing `encode_far` writes and the first thing `decode_far_manifest`
/// checks; any mismatch is rejected as "invalid FAR magic".
// NOTE(review): the trailing `\x00\x01` looks like a format-version suffix — confirm.
pub const FAR_MAGIC: &[u8; 12] = b"FORJAR-FAR\x00\x01";

/// Upper bound on the compressed manifest length declared in the header.
///
/// The manifest is a small zstd-compressed YAML blob; a real one is a few KB.
/// We cap at 16 MiB so a corrupt/malicious header claiming a huge `manifest_len`
/// cannot drive an unbounded allocation (OOM abort / capacity-overflow panic).
///
/// This limits the *compressed* size read from the archive; it says nothing
/// about the size of the manifest after decompression.
const MAX_MANIFEST_LEN: usize = 16 * 1024 * 1024;

/// On-disk size in bytes of one serialized chunk-table entry: hash(32) + offset(8) + length(8).
///
/// Used by the decoder to reject a `chunk_count` whose table size in bytes
/// would not be representable in a `u64`.
const CHUNK_ENTRY_BYTES: u64 = 32 + 8 + 8;

/// A single chunk entry in the chunk table.
///
/// On disk each entry is the 32-byte hash followed by `offset` and `length`,
/// both as little-endian `u64`. The decoder returns the values exactly as read;
/// it does not validate them against the chunk data that follows the table.
#[derive(Debug, Clone, PartialEq)]
pub struct ChunkEntry {
    /// BLAKE3 hash of the chunk content.
    ///
    /// The encoder writes the hash supplied by its caller verbatim; nothing in
    /// this module recomputes or verifies it.
    pub hash: [u8; 32],
    /// Byte offset within the archive data section.
    ///
    /// As written by the encoder, this is the running sum of the compressed
    /// lengths of all preceding chunks, starting at 0 for the first chunk.
    pub offset: u64,
    /// Compressed length in bytes.
    pub length: u64,
}

/// Manifest embedded in a FAR archive.
///
/// Stored in the archive as YAML (via serde) compressed with zstd. The field
/// names and their order therefore define the serialized form.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarManifest {
    /// Package or artifact name.
    pub name: String,
    /// Version string.
    pub version: String,
    /// Target architecture (e.g., "x86_64").
    pub arch: String,
    /// Content-addressed store hash.
    // NOTE(review): string encoding (hex? prefixed?) is not visible here — confirm with the producer.
    pub store_hash: String,
    /// Merkle tree hash for streaming verification.
    pub tree_hash: String,
    /// Number of files in the archive.
    // NOTE(review): presumably equals `files.len()`; this module does not enforce that.
    pub file_count: u64,
    /// Total uncompressed size in bytes.
    pub total_size: u64,
    /// Per-file entries with path, size, and hash.
    pub files: Vec<FarFileEntry>,
    /// Build provenance metadata.
    pub provenance: FarProvenance,
    /// Optional kernel contract metadata (for ML models).
    ///
    /// Omitted from the serialized manifest when `None`, and read back as
    /// `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kernel_contracts: Option<KernelContractInfo>,
}

/// Kernel contract metadata embedded in a FAR manifest.
///
/// Carried in [`FarManifest::kernel_contracts`]; serialized with the manifest.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct KernelContractInfo {
    /// Model type identifier (e.g., "llama", "qwen2").
    pub model_type: String,
    /// Required kernel operations.
    pub required_ops: Vec<String>,
    /// Contract coverage percentage.
    // NOTE(review): scale (0–100 vs 0.0–1.0) is not established in this file — confirm.
    pub coverage_pct: f64,
}

/// A file entry within the FAR manifest.
///
/// One element of [`FarManifest::files`]; serialized with the manifest.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarFileEntry {
    /// Relative file path within the archive.
    // NOTE(review): nothing in this module validates the path (e.g. against `..`) — callers
    // extracting files should confirm where that check lives.
    pub path: String,
    /// File size in bytes.
    pub size: u64,
    /// BLAKE3 hash of the file content.
    // NOTE(review): presumably hex-encoded; the encoding is not visible here — confirm.
    pub blake3: String,
}

/// Provenance metadata for the FAR archive.
///
/// Carried in [`FarManifest::provenance`]; serialized with the manifest.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FarProvenance {
    /// Provider that produced this archive (e.g., "apt", "conda").
    pub origin_provider: String,
    /// Upstream reference for traceability.
    ///
    /// Omitted from the serialized manifest when `None`, and read back as
    /// `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub origin_ref: Option<String>,
    /// Upstream content hash at build time.
    ///
    /// Omitted from the serialized manifest when `None`, and read back as
    /// `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub origin_hash: Option<String>,
    /// ISO 8601 creation timestamp.
    ///
    /// Stored as an opaque string; this module does not parse or validate it.
    pub created_at: String,
    /// Generator string (e.g., "forjar 1.0.0").
    pub generator: String,
}

/// Serialize a FAR archive into `writer`.
///
/// Sections are emitted in this order:
/// 1. [`FAR_MAGIC`]
/// 2. compressed manifest length (little-endian `u64`), then the manifest as
///    zstd-compressed (level 3) YAML
/// 3. chunk count (little-endian `u64`)
/// 4. chunk table — per chunk: 32-byte hash, then offset and compressed length
///    (both little-endian `u64`); offsets start at 0 and accumulate the
///    compressed lengths
/// 5. the zstd-compressed (level 3) chunk payloads, back to back
/// 6. signature length, always `0u64` (unsigned archive)
///
/// Each element of `chunks` is `(hash, raw_bytes)`. The hash is written as
/// given; it is not recomputed here.
///
/// # Errors
/// Returns a message naming the step that failed: manifest serialization,
/// zstd compression, any write to `writer`, or the final flush. Bytes already
/// written before the failure are not rolled back.
pub fn encode_far<W: Write>(
    manifest: &FarManifest,
    chunks: &[([u8; 32], Vec<u8>)],
    mut writer: W,
) -> Result<(), String> {
    // Every write failure is reported uniformly as "write <section>: <io error>".
    fn put<Out: Write>(out: &mut Out, section: &str, bytes: &[u8]) -> Result<(), String> {
        out.write_all(bytes)
            .map_err(|e| format!("write {section}: {e}"))
    }

    put(&mut writer, "magic", FAR_MAGIC)?;

    // Manifest: YAML text, zstd-compressed, prefixed with its compressed length.
    let manifest_yaml =
        serde_yaml_ng::to_string(manifest).map_err(|e| format!("serialize manifest: {e}"))?;
    let packed_manifest = zstd::encode_all(manifest_yaml.as_bytes(), 3)
        .map_err(|e| format!("zstd manifest: {e}"))?;
    put(
        &mut writer,
        "manifest_len",
        &(packed_manifest.len() as u64).to_le_bytes(),
    )?;
    put(&mut writer, "manifest", &packed_manifest)?;

    put(
        &mut writer,
        "chunk_count",
        &(chunks.len() as u64).to_le_bytes(),
    )?;

    // The table precedes the payloads and records compressed sizes, so every
    // chunk has to be compressed before the first table entry can be written.
    let packed_chunks = chunks
        .iter()
        .map(|(_, raw)| {
            zstd::encode_all(raw.as_slice(), 3).map_err(|e| format!("zstd chunk: {e}"))
        })
        .collect::<Result<Vec<Vec<u8>>, String>>()?;

    // Chunk table: one (hash, offset, length) record per chunk, in input order.
    let mut next_offset: u64 = 0;
    for ((hash, _), packed) in chunks.iter().zip(&packed_chunks) {
        let packed_len = packed.len() as u64;
        put(&mut writer, "chunk hash", hash)?;
        put(&mut writer, "chunk offset", &next_offset.to_le_bytes())?;
        put(&mut writer, "chunk length", &packed_len.to_le_bytes())?;
        next_offset += packed_len;
    }

    // Chunk payloads, concatenated in the same order as the table.
    for packed in &packed_chunks {
        put(&mut writer, "chunk data", packed)?;
    }

    // A zero signature length marks the archive as unsigned.
    put(&mut writer, "sig_len", &0u64.to_le_bytes())?;

    writer.flush().map_err(|e| format!("flush: {e}"))
}

/// Decode the manifest and chunk table from a FAR archive (streaming — no full load).
///
/// Reads, in order: the 12-byte magic, the compressed manifest length, the
/// zstd-compressed YAML manifest, the chunk count, and the chunk table. It
/// stops there: chunk payloads and the signature section are left unread in
/// `reader`.
///
/// The returned [`ChunkEntry`] values are exactly what the table contains;
/// offsets and lengths are not checked against the data that follows.
///
/// # Errors
/// Returns a descriptive message when:
/// - the magic does not match [`FAR_MAGIC`];
/// - the declared manifest length exceeds `MAX_MANIFEST_LEN`;
/// - the input ends before a declared section is complete;
/// - the manifest fails to decompress or to parse as YAML;
/// - `chunk_count` is so large that the table size would overflow a `u64`.
pub fn decode_far_manifest<R: Read>(
    mut reader: R,
) -> Result<(FarManifest, Vec<ChunkEntry>), String> {
    // Read one little-endian u64 header field; `what` names it in the error message.
    fn read_u64_le<Src: Read>(src: &mut Src, what: &str) -> Result<u64, String> {
        let mut buf = [0u8; 8];
        src.read_exact(&mut buf)
            .map_err(|e| format!("read {what}: {e}"))?;
        Ok(u64::from_le_bytes(buf))
    }

    // Magic
    let mut magic = [0u8; 12];
    reader
        .read_exact(&mut magic)
        .map_err(|e| format!("read magic: {e}"))?;
    if magic != *FAR_MAGIC {
        return Err("invalid FAR magic".to_string());
    }

    // Manifest length.
    // #17/#24: the declared length is attacker-controlled. Reject absurd values
    // BEFORE allocating. The comparison is done on the full 64-bit value: casting
    // to usize first would truncate on 32-bit targets and let e.g. 0x1_0000_0010
    // slip past the cap as 0x10.
    let declared_len = read_u64_le(&mut reader, "manifest_len")?;
    if declared_len > MAX_MANIFEST_LEN as u64 {
        return Err(format!(
            "manifest too large: {declared_len} bytes exceeds {MAX_MANIFEST_LEN} limit"
        ));
    }
    // Lossless: declared_len <= MAX_MANIFEST_LEN, which is itself a usize.
    let manifest_len = declared_len as usize;

    // Compressed manifest. Read via a length-limited reader so the buffer grows
    // only as bytes actually arrive (a short file fails gracefully, never OOMs).
    let mut compressed = Vec::new();
    let read = (&mut reader)
        .take(declared_len)
        .read_to_end(&mut compressed)
        .map_err(|e| format!("read manifest: {e}"))?;
    if read != manifest_len {
        return Err(format!(
            "read manifest: expected {manifest_len} bytes, got {read}"
        ));
    }
    // NOTE(review): MAX_MANIFEST_LEN bounds only the compressed bytes; the
    // decompressed size is not limited here, so a crafted manifest could still
    // expand to a very large buffer. Consider a capped streaming decoder.
    let yaml_bytes =
        zstd::decode_all(compressed.as_slice()).map_err(|e| format!("zstd decompress: {e}"))?;
    let manifest: FarManifest =
        serde_yaml_ng::from_slice(&yaml_bytes).map_err(|e| format!("parse manifest: {e}"))?;

    // Chunk count
    let chunk_count = read_u64_le(&mut reader, "chunk_count")?;

    // Sanity guard: reject counts whose table size (count * CHUNK_ENTRY_BYTES)
    // would not even fit in a u64. This is an arithmetic bound only — the real
    // protection against oversized counts is the incremental read below.
    let max_chunks = u64::MAX / CHUNK_ENTRY_BYTES;
    if chunk_count > max_chunks {
        return Err(format!("chunk_count too large: {chunk_count}"));
    }

    // Chunk table. #18/#25: chunk_count is attacker-controlled, so we do NOT
    // pre-size the Vec from it (Vec::with_capacity(chunk_count * 48) overflows
    // isize / OOMs for huge counts). Instead grow on each successful read — a
    // truncated/corrupt archive then fails via read_exact's Err.
    let mut entries: Vec<ChunkEntry> = Vec::new();
    for _ in 0..chunk_count {
        let mut hash = [0u8; 32];
        reader
            .read_exact(&mut hash)
            .map_err(|e| format!("read chunk hash: {e}"))?;
        let offset = read_u64_le(&mut reader, "chunk offset")?;
        let length = read_u64_le(&mut reader, "chunk length")?;
        entries.push(ChunkEntry {
            hash,
            offset,
            length,
        });
    }

    Ok((manifest, entries))
}