pdfluent 1.0.0-beta.5

Pure-Rust PDF SDK with XFA, PDF/A, digital signatures, and WASM support.
Documentation
//! Diagnostic bundle exporter (M6b.1).
//!
//! Produces a self-contained directory that an enterprise customer can
//! send to support without running anything else. The exporter is
//! deterministic: rerunning with the same inputs yields byte-identical
//! output (modulo the CLI's optional `--include-input` mode, which copies
//! a binary; the bytes there are the customer's own and unchanged).
//!
//! # Bundle layout
//!
//! ```text
//! <out-dir>/
//!   manifest.json       — schema and sdk version, mode, input hash and size, file list, caveats, target
//!   classification.json — output of `classify_input` (M6b.3)
//!   env.json            — non-sensitive environment metadata (target OS, target arch, target_pointer_width)
//!   errors.log          — append-only error log captured during the run, or empty (the exporter currently always writes it empty)
//!   input.pdf           — present in `Full` mode only
//!   input.sha256        — hex digest, present in all modes
//! ```
//!
//! # Privacy modes
//!
//! - [`BundleMode::Full`]: includes `input.pdf` and the input hash. Use only
//!   when the customer accepts that the PDF will travel with the bundle.
//! - [`BundleMode::HashOnly`]: omits `input.pdf`, keeps `input.sha256` and
//!   the classification metadata. Recommended default for enterprise.
//! - [`BundleMode::IrOnly`]: future variant intended to add the IR JSON
//!   (M1.3) once the public IR API stabilises (M6b.2). At present it is
//!   equivalent to `HashOnly` and emits a `caveats` field flagging the
//!   missing IR. Documented as a forward stub rather than a silent no-op.
//!
//! # What this module does NOT do
//!
//! - It never phones home. There is no network code in this file.
//! - It never writes absolute filesystem paths from the host (only the
//!   target output directory the caller chose).
//! - It never logs the input PDF's content to stdout / stderr / tracing.

use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};

use super::classify::{classify_input, ClassificationReport};
use super::SCHEMA_VERSION;

/// Privacy mode for the bundle.
///
/// Decides whether the exporter copies the input PDF into the bundle
/// directory. Serialised through serde in `snake_case`, i.e. `full`,
/// `hash_only` and `ir_only` — the same tags [`BundleMode::as_str`] returns.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum BundleMode {
    /// Include the input PDF.
    ///
    /// The exporter writes the raw input bytes to `input.pdf` in addition to
    /// the files every mode produces.
    Full,
    /// Omit the input PDF; include only the SHA-256 hash and classification.
    ///
    /// The manifest carries a caveat stating that `input.pdf` is
    /// intentionally absent.
    HashOnly,
    /// Forward stub — currently equivalent to `HashOnly` plus a caveat noting
    /// that the IR export depends on M6b.2 stabilising the public API.
    IrOnly,
}

impl BundleMode {
    /// Stable string tag for serialisation.
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Full => "full",
            Self::HashOnly => "hash_only",
            Self::IrOnly => "ir_only",
        }
    }
}

/// Errors from bundle export.
///
/// Every variant pairs the filesystem path that was being operated on with
/// the underlying [`std::io::Error`]. The I/O error is also what
/// [`std::error::Error::source`] returns for this type.
#[derive(Debug)]
pub enum DiagnosticError {
    /// The output directory could not be created.
    ///
    /// Produced when `std::fs::create_dir_all` fails on the bundle directory.
    CreateDir {
        /// Directory the exporter tried to create.
        path: PathBuf,
        /// Underlying I/O error.
        source: std::io::Error,
    },
    /// A file in the bundle could not be written.
    ///
    /// Produced when `std::fs::write` fails for one of the bundle files.
    Write {
        /// File path the exporter tried to write.
        path: PathBuf,
        /// Underlying I/O error.
        source: std::io::Error,
    },
    /// Reading the input PDF failed.
    ///
    /// Produced by [`export_bundle`] when `std::fs::read` fails on the input.
    ReadInput {
        /// Input PDF path the exporter tried to read.
        path: PathBuf,
        /// Underlying I/O error.
        source: std::io::Error,
    },
}

impl std::fmt::Display for DiagnosticError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::CreateDir { path, source } => {
                write!(f, "could not create '{}': {source}", path.display())
            }
            Self::Write { path, source } => {
                write!(f, "could not write '{}': {source}", path.display())
            }
            Self::ReadInput { path, source } => {
                write!(f, "could not read input '{}': {source}", path.display())
            }
        }
    }
}

impl std::error::Error for DiagnosticError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::CreateDir { source, .. }
            | Self::Write { source, .. }
            | Self::ReadInput { source, .. } => Some(source),
        }
    }
}

/// Top-level manifest describing the bundle contents.
///
/// Serialised as pretty-printed JSON into `manifest.json` by
/// [`export_bundle_from_bytes`] and also returned to the caller inside
/// [`DiagnosticBundle`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BundleManifest {
    /// Always equal to [`SCHEMA_VERSION`] at write time.
    pub schema_version: u32,
    /// SDK version that produced the bundle.
    pub sdk_version: String,
    /// Privacy mode.
    pub mode: BundleMode,
    /// SHA-256 hex digest of the input PDF (lowercase, 64 characters).
    pub input_sha256: String,
    /// Size of the input in bytes.
    pub input_size: u64,
    /// Files written into the bundle directory, relative paths only.
    ///
    /// Sorted by the exporter; the list includes `manifest.json` itself.
    pub files: Vec<String>,
    /// Free-form caveats (mode-specific notes a consumer should read).
    pub caveats: Vec<String>,
    /// Build target triple (compile-time).
    ///
    /// NOTE: the exporter fills this with `<arch>-<os>` built from
    /// `std::env::consts::ARCH` and `std::env::consts::OS` (for example
    /// `x86_64-linux`), which is shorter than a full rustc target triple.
    pub target_triple: String,
}

/// Result of a bundle export.
///
/// Returned by [`export_bundle`] and [`export_bundle_from_bytes`] once every
/// bundle file has been written, so callers can inspect the manifest and
/// classification without re-reading the JSON from disk.
#[derive(Debug, Clone)]
pub struct DiagnosticBundle {
    /// Root directory of the written bundle.
    ///
    /// This is the `out_dir` the caller passed in, stored as given.
    pub root: PathBuf,
    /// The manifest written into `manifest.json`.
    pub manifest: BundleManifest,
    /// The classification report written into `classification.json`.
    pub classification: ClassificationReport,
}

/// Read the PDF at `input_path` and write a diagnostic bundle for it into
/// `out_dir`, using the given privacy `mode`.
///
/// This is a thin file-based front end: the whole input is loaded into
/// memory and handed to [`export_bundle_from_bytes`], which creates
/// `out_dir` if needed and overwrites any bundle files already there.
///
/// # Errors
///
/// Returns [`DiagnosticError::ReadInput`] when the input file cannot be
/// read, and otherwise whatever [`export_bundle_from_bytes`] reports.
pub fn export_bundle(
    input_path: &Path,
    out_dir: &Path,
    mode: BundleMode,
) -> Result<DiagnosticBundle, DiagnosticError> {
    match std::fs::read(input_path) {
        Ok(pdf_bytes) => export_bundle_from_bytes(&pdf_bytes, out_dir, mode),
        Err(io_err) => Err(DiagnosticError::ReadInput {
            path: input_path.to_path_buf(),
            source: io_err,
        }),
    }
}

/// Produce a bundle for raw `bytes` under `out_dir`.
///
/// Creates `out_dir` (and any missing parents), classifies and hashes the
/// input, then writes `manifest.json`, `classification.json`, `env.json`,
/// an empty `errors.log` and `input.sha256`. In [`BundleMode::Full`] the
/// input itself is additionally written to `input.pdf`.
///
/// In every other mode a pre-existing `input.pdf` in `out_dir` is removed.
/// Without that, re-exporting into a directory that previously held a `Full`
/// bundle would leave the customer's PDF next to a manifest claiming it is
/// absent.
///
/// # Errors
///
/// - [`DiagnosticError::CreateDir`] if `out_dir` cannot be created.
/// - [`DiagnosticError::Write`] if any bundle file cannot be written, or if
///   a stale `input.pdf` cannot be removed in a mode that must not ship it.
pub fn export_bundle_from_bytes(
    bytes: &[u8],
    out_dir: &Path,
    mode: BundleMode,
) -> Result<DiagnosticBundle, DiagnosticError> {
    std::fs::create_dir_all(out_dir).map_err(|source| DiagnosticError::CreateDir {
        path: out_dir.to_owned(),
        source,
    })?;

    let include_input = matches!(mode, BundleMode::Full);
    let input_copy = out_dir.join("input.pdf");

    // Privacy guard: clear a leftover copy of the input before anything else
    // is written, so a failed cleanup never yields a bundle that looks
    // complete. A missing file is the normal case and not an error.
    if !include_input {
        match std::fs::remove_file(&input_copy) {
            Ok(()) => {}
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
            Err(source) => {
                return Err(DiagnosticError::Write {
                    path: input_copy,
                    source,
                })
            }
        }
    }

    let classification = classify_input(bytes);
    let input_sha256 = sha256_hex(bytes);
    let input_size = bytes.len() as u64;

    let mut files: Vec<String> = vec![
        "manifest.json".into(),
        "classification.json".into(),
        "env.json".into(),
        "errors.log".into(),
        "input.sha256".into(),
    ];
    if include_input {
        files.push("input.pdf".into());
    }
    // Sorted so the manifest does not depend on the order files are listed above.
    files.sort();

    let mut caveats = Vec::new();
    if matches!(mode, BundleMode::IrOnly) {
        caveats.push(
            "ir_only mode is a forward stub; the IR export becomes available once M6b.2 stabilises the public IR API. The bundle is currently equivalent to hash_only.".into(),
        );
    }
    if matches!(mode, BundleMode::HashOnly | BundleMode::IrOnly) {
        caveats.push(
            "input.pdf is intentionally absent in this mode; only input.sha256 ships.".into(),
        );
    }

    let manifest = BundleManifest {
        schema_version: SCHEMA_VERSION,
        sdk_version: crate::api_version().to_string(),
        mode,
        input_sha256,
        input_size,
        files,
        caveats,
        // `<arch>-<os>`, e.g. `x86_64-linux`; not a full rustc target triple.
        target_triple: format!("{}-{}", std::env::consts::ARCH, std::env::consts::OS),
    };

    let env = EnvSnapshot {
        schema_version: SCHEMA_VERSION,
        target_pointer_width: usize::BITS,
        cfg_target_os: std::env::consts::OS.to_string(),
        cfg_target_arch: std::env::consts::ARCH.to_string(),
    };

    write_json(&out_dir.join("manifest.json"), &manifest)?;
    write_json(&out_dir.join("classification.json"), &classification)?;
    write_json(&out_dir.join("env.json"), &env)?;
    // No errors are collected by this exporter, so the log is written empty.
    write_bytes(&out_dir.join("errors.log"), b"")?;
    write_bytes(
        &out_dir.join("input.sha256"),
        manifest.input_sha256.as_bytes(),
    )?;

    if include_input {
        write_bytes(&input_copy, bytes)?;
    }

    Ok(DiagnosticBundle {
        root: out_dir.to_owned(),
        manifest,
        classification,
    })
}

/// Environment metadata serialised into `env.json`.
///
/// Holds only compile-target facts taken from `usize::BITS` and
/// `std::env::consts`; the exporter fills it without reading environment
/// variables, host names, or paths.
#[derive(Debug, Serialize, Deserialize)]
struct EnvSnapshot {
    /// Bundle schema version; set from `SCHEMA_VERSION` by the exporter.
    schema_version: u32,
    /// Pointer width of the build target in bits (`usize::BITS`).
    target_pointer_width: u32,
    /// Target operating system (`std::env::consts::OS`).
    cfg_target_os: String,
    /// Target CPU architecture (`std::env::consts::ARCH`).
    cfg_target_arch: String,
}

fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<(), DiagnosticError> {
    let bytes = serde_json::to_vec_pretty(value).expect("serde_json on internal type cannot fail");
    write_bytes(path, &bytes)
}

/// Write `bytes` to `path`, creating the file or replacing its contents.
///
/// # Errors
///
/// Any I/O failure is returned as [`DiagnosticError::Write`] carrying the
/// offending path.
fn write_bytes(path: &Path, bytes: &[u8]) -> Result<(), DiagnosticError> {
    match std::fs::write(path, bytes) {
        Ok(()) => Ok(()),
        Err(io_err) => Err(DiagnosticError::Write {
            path: path.to_path_buf(),
            source: io_err,
        }),
    }
}

/// Hash `bytes` with SHA-256 and return the digest as lowercase hex.
///
/// Each digest byte is rendered as exactly two hex characters.
fn sha256_hex(bytes: &[u8]) -> String {
    use sha2::{Digest, Sha256};
    use std::fmt::Write as _;

    let hash = Sha256::digest(bytes);
    // Two hex characters per digest byte.
    let mut hex = String::with_capacity(hash.len() * 2);
    for b in hash.iter() {
        write!(hex, "{b:02x}").expect("formatting into a String cannot fail");
    }
    hex
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal PDF-looking byte string: a `%PDF-1.7` header line,
    /// the caller's `extra` payload, and an `%%EOF` trailer.
    fn fake_pdf(extra: &[u8]) -> Vec<u8> {
        [&b"%PDF-1.7\n"[..], extra, &b"\n%%EOF\n"[..]].concat()
    }

    /// Returns a per-test, per-process scratch directory path under the
    /// system temp dir. Anything left there by an earlier run is removed
    /// first; the directory itself is not created.
    fn temp_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(format!(
            "pdfluent-bundle-test-{name}-{}",
            std::process::id()
        ));
        // Best effort: the directory usually does not exist yet.
        let _ = std::fs::remove_dir_all(&dir);
        dir
    }

    #[test]
    fn bundle_full_mode_writes_all_files() {
        let out = temp_dir("full");
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();

        // Everything the manifest lists must be on disk.
        for f in bundle.manifest.files.iter() {
            let on_disk = out.join(f);
            assert!(on_disk.exists(), "missing file {f}");
        }
        for required in ["input.pdf", "manifest.json"] {
            assert!(out.join(required).exists());
        }
        assert_eq!(bundle.manifest.mode, BundleMode::Full);
    }

    #[test]
    fn bundle_hash_only_omits_input_pdf() {
        let out = temp_dir("hash");
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::HashOnly).unwrap();

        let pdf_on_disk = out.join("input.pdf").exists();
        let hash_on_disk = out.join("input.sha256").exists();
        let pdf_in_manifest = bundle.manifest.files.iter().any(|f| f == "input.pdf");

        assert!(!pdf_on_disk);
        assert!(hash_on_disk);
        assert!(!pdf_in_manifest);
    }

    #[test]
    fn bundle_ir_only_marks_caveat() {
        let out = temp_dir("ir-only");
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::IrOnly).unwrap();

        let mentions_m6b2 = bundle
            .manifest
            .caveats
            .iter()
            .any(|c| c.contains("M6b.2"));
        assert!(mentions_m6b2);
        assert!(!out.join("input.pdf").exists());
    }

    #[test]
    fn rerun_is_idempotent() {
        let out = temp_dir("idempotent");
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");

        // Export once and hand back the manifest bytes that landed on disk.
        let export_and_read_manifest = || {
            export_bundle_from_bytes(&pdf, &out, BundleMode::HashOnly).unwrap();
            std::fs::read(out.join("manifest.json")).unwrap()
        };

        let m1 = export_and_read_manifest();
        let m2 = export_and_read_manifest();
        assert_eq!(m1, m2, "manifest.json must be byte-identical on rerun");
    }

    #[test]
    fn input_sha256_matches_known_value() {
        // Zero-length input: the digest must equal the well-known SHA-256
        // of the empty string, which checks the hashing path end to end.
        let out = temp_dir("hash-empty");
        let bundle = export_bundle_from_bytes(b"", &out, BundleMode::HashOnly).unwrap();
        let expected = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
        assert_eq!(bundle.manifest.input_sha256, expected);
    }

    #[test]
    fn bundle_classification_round_trip() {
        let out = temp_dir("classify");
        let pdf = fake_pdf(b"/XFA layout=\"tb\" x-formcalc");
        export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();

        // Read the report back from disk and make sure it still parses.
        let raw = std::fs::read(out.join("classification.json")).unwrap();
        let parsed: ClassificationReport = serde_json::from_slice(&raw).unwrap();
        assert_eq!(parsed.tier.as_str(), "C");
    }

    #[test]
    fn manifest_lists_files_sorted() {
        let out = temp_dir("sorted");
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();

        let listed = &bundle.manifest.files;
        let mut expected = listed.to_vec();
        expected.sort();
        assert_eq!(*listed, expected);
    }

    #[test]
    fn schema_version_is_one() {
        let out = temp_dir("schema");
        let minimal: &[u8] = b"%PDF-1.7\n%%EOF";
        let bundle = export_bundle_from_bytes(minimal, &out, BundleMode::HashOnly).unwrap();
        assert_eq!(bundle.manifest.schema_version, 1);
    }
}