use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use super::classify::{classify_input, ClassificationReport};
use super::SCHEMA_VERSION;
/// Selects how much of the input a diagnostic bundle carries.
///
/// Serialized with snake_case variant names (`full`, `hash_only`, `ir_only`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum BundleMode {
/// Ships the raw input bytes as `input.pdf` alongside the metadata files.
Full,
/// Omits `input.pdf`; only the SHA-256 of the input ships (`input.sha256`).
HashOnly,
/// Forward stub for a future IR export. Currently writes the same files as
/// [`BundleMode::HashOnly`] and adds a manifest caveat saying so.
IrOnly,
}
impl BundleMode {
pub const fn as_str(self) -> &'static str {
match self {
Self::Full => "full",
Self::HashOnly => "hash_only",
Self::IrOnly => "ir_only",
}
}
}
/// Errors returned while exporting a diagnostic bundle.
///
/// Every variant records the path involved and the underlying I/O error.
#[derive(Debug)]
pub enum DiagnosticError {
/// The bundle output directory could not be created.
CreateDir {
/// Directory that was being created.
path: PathBuf,
/// Underlying I/O failure.
source: std::io::Error,
},
/// A file inside the bundle could not be written.
Write {
/// File that was being written.
path: PathBuf,
/// Underlying I/O failure.
source: std::io::Error,
},
/// The input file could not be read.
ReadInput {
/// Input file that was being read.
path: PathBuf,
/// Underlying I/O failure.
source: std::io::Error,
},
}
impl std::fmt::Display for DiagnosticError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::CreateDir { path, source } => {
write!(f, "could not create '{}': {source}", path.display())
}
Self::Write { path, source } => {
write!(f, "could not write '{}': {source}", path.display())
}
Self::ReadInput { path, source } => {
write!(f, "could not read input '{}': {source}", path.display())
}
}
}
}
impl std::error::Error for DiagnosticError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::CreateDir { source, .. }
| Self::Write { source, .. }
| Self::ReadInput { source, .. } => Some(source),
}
}
}
/// Contents of `manifest.json`: a summary of what a diagnostic bundle holds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BundleManifest {
/// Value of `SCHEMA_VERSION` at export time.
pub schema_version: u32,
/// String form of `crate::api_version()`.
pub sdk_version: String,
/// Mode the bundle was exported with.
pub mode: BundleMode,
/// Lowercase hex SHA-256 digest of the input bytes.
pub input_sha256: String,
/// Length of the input in bytes.
pub input_size: u64,
/// Names of the files written into the bundle directory, sorted.
pub files: Vec<String>,
/// Human-readable notes about limitations of the chosen mode.
pub caveats: Vec<String>,
/// `{arch}-{os}` built from `std::env::consts::ARCH` and
/// `std::env::consts::OS`; despite the field name this is not a full
/// target triple.
pub target_triple: String,
}
/// In-memory description of a bundle that has just been written to disk.
#[derive(Debug, Clone)]
pub struct DiagnosticBundle {
/// Directory the bundle files were written into.
pub root: PathBuf,
/// The manifest that was serialized to `manifest.json`.
pub manifest: BundleManifest,
/// Result of `classify_input` on the input bytes, as serialized to
/// `classification.json`.
pub classification: ClassificationReport,
}
/// Reads the file at `input_path` and exports a diagnostic bundle for it.
///
/// This is a thin wrapper: the whole file is loaded into memory and handed to
/// [`export_bundle_from_bytes`] together with `out_dir` and `mode`.
///
/// # Errors
///
/// Returns [`DiagnosticError::ReadInput`] if the input file cannot be read,
/// or whatever error [`export_bundle_from_bytes`] reports.
pub fn export_bundle(
    input_path: &Path,
    out_dir: &Path,
    mode: BundleMode,
) -> Result<DiagnosticBundle, DiagnosticError> {
    match std::fs::read(input_path) {
        Ok(contents) => export_bundle_from_bytes(&contents, out_dir, mode),
        Err(source) => Err(DiagnosticError::ReadInput {
            path: input_path.to_path_buf(),
            source,
        }),
    }
}
/// Writes a diagnostic bundle for `bytes` into `out_dir`.
///
/// `out_dir` is created if necessary. The following files are always written:
/// `manifest.json`, `classification.json`, `env.json`, an empty `errors.log`,
/// and `input.sha256` (the hex digest, without a trailing newline). In
/// [`BundleMode::Full`] the raw input is additionally written as `input.pdf`.
/// Existing files with those names are overwritten.
///
/// NOTE(review): an `input.pdf` already present in `out_dir` (for example
/// from an earlier `Full` export) is not removed in the other modes.
///
/// # Errors
///
/// Returns [`DiagnosticError::CreateDir`] if `out_dir` cannot be created and
/// [`DiagnosticError::Write`] if any bundle file cannot be written. Files
/// written before the failing one are left in place.
pub fn export_bundle_from_bytes(
    bytes: &[u8],
    out_dir: &Path,
    mode: BundleMode,
) -> Result<DiagnosticBundle, DiagnosticError> {
    const IR_STUB_CAVEAT: &str = "ir_only mode is a forward stub; the IR export becomes available once M6b.2 stabilises the public IR API. The bundle is currently equivalent to hash_only.";
    const INPUT_ABSENT_CAVEAT: &str =
        "input.pdf is intentionally absent in this mode; only input.sha256 ships.";

    if let Err(source) = std::fs::create_dir_all(out_dir) {
        return Err(DiagnosticError::CreateDir {
            path: out_dir.to_path_buf(),
            source,
        });
    }

    let classification = classify_input(bytes);
    let input_sha256 = sha256_hex(bytes);

    // Only the full mode ships the raw input next to its hash.
    let ships_input = matches!(mode, BundleMode::Full);

    let mut files: Vec<String> = [
        "manifest.json",
        "classification.json",
        "env.json",
        "errors.log",
        "input.sha256",
    ]
    .iter()
    .map(|name| (*name).to_owned())
    .chain(ships_input.then(|| "input.pdf".to_owned()))
    .collect();
    // Sorted so the manifest is stable from run to run.
    files.sort();

    let caveats: Vec<String> = match mode {
        BundleMode::Full => Vec::new(),
        BundleMode::HashOnly => vec![INPUT_ABSENT_CAVEAT.to_owned()],
        BundleMode::IrOnly => vec![IR_STUB_CAVEAT.to_owned(), INPUT_ABSENT_CAVEAT.to_owned()],
    };

    let manifest = BundleManifest {
        schema_version: SCHEMA_VERSION,
        sdk_version: crate::api_version().to_string(),
        mode,
        input_sha256,
        input_size: bytes.len() as u64,
        files,
        caveats,
        target_triple: format!("{}-{}", std::env::consts::ARCH, std::env::consts::OS),
    };
    let env = EnvSnapshot {
        schema_version: SCHEMA_VERSION,
        target_pointer_width: usize::BITS,
        cfg_target_os: std::env::consts::OS.to_string(),
        cfg_target_arch: std::env::consts::ARCH.to_string(),
    };

    let bundle_path = |name: &str| out_dir.join(name);
    write_json(&bundle_path("manifest.json"), &manifest)?;
    write_json(&bundle_path("classification.json"), &classification)?;
    write_json(&bundle_path("env.json"), &env)?;
    write_bytes(&bundle_path("errors.log"), b"")?;
    write_bytes(&bundle_path("input.sha256"), manifest.input_sha256.as_bytes())?;
    if ships_input {
        write_bytes(&bundle_path("input.pdf"), bytes)?;
    }

    Ok(DiagnosticBundle {
        root: out_dir.to_path_buf(),
        manifest,
        classification,
    })
}
/// Contents of `env.json`: build-target facts taken from `std` constants.
#[derive(Debug, Serialize, Deserialize)]
struct EnvSnapshot {
/// Value of `SCHEMA_VERSION` at export time.
schema_version: u32,
/// `usize::BITS` of the build target.
target_pointer_width: u32,
/// `std::env::consts::OS`.
cfg_target_os: String,
/// `std::env::consts::ARCH`.
cfg_target_arch: String,
}
fn write_json<T: Serialize>(path: &Path, value: &T) -> Result<(), DiagnosticError> {
let bytes = serde_json::to_vec_pretty(value).expect("serde_json on internal type cannot fail");
write_bytes(path, &bytes)
}
/// Writes `bytes` to `path`, replacing any existing file.
///
/// # Errors
///
/// Returns [`DiagnosticError::Write`] carrying `path` and the I/O error when
/// the write fails.
fn write_bytes(path: &Path, bytes: &[u8]) -> Result<(), DiagnosticError> {
    match std::fs::write(path, bytes) {
        Ok(()) => Ok(()),
        Err(source) => Err(DiagnosticError::Write {
            path: path.to_path_buf(),
            source,
        }),
    }
}
/// Returns the SHA-256 digest of `bytes` as a lowercase hexadecimal string.
///
/// The output buffer is sized up front and filled from a digit table, which
/// avoids allocating a temporary `String` for every digest byte.
fn sha256_hex(bytes: &[u8]) -> String {
    use sha2::{Digest, Sha256};

    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let digest = Sha256::digest(bytes);
    // Two hex characters per digest byte.
    let mut hex = String::with_capacity(digest.len() * 2);
    for &byte in digest.iter() {
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Per-test scratch directory that is removed again once the test passes.
    ///
    /// Dereferences to [`Path`], so `&dir` coerces to `&Path` and
    /// `dir.join(..)` works directly.
    struct ScratchDir(PathBuf);

    impl std::ops::Deref for ScratchDir {
        type Target = Path;

        fn deref(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Keep the directory when the test is failing so the generated
            // bundle can still be inspected; otherwise do not litter the
            // system temp dir.
            if !std::thread::panicking() {
                let _ = std::fs::remove_dir_all(&self.0);
            }
        }
    }

    /// Builds a minimal PDF-looking byte buffer with `extra` between the
    /// header and the `%%EOF` marker.
    fn fake_pdf(extra: &[u8]) -> Vec<u8> {
        let mut v = b"%PDF-1.7\n".to_vec();
        v.extend_from_slice(extra);
        v.extend_from_slice(b"\n%%EOF\n");
        v
    }

    /// Returns a scratch directory path unique to `name` and this process.
    ///
    /// Anything left over from an earlier run is deleted first; the directory
    /// itself is created by the code under test.
    fn temp_dir(name: &str) -> ScratchDir {
        let mut d = std::env::temp_dir();
        d.push(format!(
            "pdfluent-bundle-test-{name}-{}",
            std::process::id()
        ));
        let _ = std::fs::remove_dir_all(&d);
        ScratchDir(d)
    }

    /// Full mode writes every file the manifest lists, including the input.
    #[test]
    fn bundle_full_mode_writes_all_files() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let out = temp_dir("full");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();
        for f in &bundle.manifest.files {
            assert!(out.join(f).exists(), "missing file {f}");
        }
        assert!(out.join("input.pdf").exists());
        assert!(out.join("manifest.json").exists());
        assert_eq!(bundle.manifest.mode, BundleMode::Full);
    }

    /// Hash-only mode ships the digest but neither writes nor lists the input.
    #[test]
    fn bundle_hash_only_omits_input_pdf() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let out = temp_dir("hash");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::HashOnly).unwrap();
        assert!(!out.join("input.pdf").exists());
        assert!(out.join("input.sha256").exists());
        assert!(!bundle.manifest.files.iter().any(|f| f == "input.pdf"));
    }

    /// IR-only mode records the forward-stub caveat and omits the input.
    #[test]
    fn bundle_ir_only_marks_caveat() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let out = temp_dir("ir-only");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::IrOnly).unwrap();
        assert!(bundle.manifest.caveats.iter().any(|c| c.contains("M6b.2")));
        assert!(!out.join("input.pdf").exists());
    }

    /// Exporting twice into the same directory yields an identical manifest.
    #[test]
    fn rerun_is_idempotent() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let out = temp_dir("idempotent");
        let _b1 = export_bundle_from_bytes(&pdf, &out, BundleMode::HashOnly).unwrap();
        let m1 = std::fs::read(out.join("manifest.json")).unwrap();
        let _b2 = export_bundle_from_bytes(&pdf, &out, BundleMode::HashOnly).unwrap();
        let m2 = std::fs::read(out.join("manifest.json")).unwrap();
        assert_eq!(m1, m2, "manifest.json must be byte-identical on rerun");
    }

    /// The digest of the empty input matches the published SHA-256 value.
    #[test]
    fn input_sha256_matches_known_value() {
        let out = temp_dir("hash-empty");
        let bundle = export_bundle_from_bytes(b"", &out, BundleMode::HashOnly).unwrap();
        assert_eq!(
            bundle.manifest.input_sha256,
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
    }

    /// `classification.json` deserializes back into a `ClassificationReport`.
    #[test]
    fn bundle_classification_round_trip() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\" x-formcalc");
        let out = temp_dir("classify");
        let _b = export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();
        let raw = std::fs::read(out.join("classification.json")).unwrap();
        let parsed: ClassificationReport = serde_json::from_slice(&raw).unwrap();
        assert_eq!(parsed.tier.as_str(), "C");
    }

    /// The manifest's file list is already in sorted order.
    #[test]
    fn manifest_lists_files_sorted() {
        let pdf = fake_pdf(b"/XFA layout=\"tb\"");
        let out = temp_dir("sorted");
        let bundle = export_bundle_from_bytes(&pdf, &out, BundleMode::Full).unwrap();
        let mut sorted = bundle.manifest.files.clone();
        sorted.sort();
        assert_eq!(bundle.manifest.files, sorted);
    }

    /// Pins the schema version to the literal `1` so that a bump has to be
    /// made deliberately here as well.
    #[test]
    fn schema_version_is_one() {
        let out = temp_dir("schema");
        let bundle =
            export_bundle_from_bytes(b"%PDF-1.7\n%%EOF", &out, BundleMode::HashOnly).unwrap();
        assert_eq!(bundle.manifest.schema_version, 1);
    }
}