#![forbid(unsafe_code)]
#![warn(rust_2024_compatibility, missing_docs, missing_debug_implementations)]
#![allow(
clippy::disallowed_methods,
clippy::disallowed_types,
reason = "the CLI is a synchronous adapter per spec; Tokio is reserved for future async \
service integration"
)]
use std::{
fs::File,
io::{self, Read, Write},
num::NonZeroU32,
path::{Path, PathBuf},
process::ExitCode,
time::Instant,
};
use anyhow::{Context, Result};
use clap::{Args, Parser, Subcommand, ValueEnum};
use pdfv_core::{
BatchReport, BoundedText, BuiltinProfileRepository, FeatureSelection, FlavourSelection,
MaxDisplayedFailures, MetadataRepairOptions, MetadataRepairer, ObjectTypeName, PasswordSecret,
PdfvError, PolicySet, RepairBatchReport, RepairReport, ReportFormat, ResourceLimits,
ValidationFlavour, ValidationOptions, ValidationStatus, ValidationWarning, Validator,
};
use rayon::prelude::*;
use serde::Deserialize;
// Process exit codes returned by the CLI.
// Codes 0-4 are derived from validation outcomes (see `CliExit::code`);
// 64 is returned for command-line and configuration errors and 70 for
// internal errors (see `main` and `exit_for_error`).
// NOTE(review): 64 and 70 coincide with the BSD sysexits values EX_USAGE and
// EX_SOFTWARE - presumably intentional; confirm.
const EXIT_VALID: u8 = 0;
const EXIT_INVALID: u8 = 1;
const EXIT_PARSE_FAILED: u8 = 2;
const EXIT_ENCRYPTED: u8 = 3;
const EXIT_INCOMPLETE: u8 = 4;
const EXIT_USAGE: u8 = 64;
const EXIT_INTERNAL: u8 = 70;
// Largest value `parse_jobs` accepts for `--jobs`.
const MAX_CLI_JOBS: u32 = 256;
// Hard caps checked by `validated_resource_limits`: a resource limit above its
// cap is rejected as a configuration error instead of being passed on.
const HARD_MAX_FILE_BYTES: u64 = 1024 * 1024 * 1024;
const HARD_MAX_OBJECTS: u64 = 8_388_607;
const HARD_MAX_OBJECT_DEPTH: u32 = 512;
const HARD_MAX_ARRAY_LEN: u64 = 1_000_000;
const HARD_MAX_DICT_ENTRIES: u64 = 100_000;
const HARD_MAX_NAME_BYTES: usize = 4096;
// Also used as the cap for `maxDecryptedStringBytes`.
const HARD_MAX_STRING_BYTES: usize = 16 * 1024 * 1024;
const HARD_MAX_PASSWORD_BYTES: usize = 4096;
// Shared cap for the declared, decoded and decrypted stream size limits.
const HARD_MAX_STREAM_BYTES: u64 = 1024 * 1024 * 1024;
const HARD_MAX_PARSE_FACTS: usize = 1_000_000;
const HARD_MAX_ENCRYPTION_DICT_ENTRIES: u64 = 1024;
const HARD_MAX_MEMORY_SOURCE_THRESHOLD_BYTES: u64 = HARD_MAX_FILE_BYTES;
// Largest policy file `load_policy_file` will read (1 MiB).
const MAX_POLICY_FILE_BYTES: u64 = 1024 * 1024;
// Top-level command-line interface of the `pdfv` binary.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc
// comments into help text, and `about` is already set explicitly below.
#[derive(Debug, Parser)]
#[command(name = "pdfv", version, about = "Validate PDF conformance")]
struct Cli {
// The subcommand selected on the command line; dispatched by `run`.
#[command(subcommand)]
command: Command,
}
// Subcommands of the `pdfv` binary, dispatched by `run`.
// The `///` comments on the variants are picked up by clap's derive as the
// help text of each subcommand; previously the subcommands had none.
#[derive(Debug, Subcommand)]
enum Command {
    /// Validate one or more PDF files and report their conformance
    Validate(Box<ValidateArgs>),
    /// Repair the metadata of one or more PDF files
    RepairMetadata(Box<RepairMetadataArgs>),
    /// Inspect the built-in validation profiles
    Profiles {
        #[command(subcommand)]
        command: ProfilesCommand,
    },
}
// Actions available under the `profiles` subcommand.
// The `///` comment on the variant becomes its help text via clap's derive.
#[derive(Debug, Subcommand)]
enum ProfilesCommand {
    /// List the built-in profiles as tab-separated lines (id, flavour, rule coverage, source, name)
    List,
}
// Arguments of the `validate` subcommand.
// The `///` comments on the fields are picked up by clap's derive as the
// `--help` description of each argument; previously no flag was described.
#[derive(Debug, Args)]
#[allow(
    clippy::struct_excessive_bools,
    reason = "CLI flags are independent clap switches; grouping them would make the command \
              surface less direct"
)]
struct ValidateArgs {
    /// PDF files to validate; directories are traversed when --recursive is set
    #[arg(value_name = "PATH", required = true)]
    paths: Vec<PathBuf>,
    /// Report output format (defaults to the configured output format, or json)
    #[arg(long, value_enum)]
    format: Option<FormatArg>,
    /// Validation flavour: "auto" or an explicit flavour such as pdfa-1b, pdfa-4, pdfua-1,
    /// pdfua-2-iso32005 or wtpdf-1-0-reuse
    #[arg(long, value_parser = parse_flavour_selection)]
    flavour: Option<FlavourSelection>,
    /// Default flavour handed to automatic flavour selection
    #[arg(long, alias = "defaultflavour", value_parser = parse_flavour, conflicts_with = "profile")]
    default_flavour: Option<ValidationFlavour>,
    /// Path to a custom validation profile (used instead of --flavour)
    #[arg(long, value_name = "PATH", conflicts_with = "flavour")]
    profile: Option<PathBuf>,
    /// Maximum failed assertions per rule: a positive integer, or -1 for the largest supported
    /// limit
    #[arg(long, allow_hyphen_values = true, value_parser = parse_max_failures)]
    max_failures: Option<MaxDisplayedFailures>,
    /// Traverse directories given as inputs and always emit a batch report
    #[arg(long, alias = "recurse")]
    recursive: bool,
    /// During recursive discovery, also include files without a .pdf extension
    #[arg(long, alias = "nonpdfext")]
    non_pdf_extension: bool,
    /// Number of worker threads (1 to 256)
    #[arg(long, default_value = "1", value_parser = parse_jobs)]
    jobs: NonZeroU32,
    /// Path to a configuration file
    #[arg(long, value_name = "PATH")]
    config: Option<PathBuf>,
    /// Write the report to this file instead of standard output
    #[arg(long, value_name = "PATH")]
    output: Option<PathBuf>,
    /// Remove input paths from the reports
    #[arg(long)]
    redact_paths: bool,
    /// Record passed assertions in addition to failed ones
    #[arg(long)]
    record_passes: bool,
    /// Feature families to extract: comma-separated names, or "all" (also used when the flag is
    /// given without a value)
    #[arg(long, value_name = "FEATURES", num_args = 0..=1, default_missing_value = "all")]
    extract: Option<String>,
    /// Path to a YAML policy file
    #[arg(long, value_name = "PATH")]
    policy_file: Option<PathBuf>,
    /// Read the document password from standard input
    #[arg(long, conflicts_with_all = ["password_file", "password_env"])]
    password_stdin: bool,
    /// Read the document password from this file
    #[arg(long, value_name = "PATH", conflicts_with_all = ["password_stdin", "password_env"])]
    password_file: Option<PathBuf>,
    /// Read the document password from this environment variable
    #[arg(long, value_name = "ENV_VAR", conflicts_with_all = ["password_stdin", "password_file"])]
    password_env: Option<String>,
}
// Arguments of the `repair-metadata` subcommand.
// The `///` comments on the fields are picked up by clap's derive as the
// `--help` description of each argument; previously no flag was described.
#[derive(Debug, Args)]
struct RepairMetadataArgs {
    /// PDF files to repair
    #[arg(value_name = "PATH", required = true)]
    paths: Vec<PathBuf>,
    /// Output directory handed to the repair engine
    #[arg(long, value_name = "DIR")]
    output_dir: PathBuf,
    /// Output name prefix handed to the repair engine
    #[arg(long, default_value = "")]
    prefix: String,
    /// Report output format (default: json)
    #[arg(long, value_enum)]
    format: Option<FormatArg>,
    /// Validation flavour: "auto" or an explicit flavour such as pdfa-1b
    #[arg(long, value_parser = parse_flavour_selection)]
    flavour: Option<FlavourSelection>,
    /// Number of worker threads (1 to 256)
    #[arg(long, default_value = "1", value_parser = parse_jobs)]
    jobs: NonZeroU32,
    /// Write the report to this file instead of standard output
    #[arg(long, value_name = "PATH")]
    output: Option<PathBuf>,
    /// Remove input and output paths from the reports
    #[arg(long)]
    redact_paths: bool,
}
// Values accepted by `--format`; converted to `ReportFormat` through the
// `From<FormatArg>` impl.
// Plain `//` comments are used because clap's derive would turn `///`
// comments on the variants into help text for the possible values.
#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
enum FormatArg {
Json,
JsonPretty,
Text,
Xml,
// Maps to the same report format as `Xml`.
Mrr,
// Maps to `ReportFormat::RawXml`.
Raw,
Html,
}
impl From<FormatArg> for ReportFormat {
fn from(value: FormatArg) -> Self {
match value {
FormatArg::Json => Self::Json,
FormatArg::JsonPretty => Self::JsonPretty,
FormatArg::Text => Self::Text,
FormatArg::Xml | FormatArg::Mrr => Self::Xml,
FormatArg::Raw => Self::RawXml,
FormatArg::Html => Self::Html,
}
}
}
impl FormatArg {
fn into_report_format(self) -> ReportFormat {
ReportFormat::from(self)
}
}
fn main() -> ExitCode {
let cli = match Cli::try_parse() {
Ok(cli) => cli,
Err(error) => {
let exit = if matches!(error.kind(), clap::error::ErrorKind::DisplayVersion) {
EXIT_VALID
} else {
EXIT_USAGE
};
if let Err(write_error) = error.print() {
let _ = writeln!(io::stderr(), "failed to write CLI error: {write_error}");
return ExitCode::from(EXIT_INTERNAL);
}
return ExitCode::from(exit);
}
};
match run(cli) {
Ok(exit) => ExitCode::from(exit.code()),
Err(error) => {
let exit = exit_for_error(error.downcast_ref::<PdfvError>());
let _ = writeln!(io::stderr(), "{error:#}");
ExitCode::from(exit)
}
}
}
fn run(cli: Cli) -> Result<CliExit> {
match cli.command {
Command::Validate(args) => run_validate(&args),
Command::RepairMetadata(args) => run_repair_metadata(&args),
Command::Profiles {
command: ProfilesCommand::List,
} => run_profiles_list(),
}
}
fn run_validate(args: &ValidateArgs) -> Result<CliExit> {
let started = Instant::now();
let config = args
.config
.as_ref()
.map(|path| load_cli_config(path))
.transpose()?
.unwrap_or_default();
let format = args.format.map_or_else(
|| config.output.format.into(),
FormatArg::into_report_format,
);
let options = validation_options(args, &config)?;
let validator = Validator::new(options).context("failed to initialize validator")?;
let paths = discover_inputs(&args.paths, args.recursive, args.non_pdf_extension)?;
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(usize::try_from(args.jobs.get()).unwrap_or(usize::MAX))
.build()
.context("failed to build validation worker pool")?;
let mut batch = pool.install(|| validate_paths(&validator, &paths));
let reports = &mut batch.reports;
if args.redact_paths || config.output.redact_paths {
redact_report_paths(reports);
}
let exit = reports
.iter()
.map(|report| CliExit::from_status(report.status))
.fold(CliExit::Valid, CliExit::worst);
let exit = if batch.internal_errors > 0 {
CliExit::worst(exit, CliExit::Internal)
} else {
exit
};
if let Some(output_path) = args.output.as_ref().or(config.output.path.as_ref()) {
let mut output = File::create(output_path)
.with_context(|| format!("failed to create {}", output_path.display()))?;
write_reports(format, batch, started, args.recursive, &mut output)?;
output.flush().context("failed to flush report output")?;
} else {
let stdout = io::stdout();
let mut handle = stdout.lock();
write_reports(format, batch, started, args.recursive, &mut handle)?;
handle.flush().context("failed to flush report output")?;
}
Ok(exit)
}
/// Runs the `repair-metadata` subcommand.
///
/// Builds the repair engine from the CLI arguments and the default resource
/// limits, repairs every input on a dedicated rayon pool with `--jobs`
/// threads and writes the repair report to `--output` or standard output.
/// Directories are not traversed for this subcommand.
///
/// Returns the exit status computed by `repair_exit`.
///
/// # Errors
/// Fails when the options, repair engine, input list or worker pool cannot be
/// set up, or when the report cannot be written.
fn run_repair_metadata(args: &RepairMetadataArgs) -> Result<CliExit> {
    let started = Instant::now();
    let format = args
        .format
        .map_or(ReportFormat::Json, FormatArg::into_report_format);
    let validation_options = ValidationOptions::builder()
        .flavour(args.flavour.clone().unwrap_or_default())
        .resource_limits(validated_resource_limits(ResourceLimits::default())?)
        .build();
    let repair_options =
        MetadataRepairOptions::new(validation_options, &args.output_dir, args.prefix.clone())?;
    let repairer =
        MetadataRepairer::new(repair_options).context("failed to initialize repair engine")?;
    let paths = discover_inputs(&args.paths, false, false)?;
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(usize::try_from(args.jobs.get()).unwrap_or(usize::MAX))
        .build()
        .context("failed to build repair worker pool")?;
    let mut reports = pool.install(|| repair_paths(&repairer, &paths));
    if args.redact_paths {
        redact_repair_paths(&mut reports);
    }
    let batch = RepairBatchReport::from_items(reports, Vec::new(), started.elapsed());
    let exit = repair_exit(&batch);
    if let Some(output_path) = &args.output {
        let file = File::create(output_path)
            .with_context(|| format!("failed to create {}", output_path.display()))?;
        // Buffer the file so the report writer does not issue one system call
        // per small write; the explicit flush below surfaces write errors.
        let mut output = io::BufWriter::new(file);
        write_repair_reports(format, &batch, &mut output)?;
        output
            .flush()
            .context("failed to flush repair report output")?;
    } else {
        let stdout = io::stdout();
        let mut handle = stdout.lock();
        write_repair_reports(format, &batch, &mut handle)?;
        handle
            .flush()
            .context("failed to flush repair report output")?;
    }
    Ok(exit)
}
/// Runs `profiles list`: prints one tab-separated line per built-in profile
/// to standard output.
///
/// Columns: profile id, display flavour, coverage percentage, executable rule
/// count, total rule count, source pin, source file, profile name.
///
/// # Errors
/// Fails when the profiles cannot be listed or standard output cannot be
/// written or flushed.
fn run_profiles_list() -> Result<CliExit> {
    let stdout = io::stdout();
    let mut out = stdout.lock();
    let repository = BuiltinProfileRepository::new();
    let profiles = repository
        .list_profiles()
        .context("failed to list profiles")?;
    for profile in profiles {
        let executable = profile.coverage.executable_rules;
        let total = profile.coverage.total_rules;
        writeln!(
            out,
            "{}\t{}\t{}%\t{}\t{}\t{}\t{}\t{}",
            profile.identity.id.as_str(),
            profile.display_flavour.as_str(),
            coverage_percent(executable, total),
            executable,
            total,
            profile.source_pin.as_str(),
            profile.source_file.as_str(),
            profile.identity.name.as_str(),
        )
        .context("failed to write profile list")?;
    }
    out.flush().context("failed to flush profile list")?;
    Ok(CliExit::Valid)
}
/// Returns `executable` as a whole-number percentage of `total`, rounded
/// down.
///
/// The multiplication by 100 saturates instead of overflowing, and a `total`
/// of zero yields 0 rather than dividing by zero.
fn coverage_percent(executable: u64, total: u64) -> u64 {
    let scaled = executable.saturating_mul(100);
    match scaled.checked_div(total) {
        Some(percent) => percent,
        None => 0,
    }
}
fn write_reports<W: Write>(
format: ReportFormat,
batch: ValidationBatch,
started: Instant,
force_batch: bool,
output: &mut W,
) -> Result<()> {
if batch.reports.len() == 1 && batch.internal_errors == 0 && !force_batch {
let reports = batch.reports;
let Some(report) = reports.first() else {
return Err(anyhow::anyhow!("validation produced no reports"));
};
format
.write_report(report, output)
.context("failed to write validation report")?;
} else {
let report = BatchReport::from_items_with_internal_errors(
batch.reports,
batch.warnings,
started.elapsed(),
batch.internal_errors,
);
format
.write_batch(&report, output)
.context("failed to write batch report")?;
}
Ok(())
}
/// Serializes the repair outcome to `output` in the requested `format`.
///
/// A batch holding exactly one item is written as a single repair report;
/// any other batch is written as a batch report.
///
/// # Errors
/// Fails when the report cannot be written.
fn write_repair_reports<W: Write>(
    format: ReportFormat,
    batch: &RepairBatchReport,
    output: &mut W,
) -> Result<()> {
    if batch.items.len() != 1 {
        format
            .write_repair_batch(batch, output)
            .context("failed to write repair batch report")?;
        return Ok(());
    }
    let report = batch
        .items
        .first()
        .ok_or_else(|| anyhow::anyhow!("repair produced no reports"))?;
    format
        .write_repair_report(report, output)
        .context("failed to write repair report")?;
    Ok(())
}
fn validation_options(args: &ValidateArgs, config: &CliConfig) -> Result<ValidationOptions> {
let flavour = args.profile.as_ref().map_or_else(
|| {
config
.validation
.flavour_selection(args.flavour.clone(), args.default_flavour.clone())
},
|profile_path| {
Ok(FlavourSelection::CustomProfile {
profile_path: profile_path.clone(),
})
},
)?;
let resource_limits = validated_resource_limits(config.resources.clone().unwrap_or_default())?;
let password = resolve_password(args, config, resource_limits.max_password_bytes)?;
let policy = args
.policy_file
.as_ref()
.map(|path| load_policy_file(path))
.transpose()?;
let feature_selection = if policy.is_some() {
parse_feature_selection(args.extract.as_deref().unwrap_or("all"))?
} else {
args.extract
.as_deref()
.map(parse_feature_selection)
.transpose()?
.unwrap_or_default()
};
Ok(ValidationOptions::builder()
.flavour(flavour)
.resource_limits(resource_limits)
.password(password)
.max_failed_assertions_per_rule(
args.max_failures
.or(config.validation.max_failed_assertions_per_rule)
.unwrap_or_default(),
)
.record_passed_assertions(args.record_passes || config.validation.record_passed_assertions)
.feature_selection(feature_selection)
.policy(policy)
.build())
}
fn resolve_password(
args: &ValidateArgs,
config: &CliConfig,
max_password_bytes: usize,
) -> Result<Option<PasswordSecret>> {
let cli_source = PasswordSource::from_args(args)?;
let config_source = config.validation.password_source()?;
let Some(source) = cli_source.or(config_source) else {
return Ok(None);
};
let value = match source {
PasswordSource::Stdin => {
let mut bytes = Vec::new();
let read_limit = u64::try_from(max_password_bytes)
.unwrap_or(u64::MAX)
.saturating_add(1);
io::stdin()
.take(read_limit)
.read_to_end(&mut bytes)
.context("failed to read password from stdin")?;
if bytes.len() > max_password_bytes {
return Err(password_config_error(
"passwordStdin",
"password stdin exceeds byte limit",
)
.into());
}
let value = String::from_utf8(bytes)
.map_err(|_| password_config_error("passwordStdin", "password is not UTF-8"))?;
trim_one_line_ending(value)
}
PasswordSource::File(path) => {
let metadata = std::fs::metadata(&path)
.with_context(|| format!("failed to inspect password file {}", path.display()))?;
if metadata.is_dir() {
return Err(
password_config_error("passwordFile", "password file is a directory").into(),
);
}
let max = u64::try_from(max_password_bytes).unwrap_or(u64::MAX);
if metadata.len() > max {
return Err(password_config_error(
"passwordFile",
"password file exceeds byte limit",
)
.into());
}
let value = std::fs::read_to_string(&path)
.with_context(|| format!("failed to read password file {}", path.display()))?;
trim_one_line_ending(value)
}
PasswordSource::Env(name) => std::env::var(&name)
.with_context(|| format!("failed to read password environment variable {name}"))?,
};
PasswordSecret::new_with_limit(value, max_password_bytes)
.map(Some)
.map_err(anyhow::Error::from)
}
/// Removes at most one trailing line ending (`"\r\n"`, `"\n"` or `"\r"`)
/// from `value` and returns the result; any other content is left untouched.
fn trim_one_line_ending(mut value: String) -> String {
    // `"\r\n"` is tried first so it is removed as a single line ending.
    let kept_len = value
        .strip_suffix("\r\n")
        .or_else(|| value.strip_suffix('\n'))
        .or_else(|| value.strip_suffix('\r'))
        .map_or(value.len(), str::len);
    value.truncate(kept_len);
    value
}
/// clap value parser for `--max-failures`.
///
/// Accepts `-1`, which selects `NonZeroU32::MAX`, or an integer in
/// `1..=u32::MAX`. Anything else is rejected with a message for clap to show.
fn parse_max_failures(value: &str) -> std::result::Result<MaxDisplayedFailures, String> {
    let requested: i64 = value
        .parse()
        .map_err(|_| String::from("max failures must be -1 or a positive integer"))?;
    if requested == -1 {
        return Ok(MaxDisplayedFailures::new(NonZeroU32::MAX));
    }
    if !(1..=i64::from(u32::MAX)).contains(&requested) {
        return Err(format!(
            "max failures must be -1 or an integer in 1..={}",
            u32::MAX
        ));
    }
    let narrowed =
        u32::try_from(requested).map_err(|_| String::from("max failures exceeds u32 range"))?;
    match NonZeroU32::new(narrowed) {
        Some(limit) => Ok(MaxDisplayedFailures::new(limit)),
        None => Err(String::from("max failures must be greater than zero")),
    }
}
/// clap value parser for `--jobs`: accepts an integer in `1..=MAX_CLI_JOBS`.
fn parse_jobs(value: &str) -> std::result::Result<NonZeroU32, String> {
    let requested = value
        .parse::<u32>()
        .map_err(|_| String::from("jobs must be a positive integer"))?;
    match NonZeroU32::new(requested) {
        None => Err(String::from("jobs must be greater than zero")),
        Some(jobs) if jobs.get() > MAX_CLI_JOBS => {
            Err(format!("jobs must be in 1..={MAX_CLI_JOBS}"))
        }
        Some(jobs) => Ok(jobs),
    }
}
/// Parses a flavour selection: the literal `auto` selects automatic detection
/// without a default; anything else must be an explicit flavour accepted by
/// `parse_flavour`.
fn parse_flavour_selection(value: &str) -> std::result::Result<FlavourSelection, String> {
    match value {
        "auto" => Ok(FlavourSelection::Auto { default: None }),
        explicit => {
            let flavour = parse_flavour(explicit)?;
            Ok(FlavourSelection::Explicit { flavour })
        }
    }
}
/// Attaches `default` to an automatic flavour selection.
///
/// Only `FlavourSelection::Auto` combined with `Some` default is rewritten;
/// every other selection is returned unchanged.
fn apply_default_flavour(
    selection: FlavourSelection,
    default: Option<ValidationFlavour>,
) -> FlavourSelection {
    match default {
        Some(fallback) if matches!(selection, FlavourSelection::Auto { .. }) => {
            FlavourSelection::Auto {
                default: Some(fallback),
            }
        }
        _ => selection,
    }
}
/// Parses the `--extract` value into a feature selection.
///
/// `all` selects every feature. Otherwise the value is a comma-separated list
/// of family names; entries are trimmed and empty entries are skipped.
///
/// # Errors
/// Fails when a family name is rejected by `ObjectTypeName::new`, when no
/// family remains, or when more than 64 families are listed.
fn parse_feature_selection(value: &str) -> Result<FeatureSelection> {
    const MAX_FEATURE_FAMILIES: usize = 64;
    if value == "all" {
        return Ok(FeatureSelection::All);
    }
    let mut families = Vec::new();
    for raw in value.split(',') {
        let name = raw.trim();
        if name.is_empty() {
            continue;
        }
        families.push(ObjectTypeName::new(name.to_owned())?);
    }
    match families.len() {
        0 => Err(feature_config_error("extract", "feature selection is empty").into()),
        count if count > MAX_FEATURE_FAMILIES => {
            Err(feature_config_error("extract", "too many feature families").into())
        }
        _ => Ok(FeatureSelection::Families { families }),
    }
}
/// Loads, parses and validates a YAML policy file.
///
/// Directories and files larger than `MAX_POLICY_FILE_BYTES` are rejected.
/// The size is checked on the metadata and again after a read capped at one
/// byte past the limit.
///
/// # Errors
/// Fails when the file cannot be inspected, opened or read, exceeds the size
/// limit, does not parse as a policy, or fails the policy's own validation.
fn load_policy_file(path: &Path) -> Result<PolicySet> {
    let too_large = || feature_config_error("policyFile", "policy file exceeds byte limit");
    let metadata = std::fs::metadata(path)
        .with_context(|| format!("failed to inspect policy file {}", path.display()))?;
    if metadata.is_dir() {
        return Err(feature_config_error("policyFile", "policy file is a directory").into());
    }
    if metadata.len() > MAX_POLICY_FILE_BYTES {
        return Err(too_large().into());
    }
    let reader = File::open(path)
        .with_context(|| format!("failed to open policy file {}", path.display()))?;
    let mut text = String::new();
    reader
        .take(MAX_POLICY_FILE_BYTES.saturating_add(1))
        .read_to_string(&mut text)
        .with_context(|| format!("failed to read policy file {}", path.display()))?;
    let text_len = u64::try_from(text.len()).unwrap_or(u64::MAX);
    if text_len > MAX_POLICY_FILE_BYTES {
        return Err(too_large().into());
    }
    let source = config::File::from_str(&text, config::FileFormat::Yaml);
    let loaded = config::Config::builder()
        .add_source(source)
        .build()
        .with_context(|| format!("failed to read policy file {}", path.display()))?;
    let policy: PolicySet = loaded
        .try_deserialize()
        .with_context(|| format!("failed to parse policy file {}", path.display()))?;
    policy
        .validate()
        .with_context(|| format!("invalid policy file {}", path.display()))?;
    Ok(policy)
}
fn parse_flavour(value: &str) -> std::result::Result<ValidationFlavour, String> {
if let Some(rest) = value.strip_prefix("pdfa-") {
return parse_pdfa_flavour(rest);
}
if let Some(rest) = value.strip_prefix("pdfua-") {
return parse_pdfua_flavour(rest);
}
if let Some(rest) = value.strip_prefix("wtpdf-") {
return parse_wtpdf_flavour(rest);
}
Err(String::from(
"expected auto, pdfa-1b, pdfa-4, pdfua-1, pdfua-2-iso32005, or wtpdf-1-0-reuse",
))
}
/// Parses the part after `pdfa-`: leading ASCII digits form the part number
/// and the remainder is the conformance level (`none` when absent), e.g.
/// `1b` is part 1 with conformance `b`.
fn parse_pdfa_flavour(rest: &str) -> std::result::Result<ValidationFlavour, String> {
    // ASCII digits are one byte each, so the digit count is a valid split index.
    let digit_count = rest.bytes().take_while(u8::is_ascii_digit).count();
    let (digits, level) = rest.split_at(digit_count);
    if digits.is_empty() {
        return Err(String::from("expected PDF/A flavour part"));
    }
    let number = digits
        .parse::<u32>()
        .map_err(|_| String::from("PDF/A part must be an integer"))?;
    let Some(part) = NonZeroU32::new(number) else {
        return Err(String::from("PDF/A part must be non-zero"));
    };
    let conformance = match level {
        "" => "none",
        given => given,
    };
    ValidationFlavour::new("pdfa", part, conformance)
        .map_err(|error| format!("invalid PDF/A flavour: {error}"))
}
/// Parses the part after `pdfua-`: a part number optionally followed by `-`
/// and a suffix that is used as the conformance (`none` when absent).
///
/// Part 2 is accepted only with the suffix `iso32005`.
fn parse_pdfua_flavour(rest: &str) -> std::result::Result<ValidationFlavour, String> {
    let (digits, suffix) = match rest.split_once('-') {
        Some(pieces) => pieces,
        None => (rest, ""),
    };
    let number = digits
        .parse::<u32>()
        .map_err(|_| String::from("PDF/UA part must be an integer"))?;
    let Some(part) = NonZeroU32::new(number) else {
        return Err(String::from("PDF/UA part must be non-zero"));
    };
    if part.get() == 2 && suffix != "iso32005" {
        return Err(String::from("PDF/UA-2 must be spelled pdfua-2-iso32005"));
    }
    let conformance = match suffix {
        "" => "none",
        given => given,
    };
    ValidationFlavour::new("pdfua", part, conformance)
        .map_err(|error| format!("invalid PDF/UA flavour: {error}"))
}
/// Parses the part after `wtpdf-`: only `1-0-reuse` and `1-0-accessibility`
/// are accepted; the level becomes the conformance of part 1.
fn parse_wtpdf_flavour(rest: &str) -> std::result::Result<ValidationFlavour, String> {
    match rest.strip_prefix("1-0-") {
        Some(level @ ("reuse" | "accessibility")) => {
            ValidationFlavour::new("wtpdf", NonZeroU32::MIN, level)
                .map_err(|error| format!("invalid WTPDF flavour: {error}"))
        }
        _ => Err(String::from(
            "WTPDF flavour must be wtpdf-1-0-reuse or wtpdf-1-0-accessibility",
        )),
    }
}
/// Loads the CLI configuration from `path` through the `config` crate; the
/// file is required.
///
/// # Errors
/// Fails when the file cannot be read or does not deserialize into
/// `CliConfig`.
fn load_cli_config(path: &Path) -> Result<CliConfig> {
    let source = config::File::from(path).required(true);
    let loaded = config::Config::builder()
        .add_source(source)
        .build()
        .with_context(|| format!("failed to read config {}", path.display()))?;
    loaded
        .try_deserialize()
        .with_context(|| format!("failed to parse config {}", path.display()))
}
fn discover_inputs(
paths: &[PathBuf],
recursive: bool,
include_non_pdf_extension: bool,
) -> Result<Vec<PathBuf>> {
let mut discovered = Vec::new();
for path in paths {
if recursive && path.is_dir() {
discover_directory(path, include_non_pdf_extension, &mut discovered)?;
} else {
discovered.push(path.clone());
}
}
if discovered.is_empty() {
return Err(
PdfvError::Configuration(pdfv_core::ConfigError::InvalidValue {
field: "paths",
reason: pdfv_core::BoundedText::new("no PDF inputs discovered", 256)?,
})
.into(),
);
}
Ok(discovered)
}
/// Walks `root` depth-first (iteratively) and appends every regular file that
/// qualifies as an input to `discovered`.
///
/// A file qualifies when it has a `.pdf` extension or when
/// `include_non_pdf_extension` is set. Entries that are neither directories
/// nor regular files are skipped.
///
/// Only the entries appended by this call are sorted: inputs already present
/// in `discovered` keep the order the caller gave them. Sorting the whole
/// accumulator would reorder earlier explicit inputs while leaving later ones
/// untouched.
///
/// # Errors
/// Fails when a directory or entry cannot be read, or when `discovered` would
/// grow past 100 000 files.
fn discover_directory(
    root: &Path,
    include_non_pdf_extension: bool,
    discovered: &mut Vec<PathBuf>,
) -> Result<()> {
    const MAX_DISCOVERED_FILES: usize = 100_000;
    let first_new = discovered.len();
    let mut stack = vec![root.to_path_buf()];
    while let Some(path) = stack.pop() {
        for entry in std::fs::read_dir(&path)
            .with_context(|| format!("failed to read directory {}", path.display()))?
        {
            let entry = entry
                .with_context(|| format!("failed to read directory entry in {}", path.display()))?;
            let entry_path = entry.path();
            let file_type = entry
                .file_type()
                .with_context(|| format!("failed to inspect {}", entry_path.display()))?;
            if file_type.is_dir() {
                stack.push(entry_path);
            } else if file_type.is_file() && (include_non_pdf_extension || is_pdf_path(&entry_path))
            {
                if discovered.len() >= MAX_DISCOVERED_FILES {
                    return Err(
                        PdfvError::Configuration(pdfv_core::ConfigError::InvalidValue {
                            field: "paths",
                            reason: pdfv_core::BoundedText::new(
                                "recursive discovery exceeded file limit",
                                256,
                            )?,
                        })
                        .into(),
                    );
                }
                discovered.push(entry_path);
            }
        }
    }
    // Directory iteration order is platform dependent; sort this root's files
    // so the result is deterministic.
    if let Some(added) = discovered.get_mut(first_new..) {
        added.sort();
    }
    Ok(())
}
/// Returns `true` when `path` has the extension `pdf`, compared ASCII
/// case-insensitively. Paths without an extension, or with an extension that
/// is not valid UTF-8, yield `false`.
fn is_pdf_path(path: &Path) -> bool {
    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => extension.eq_ignore_ascii_case("pdf"),
        None => false,
    }
}
/// Validates every path in parallel on the current rayon pool and collects
/// the outcome.
///
/// Successful validations contribute a report. A validation error or a worker
/// panic increments `internal_errors` and adds a general warning; panics are
/// caught per input so one bad file does not abort the batch.
///
/// The warning carries the full error chain (`{:#}`), not only the outermost
/// "failed to validate <path>" context, so the actual cause is reported. The
/// text is cut at a character boundary to at most 512 bytes so that a long
/// chain still fits the bound passed to `BoundedText::new`.
fn validate_paths(validator: &Validator, paths: &[PathBuf]) -> ValidationBatch {
    const MAX_WARNING_BYTES: usize = 512;
    let results = paths
        .par_iter()
        .map(|path| {
            let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                validator
                    .validate_path(path)
                    .with_context(|| format!("failed to validate {}", path.display()))
            }));
            match result {
                // `to_string()` would print only the outermost context.
                Ok(result) => result.map_err(|error| format!("{error:#}")),
                Err(_) => Err(format!(
                    "validation worker panicked while processing {}",
                    path.display()
                )),
            }
        })
        .collect::<Vec<_>>();
    let mut batch = ValidationBatch::default();
    for result in results {
        match result {
            Ok(report) => batch.reports.push(report),
            Err(mut message) => {
                batch.internal_errors = batch.internal_errors.saturating_add(1);
                if message.len() > MAX_WARNING_BYTES {
                    // Index 0 is always a boundary, so the loop terminates.
                    let mut end = MAX_WARNING_BYTES;
                    while !message.is_char_boundary(end) {
                        end = end.saturating_sub(1);
                    }
                    message.truncate(end);
                }
                batch.warnings.push(ValidationWarning::General {
                    message: BoundedText::new(message, MAX_WARNING_BYTES).unwrap_or_else(|_| {
                        BoundedText::new("internal validation error", 128)
                            .unwrap_or_else(|_| unreachable_bounded_text())
                    }),
                });
            }
        }
    }
    batch
}
/// Repairs every path in parallel on the current rayon pool and returns one
/// report per input, in input order.
///
/// A repair error or a worker panic is turned into a failed report by
/// `failed_repair_report`; panics are caught per input so one bad file does
/// not abort the batch.
///
/// The panic warning deliberately does not repeat the input path: the failed
/// report already carries it in its `source`, and a copy inside the warning
/// text would survive `redact_repair_paths`, which only clears the path
/// fields.
/// NOTE(review): the text of a repair error is passed through unchanged and
/// may itself mention the path - confirm against `pdfv_core`.
fn repair_paths(repairer: &MetadataRepairer, paths: &[PathBuf]) -> Vec<RepairReport> {
    paths
        .par_iter()
        .map(|path| {
            match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
                repairer.repair_path(path)
            })) {
                Ok(Ok(report)) => report,
                Ok(Err(error)) => failed_repair_report(path, &error.to_string()),
                Err(_) => failed_repair_report(path, "repair worker panicked"),
            }
        })
        .collect()
}
/// Builds the report for an input whose repair failed: status `Failed`, no
/// output path, no actions, no refusal, and a single general warning holding
/// `reason`.
///
/// When `reason` is rejected by the 512 bound of `BoundedText::new`, the
/// generic text from `unreachable_bounded_text` is used instead.
fn failed_repair_report(path: &Path, reason: &str) -> RepairReport {
    let input = pdfv_core::InputSummary::new(
        pdfv_core::InputKind::File,
        Some(path.to_path_buf()),
        None,
    );
    let message = BoundedText::new(reason, 512).unwrap_or_else(|_| unreachable_bounded_text());
    let warnings = vec![ValidationWarning::General { message }];
    RepairReport::builder()
        .engine_version(pdfv_core::ENGINE_VERSION.to_owned())
        .source(input)
        .output_path(None)
        .status(pdfv_core::RepairStatus::Failed)
        .actions(Vec::new())
        .refusal(None)
        .warnings(warnings)
        .task_durations(Vec::new())
        .build()
}
/// Clears the source path of every validation report.
fn redact_report_paths(reports: &mut [pdfv_core::ValidationReport]) {
    reports
        .iter_mut()
        .for_each(|report| report.source.path = None);
}
/// Clears the source path and the output path of every repair report.
fn redact_repair_paths(reports: &mut [RepairReport]) {
    reports.iter_mut().for_each(|report| {
        report.output_path = None;
        report.source.path = None;
    });
}
/// Maps a repair batch summary to the process exit status: any failed item
/// yields `Internal`, otherwise any refused item yields `ParseFailed`,
/// otherwise `Valid`.
fn repair_exit(batch: &RepairBatchReport) -> CliExit {
    let any_failed = batch.summary.failed > 0;
    let any_refused = batch.summary.refused > 0;
    match (any_failed, any_refused) {
        (true, _) => CliExit::Internal,
        (false, true) => CliExit::ParseFailed,
        (false, false) => CliExit::Valid,
    }
}
/// Checks every resource limit against its hard cap and returns `limits`
/// unchanged when all of them are within bounds.
///
/// The checks run in a fixed order and stop at the first violation, so the
/// reported field is the first offending one in that order.
///
/// # Errors
/// Returns a configuration error naming the first field above its cap.
fn validated_resource_limits(limits: ResourceLimits) -> Result<ResourceLimits> {
    let l = &limits;
    // File and object-graph shape.
    ensure_limit("maxFileBytes", l.max_file_bytes, HARD_MAX_FILE_BYTES)?;
    ensure_limit("maxObjects", l.max_objects, HARD_MAX_OBJECTS)?;
    ensure_limit_u32("maxObjectDepth", l.max_object_depth, HARD_MAX_OBJECT_DEPTH)?;
    ensure_limit("maxArrayLen", l.max_array_len, HARD_MAX_ARRAY_LEN)?;
    ensure_limit("maxDictEntries", l.max_dict_entries, HARD_MAX_DICT_ENTRIES)?;
    // Names, strings and passwords.
    ensure_limit_usize("maxNameBytes", l.max_name_bytes, HARD_MAX_NAME_BYTES)?;
    ensure_limit_usize("maxStringBytes", l.max_string_bytes, HARD_MAX_STRING_BYTES)?;
    ensure_limit_usize("maxPasswordBytes", l.max_password_bytes, HARD_MAX_PASSWORD_BYTES)?;
    // Decrypted strings share the plain string cap.
    let decrypted_string_bytes = l.max_decrypted_string_bytes;
    ensure_limit_usize("maxDecryptedStringBytes", decrypted_string_bytes, HARD_MAX_STRING_BYTES)?;
    // Streams: declared, decoded and decrypted sizes share one cap.
    let declared_bytes = l.max_stream_declared_bytes;
    ensure_limit("maxStreamDeclaredBytes", declared_bytes, HARD_MAX_STREAM_BYTES)?;
    let decode_bytes = l.max_stream_decode_bytes;
    ensure_limit("maxStreamDecodeBytes", decode_bytes, HARD_MAX_STREAM_BYTES)?;
    let decrypted_bytes = l.max_decrypted_stream_bytes;
    ensure_limit("maxDecryptedStreamBytes", decrypted_bytes, HARD_MAX_STREAM_BYTES)?;
    // Encryption dictionary, in-memory source threshold and parse facts.
    let encryption_entries = l.max_encryption_dict_entries;
    ensure_limit("maxEncryptionDictEntries", encryption_entries, HARD_MAX_ENCRYPTION_DICT_ENTRIES)?;
    let memory_threshold = l.memory_source_threshold_bytes;
    ensure_limit(
        "memorySourceThresholdBytes",
        memory_threshold,
        HARD_MAX_MEMORY_SOURCE_THRESHOLD_BYTES,
    )?;
    ensure_limit_usize("maxParseFacts", l.max_parse_facts, HARD_MAX_PARSE_FACTS)?;
    Ok(limits)
}
/// Builds the configuration error reported for an invalid password setting,
/// naming the offending `field` and carrying `reason` as bounded text.
fn password_config_error(field: &'static str, reason: &'static str) -> PdfvError {
    let reason = match BoundedText::new(reason, 128) {
        Ok(text) => text,
        Err(_) => unreachable_bounded_text(),
    };
    PdfvError::Configuration(pdfv_core::ConfigError::InvalidValue { field, reason })
}
/// Builds the configuration error reported for an invalid feature or policy
/// setting, naming the offending `field` and carrying `reason` as bounded
/// text.
fn feature_config_error(field: &'static str, reason: &'static str) -> PdfvError {
    let reason = match BoundedText::new(reason, 128) {
        Ok(text) => text,
        Err(_) => unreachable_bounded_text(),
    };
    PdfvError::Configuration(pdfv_core::ConfigError::InvalidValue { field, reason })
}
/// Succeeds when `value` does not exceed `max`; otherwise returns the
/// configuration error for `field` built by `config_limit_error`.
fn ensure_limit(field: &'static str, value: u64, max: u64) -> Result<()> {
    if value > max {
        return Err(config_limit_error(field, max).into());
    }
    Ok(())
}
/// `u32` variant of `ensure_limit`: widens both numbers to `u64` losslessly
/// and delegates.
fn ensure_limit_u32(field: &'static str, value: u32, max: u32) -> Result<()> {
    let wide_value = u64::from(value);
    let wide_max = u64::from(max);
    ensure_limit(field, wide_value, wide_max)
}
/// `usize` variant of `ensure_limit`: converts both numbers to `u64`,
/// clamping to `u64::MAX` should a conversion ever fail, and delegates.
fn ensure_limit_usize(field: &'static str, value: usize, max: usize) -> Result<()> {
    let widen = |number: usize| u64::try_from(number).unwrap_or(u64::MAX);
    ensure_limit(field, widen(value), widen(max))
}
/// Builds the configuration error reported when `field` exceeds its hard cap
/// `max`.
fn config_limit_error(field: &'static str, max: u64) -> PdfvError {
    let text = format!("value exceeds hard cap {max}");
    let reason = match BoundedText::new(text, 128) {
        Ok(bounded) => bounded,
        Err(_) => unreachable_bounded_text(),
    };
    PdfvError::Configuration(pdfv_core::ConfigError::InvalidValue { field, reason })
}
/// Last-resort diagnostic used when a bounded text cannot be constructed.
///
/// Aborts the process if even this fixed short text is rejected.
fn unreachable_bounded_text() -> BoundedText {
    match BoundedText::new("bounded diagnostic unavailable", 128) {
        Ok(text) => text,
        Err(_) => std::process::abort(),
    }
}
/// Outcome of validating a list of inputs, as collected by `validate_paths`.
#[derive(Debug, Default)]
struct ValidationBatch {
/// Reports of the inputs that were validated successfully.
reports: Vec<pdfv_core::ValidationReport>,
/// One general warning per input that failed with an error or a panic.
warnings: Vec<ValidationWarning>,
/// Number of inputs that failed with an error or a panic.
internal_errors: u64,
}
/// Contents of the file given with `--config`.
///
/// Keys are camelCase and unknown keys are rejected; every section is
/// optional.
#[derive(Clone, Debug, Default, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct CliConfig {
/// Validation settings; command-line flags take precedence over them.
#[serde(default)]
validation: ValidationConfig,
/// Resource limits; the core defaults are used when absent. They are
/// checked against the hard caps by `validated_resource_limits`.
#[serde(default)]
resources: Option<pdfv_core::ResourceLimits>,
/// Report output settings.
#[serde(default)]
output: OutputConfig,
}
/// The `validation` section of the configuration file.
#[derive(Clone, Debug, Default, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct ValidationConfig {
/// Flavour selection in the same syntax as `--flavour` (`auto` or an
/// explicit flavour); parsed lazily by `flavour_selection`.
#[serde(default)]
flavour: Option<String>,
/// Default flavour in the same syntax as `--default-flavour`.
#[serde(default)]
default_flavour: Option<String>,
/// Used when `--max-failures` is not given.
#[serde(default)]
max_failed_assertions_per_rule: Option<MaxDisplayedFailures>,
/// Combined with `--record-passes` by logical OR.
#[serde(default)]
record_passed_assertions: bool,
/// Password source; used only when no password flag is given.
#[serde(default)]
password: Option<PasswordConfig>,
}
impl ValidationConfig {
fn flavour_selection(
&self,
cli_flavour: Option<FlavourSelection>,
cli_default_flavour: Option<ValidationFlavour>,
) -> Result<FlavourSelection> {
let configured_flavour = self
.flavour
.as_deref()
.map(parse_flavour_selection)
.transpose()
.map_err(|message| anyhow::anyhow!("invalid config validation.flavour: {message}"))?;
let configured_default = self
.default_flavour
.as_deref()
.map(parse_flavour)
.transpose()
.map_err(|message| {
anyhow::anyhow!("invalid config validation.defaultFlavour: {message}")
})?;
let flavour = cli_flavour.or(configured_flavour).unwrap_or_default();
let default = cli_default_flavour.or(configured_default);
Ok(apply_default_flavour(flavour, default))
}
fn password_source(&self) -> Result<Option<PasswordSource>> {
self.password
.as_ref()
.map(PasswordSource::try_from_config)
.transpose()
}
}
/// The `validation.password` section of the configuration file.
///
/// Exactly one of the three sources must be set; this is enforced by
/// `PasswordSource::try_from_config`, not by deserialization.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct PasswordConfig {
/// Read the password from standard input.
#[serde(default)]
stdin: bool,
/// Read the password from this file.
#[serde(default)]
file: Option<PathBuf>,
/// Read the password from this environment variable.
#[serde(default)]
env: Option<String>,
}
/// Where the document password is read from; resolved to the actual secret
/// by `resolve_password`.
#[derive(Clone, Debug)]
enum PasswordSource {
/// Standard input.
Stdin,
/// The file at this path.
File(PathBuf),
/// The environment variable with this (already validated) name.
Env(String),
}
impl PasswordSource {
fn from_args(args: &ValidateArgs) -> Result<Option<Self>> {
let mut sources = Vec::new();
if args.password_stdin {
sources.push(Self::Stdin);
}
if let Some(path) = &args.password_file {
sources.push(Self::File(path.clone()));
}
if let Some(name) = &args.password_env {
sources.push(Self::Env(validate_env_name(name)?));
}
one_password_source(sources)
}
fn try_from_config(config: &PasswordConfig) -> Result<Self> {
let mut sources = Vec::new();
if config.stdin {
sources.push(Self::Stdin);
}
if let Some(path) = &config.file {
sources.push(Self::File(path.clone()));
}
if let Some(name) = &config.env {
sources.push(Self::Env(validate_env_name(name)?));
}
one_password_source(sources)?.ok_or_else(|| {
password_config_error("validation.password", "password source is empty").into()
})
}
}
/// Reduces the collected password sources to at most one.
///
/// Returns `None` for an empty list and the source itself for a single entry.
///
/// # Errors
/// Fails when two or more sources were collected.
fn one_password_source(sources: Vec<PasswordSource>) -> Result<Option<PasswordSource>> {
    let mut remaining = sources.into_iter();
    let first = remaining.next();
    if remaining.next().is_some() {
        return Err(password_config_error(
            "password",
            "exactly zero or one password source is allowed",
        )
        .into());
    }
    Ok(first)
}
/// Validates an environment variable name and returns an owned copy.
///
/// A valid name is 1 to 128 bytes long and consists only of ASCII letters,
/// ASCII digits and underscores.
///
/// # Errors
/// Fails with a configuration error for any other name.
fn validate_env_name(name: &str) -> Result<String> {
    let allowed = |byte: u8| byte == b'_' || byte.is_ascii_alphanumeric();
    if name.is_empty() || name.len() > 128 || !name.bytes().all(allowed) {
        return Err(
            password_config_error("passwordEnv", "environment variable name is invalid").into(),
        );
    }
    Ok(name.to_owned())
}
/// The `output` section of the configuration file.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
struct OutputConfig {
/// Report format used when `--format` is not given; defaults to JSON.
#[serde(default = "default_report_format")]
format: ConfigReportFormat,
/// Report file used when `--output` is not given; standard output is used
/// when both are absent.
#[serde(default)]
path: Option<PathBuf>,
/// Combined with `--redact-paths` by logical OR.
#[serde(default)]
redact_paths: bool,
}
impl Default for OutputConfig {
fn default() -> Self {
Self {
format: default_report_format(),
path: None,
redact_paths: false,
}
}
}
/// Report format as spelled in the configuration file.
///
/// Values are kebab-case (`json`, `json-pretty`, `text`, `xml`, `mrr`, `raw`,
/// `html`); the aliases below accept camelCase spellings as well.
#[derive(Clone, Copy, Debug, Deserialize)]
#[serde(rename_all = "kebab-case")]
enum ConfigReportFormat {
Json,
#[serde(alias = "jsonPretty")]
JsonPretty,
Text,
Xml,
/// Maps to the same report format as `Xml`.
Mrr,
/// Maps to `ReportFormat::RawXml`.
#[serde(alias = "rawXml")]
Raw,
Html,
}
impl From<ConfigReportFormat> for ReportFormat {
fn from(value: ConfigReportFormat) -> Self {
match value {
ConfigReportFormat::Json => Self::Json,
ConfigReportFormat::JsonPretty => Self::JsonPretty,
ConfigReportFormat::Text => Self::Text,
ConfigReportFormat::Xml | ConfigReportFormat::Mrr => Self::Xml,
ConfigReportFormat::Raw => Self::RawXml,
ConfigReportFormat::Html => Self::Html,
}
}
}
/// Report format used when the configuration does not set `output.format`.
/// Referenced by name from the serde `default` attribute on
/// `OutputConfig::format` and by `OutputConfig::default`.
fn default_report_format() -> ConfigReportFormat {
ConfigReportFormat::Json
}
/// Chooses the process exit code for a failed run.
///
/// * an unsupported profile selection exits with `EXIT_INCOMPLETE`;
/// * configuration, policy and repair errors exit with `EXIT_USAGE`;
/// * every other error, including one that is not a `PdfvError` (`None`),
///   exits with `EXIT_INTERNAL`.
fn exit_for_error(error: Option<&PdfvError>) -> u8 {
    let Some(error) = error else {
        return EXIT_INTERNAL;
    };
    match error {
        PdfvError::Profile(pdfv_core::ProfileError::UnsupportedSelection) => EXIT_INCOMPLETE,
        PdfvError::Configuration(_) | PdfvError::Policy(_) | PdfvError::Repair(_) => EXIT_USAGE,
        _ => EXIT_INTERNAL,
    }
}
/// Outcome of a CLI run, convertible to a process exit code with `code`.
///
/// The declaration order below is not the severity order; severity is
/// defined by `rank`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum CliExit {
Valid,
Invalid,
Incomplete,
Encrypted,
ParseFailed,
Internal,
}
impl CliExit {
    /// Maps a validation status to its exit status. Statuses without a
    /// dedicated exit status are reported as `Incomplete`.
    fn from_status(status: ValidationStatus) -> Self {
        match status {
            ValidationStatus::ParseFailed => Self::ParseFailed,
            ValidationStatus::Encrypted => Self::Encrypted,
            ValidationStatus::Invalid => Self::Invalid,
            ValidationStatus::Valid => Self::Valid,
            _ => Self::Incomplete,
        }
    }
    /// Returns the more severe of the two exit statuses; `left` wins a tie.
    fn worst(left: Self, right: Self) -> Self {
        if right.rank() > left.rank() {
            right
        } else {
            left
        }
    }
    /// Severity used by `worst`: identical to the numeric exit code, so a
    /// higher exit code is always considered worse.
    fn rank(self) -> u8 {
        self.code()
    }
    /// Process exit code for this status.
    fn code(self) -> u8 {
        match self {
            Self::Internal => EXIT_INTERNAL,
            Self::Incomplete => EXIT_INCOMPLETE,
            Self::Encrypted => EXIT_ENCRYPTED,
            Self::ParseFailed => EXIT_PARSE_FAILED,
            Self::Invalid => EXIT_INVALID,
            Self::Valid => EXIT_VALID,
        }
    }
}
// Unit tests for flavour parsing and for the exit-status ranking.
#[cfg(test)]
mod tests {
use pdfv_core::{FlavourSelection, ValidationStatus};
use super::{CliExit, parse_flavour_selection};
// `auto` parses to automatic selection without a default flavour.
#[test]
fn test_should_parse_auto_flavour() {
let result = parse_flavour_selection("auto");
assert!(matches!(
result,
Ok(FlavourSelection::Auto { default: None })
));
}
// A PDF/A flavour with a conformance letter parses as an explicit flavour.
#[test]
fn test_should_parse_pdfa_flavour() {
let result = parse_flavour_selection("pdfa-1b");
assert!(matches!(result, Ok(FlavourSelection::Explicit { .. })));
}
// Each listed PDF/A-4, PDF/UA and WTPDF spelling parses as an explicit flavour.
#[test]
fn test_should_parse_phase_13_builtin_flavours() {
for flavour in [
"pdfa-4",
"pdfa-4e",
"pdfua-1",
"pdfua-2-iso32005",
"wtpdf-1-0-reuse",
"wtpdf-1-0-accessibility",
] {
let result = parse_flavour_selection(flavour);
assert!(matches!(result, Ok(FlavourSelection::Explicit { .. })));
}
}
// Folding with `worst` keeps the parse failure (code 2) over invalid (code 1).
#[test]
fn test_should_rank_parse_failed_as_worst_exit() {
let exit = [ValidationStatus::Invalid, ValidationStatus::ParseFailed]
.into_iter()
.map(CliExit::from_status)
.fold(CliExit::Valid, CliExit::worst);
assert_eq!(exit.code(), 2);
}
// Incomplete (code 4) outranks parse failure (code 2): rank follows the exit code.
#[test]
fn test_should_rank_higher_exit_code_as_worst_exit() {
let exit = [ValidationStatus::ParseFailed, ValidationStatus::Incomplete]
.into_iter()
.map(CliExit::from_status)
.fold(CliExit::Valid, CliExit::worst);
assert_eq!(exit.code(), 4);
}
}