use std::fs;
use std::io::{Read, Write};
use std::path::Path;
use std::sync::Arc;
use crate::capability::Capability;
use crate::decoration::PageDecoration;
use crate::encrypt::EncryptOptions;
use crate::error::{internal_error, Error, Result};
use crate::form::{FormField, PdfFormMut};
use crate::license;
use crate::metadata::{Metadata, MetadataMut};
use crate::parity::{
CompressOptions, CompressReport, FontSubsetReport, ImageInsert, ImageInsertReport,
InsertImageFormat, ToImagesOptions, ToImagesReport,
};
use crate::watermark::{Rotation, WatermarkOptions};
/// Builder-style options for opening/parsing a PDF.
///
/// `#[non_exhaustive]` so new knobs can be added without a breaking change;
/// construct via [`OpenOptions::new`] and chain the `with_*` methods.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct OpenOptions {
    // Password tried for both the engine open and the lopdf decrypt pass.
    pub(crate) password: Option<String>,
    // Best-effort structural repair flag.
    // NOTE(review): not read anywhere in this file — confirm it is consumed
    // by the open pipeline elsewhere, otherwise it is dead.
    pub(crate) repair: bool,
    // Hard cap on input size in bytes; open fails if the input is larger.
    pub(crate) memory_limit: Option<usize>,
    // Per-document license key overriding the process-wide one.
    pub(crate) license_key: Option<String>,
    // Engine-level resource limits (file size, image pixels, ...).
    pub(crate) processing_limits: Option<pdf_engine::ProcessingLimits>,
}
impl OpenOptions {
    /// Default options: no password, no repair, no limits, no license override.
    pub fn new() -> Self {
        Self::default()
    }
    /// Password used to open protected documents.
    pub fn with_password(mut self, pw: impl Into<String>) -> Self {
        self.password = Some(pw.into());
        self
    }
    /// Enable/disable best-effort repair of damaged files.
    pub fn with_repair(mut self, repair: bool) -> Self {
        self.repair = repair;
        self
    }
    /// Fail the open if the input is larger than `bytes`.
    pub fn strict_memory_limit(mut self, bytes: usize) -> Self {
        self.memory_limit = Some(bytes);
        self
    }
    /// License key applied to this document only.
    pub fn with_license_key(mut self, key: impl Into<String>) -> Self {
        self.license_key = Some(key.into());
        self
    }
    /// Engine resource limits enforced while parsing/processing.
    pub fn with_processing_limits(mut self, limits: pdf_engine::ProcessingLimits) -> Self {
        self.processing_limits = Some(limits);
        self
    }
}
/// Builder-style options for saving a document to disk.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct SaveOptions {
    // Request linearized ("fast web view") output.
    // NOTE(review): not read by `save_with` in this file — confirm it is
    // wired up somewhere, otherwise setting it is a silent no-op.
    pub(crate) linearize: bool,
    // Allow clobbering an existing target file (defaults to false).
    pub(crate) overwrite: bool,
}
impl SaveOptions {
    /// Defaults: no linearization, refuse to overwrite existing files.
    pub fn new() -> Self {
        Self::default()
    }
    /// Request linearized output.
    pub fn with_linearize(mut self, v: bool) -> Self {
        self.linearize = v;
        self
    }
    /// Permit overwriting an existing target file.
    pub fn with_overwrite(mut self, v: bool) -> Self {
        self.overwrite = v;
        self
    }
}
/// High-level PDF document handle.
///
/// Holds two parallel representations of the same bytes: a `pdf_engine`
/// document (text extraction, rendering, geometry) and a `lopdf` document
/// used for mutation. Mutating paths re-serialise the lopdf side and re-open
/// both (see `refresh_from_lopdf`).
pub struct PdfDocument {
    engine: pdf_engine::PdfDocument,
    lopdf: lopdf::Document,
    // Per-document license key, consulted by `require_capability`.
    license_key_override: Option<String>,
    // Limits carried over from `OpenOptions`, re-checked by processing paths.
    processing_limits: Option<pdf_engine::ProcessingLimits>,
}
impl std::fmt::Debug for PdfDocument {
    // Manual impl: reports only the page count, not the inner documents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("PdfDocument")
            .field("page_count", &self.engine.page_count())
            .finish_non_exhaustive()
    }
}
fn open_engine_from_shared_bytes(
shared: Arc<Vec<u8>>,
password: Option<&str>,
processing_limits: Option<&pdf_engine::ProcessingLimits>,
) -> Result<pdf_engine::PdfDocument> {
match (password, processing_limits) {
(Some(pw), Some(limits)) => {
pdf_engine::PdfDocument::open_with_password_and_processing_limits(
shared,
pw,
limits.clone(),
)
}
(None, Some(limits)) => {
pdf_engine::PdfDocument::open_with_processing_limits(shared, limits.clone())
}
(Some(pw), None) => pdf_engine::PdfDocument::open_with_password(shared, pw),
(None, None) => pdf_engine::PdfDocument::open(shared),
}
.map_err(Into::into)
}
/// Parse the same shared buffer with lopdf, optionally decrypting.
fn load_lopdf_from_shared_bytes(
    shared: &Arc<Vec<u8>>,
    password: Option<&str>,
) -> Result<lopdf::Document> {
    match password {
        Some(pw) => {
            let mut doc = lopdf::Document::load_mem(shared.as_slice())?;
            // Decrypt failure is deliberately ignored here — presumably the
            // engine open (which runs first with the same password) has
            // already rejected wrong passwords. TODO confirm.
            let _ = doc.decrypt(pw);
            Ok(doc)
        }
        None => lopdf::Document::load_mem(shared.as_slice()).map_err(Into::into),
    }
}
impl PdfDocument {
    /// Open a PDF from `path` with default options.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
        Self::open_with(path, OpenOptions::new())
    }
#[cfg_attr(
feature = "tracing",
tracing::instrument(
target = "pdfluent",
skip(opts),
fields(path = %path.as_ref().display())
)
)]
pub fn open_with<P: AsRef<Path>>(path: P, opts: OpenOptions) -> Result<Self> {
license::require_capability(Capability::PdfParse)?;
let path_ref = path.as_ref();
let processing_file_cap: Option<u64> =
opts.processing_limits.as_ref().map(|l| l.max_file_bytes);
if opts.memory_limit.is_some() || processing_file_cap.is_some() {
let metadata = fs::metadata(path_ref).map_err(|source| match source.kind() {
std::io::ErrorKind::NotFound => Error::FileNotFound {
path: path_ref.to_path_buf(),
},
_ => Error::Io {
source,
path: Some(path_ref.to_path_buf()),
},
})?;
let size_u64 = metadata.len();
if let Some(limit_bytes) = processing_file_cap {
if size_u64 > limit_bytes {
return Err(Error::ResourceLimitExceeded {
kind: crate::error::ResourceLimitKind::FileTooLarge,
observed: size_u64,
limit: limit_bytes,
});
}
}
if let Some(limit) = opts.memory_limit {
let size = size_u64 as usize;
if size > limit {
return Err(Error::MemoryBudgetExceeded {
requested: size,
limit,
});
}
}
}
let bytes = fs::read(path_ref).map_err(|source| match source.kind() {
std::io::ErrorKind::NotFound => Error::FileNotFound {
path: path_ref.to_path_buf(),
},
_ => Error::Io {
source,
path: Some(path_ref.to_path_buf()),
},
})?;
Self::from_bytes_with(&bytes, opts)
}
    /// Parse a PDF from an in-memory byte slice with default options.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self> {
        Self::from_bytes_with(bytes, OpenOptions::new())
    }
#[cfg_attr(
feature = "tracing",
tracing::instrument(
target = "pdfluent",
skip(bytes, opts),
fields(len = bytes.len())
)
)]
pub fn from_bytes_with(bytes: &[u8], opts: OpenOptions) -> Result<Self> {
license::require_capability(Capability::PdfParse)?;
if let Some(ref limits) = opts.processing_limits {
let len_u64 = bytes.len() as u64;
if len_u64 > limits.max_file_bytes {
return Err(Error::ResourceLimitExceeded {
kind: crate::error::ResourceLimitKind::FileTooLarge,
observed: len_u64,
limit: limits.max_file_bytes,
});
}
}
if let Some(limit) = opts.memory_limit {
if bytes.len() > limit {
return Err(Error::MemoryBudgetExceeded {
requested: bytes.len(),
limit,
});
}
}
let shared = Arc::new(bytes.to_vec());
let engine = open_engine_from_shared_bytes(
shared.clone(),
opts.password.as_deref(),
opts.processing_limits.as_ref(),
)?;
let lopdf = load_lopdf_from_shared_bytes(&shared, opts.password.as_deref())?;
Ok(Self {
engine,
lopdf,
license_key_override: opts.license_key.clone(),
processing_limits: opts.processing_limits.clone(),
})
}
pub fn from_reader<R: Read>(mut reader: R) -> Result<Self> {
license::require_capability(Capability::PdfParse)?;
let mut bytes = Vec::new();
reader
.read_to_end(&mut bytes)
.map_err(|source| Error::Io { source, path: None })?;
Self::from_bytes(&bytes)
}
    /// Build a new blank single-page (612x792 pt, US Letter) PDF 1.7 document.
    pub fn create() -> Self {
        use lopdf::{dictionary, Document as LoDoc, Object};
        let mut doc = LoDoc::with_version("1.7");
        // Reserve the Pages id first so the Page can reference its parent.
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Resources" => dictionary! {},
        });
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => vec![page_id.into()],
                "Count" => 1,
            }),
        );
        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);
        // Round-trip through bytes so both internal representations are
        // constructed by the normal open path.
        let mut buf: Vec<u8> = Vec::new();
        doc.save_to(&mut buf)
            .expect("serialising an in-memory lopdf::Document is infallible");
        Self::from_bytes(&buf).expect("freshly-built PDF must re-parse")
    }
    /// Number of pages in the document.
    pub fn page_count(&self) -> usize {
        self.engine.page_count()
    }
    /// PDF version taken from the lopdf header, defaulting to 1.7 when the
    /// version string does not parse.
    pub fn version(&self) -> PdfVersion {
        let s = self.lopdf.version.as_str();
        PdfVersion::parse(s).unwrap_or(PdfVersion { major: 1, minor: 7 })
    }
    /// Extract all text in one pass via the engine's whole-document extractor.
    pub fn text(&self) -> Result<String> {
        self.require_capability(Capability::TextExtract)?;
        Ok(self.engine.extract_all_text())
    }
pub fn extract_text(&self) -> Result<String> {
self.require_capability(Capability::TextExtract)?;
let count = self.engine.page_count();
let mut parts: Vec<String> = Vec::with_capacity(count);
for idx in 0..count {
let page_text = self.engine.extract_text(idx)?;
parts.push(page_text);
}
Ok(parts.join("\n\n"))
}
pub fn text_with_layout(&self) -> Result<Vec<TextBlock>> {
self.require_capability(Capability::TextExtractWithLayout)?;
let mut out = Vec::new();
for (idx, blocks) in self
.engine
.extract_all_text_blocks()
.into_iter()
.enumerate()
{
for block in blocks {
out.push(TextBlock::from_engine(block, idx + 1));
}
}
Ok(out)
}
    /// Borrow a single page by 1-based number; errors when out of range.
    pub fn page(&self, page_number: usize) -> Result<Page<'_>> {
        let total = self.engine.page_count();
        if page_number == 0 || page_number > total {
            return Err(internal_error(format!(
                "page index {page_number} out of range (document has {total} pages)",
            )));
        }
        Ok(Page {
            doc: self,
            // Stored 0-based internally.
            index: page_number - 1,
        })
    }
    /// Iterator over all pages in order.
    pub fn pages(&self) -> Pages<'_> {
        Pages { doc: self, next: 0 }
    }
    /// Document information (Info dictionary), merged from the engine's view
    /// plus date fields read directly from the lopdf side.
    pub fn metadata(&self) -> Metadata {
        let info = self.engine.info();
        let (creation, modification) = crate::metadata::read_info_dates(&self.lopdf);
        Metadata {
            title: info.title,
            author: info.author,
            subject: info.subject,
            keywords: crate::metadata::parse_keywords(info.keywords),
            producer: info.producer,
            creator: info.creator,
            creation_date: creation,
            modification_date: modification,
        }
    }
    /// Mutable metadata editor bound to this document.
    pub fn metadata_mut(&mut self) -> MetadataMut<'_> {
        MetadataMut::new(self)
    }
    /// Capability gate honouring this document's per-document license key.
    pub(crate) fn require_capability(&self, cap: Capability) -> Result<()> {
        license::require_capability_with_override(cap, self.license_key_override.as_deref())
    }
    /// Enforce `ProcessingLimits::max_image_pixels` by scanning every image
    /// XObject and rejecting any whose width*height exceeds the cap.
    /// No-op when no limits are set or the cap is `u64::MAX` (unlimited).
    fn check_image_pixel_limits(&self) -> Result<()> {
        let Some(ref limits) = self.processing_limits else {
            return Ok(());
        };
        if limits.max_image_pixels == u64::MAX {
            return Ok(());
        }
        use lopdf::Object;
        for obj in self.lopdf.objects.values() {
            // Accept plain dictionaries, streams, and one level of indirection.
            let dict = match obj {
                Object::Dictionary(d) => d,
                Object::Stream(stream) => &stream.dict,
                Object::Reference(id) => match self.lopdf.get_object(*id) {
                    Ok(Object::Dictionary(d)) => d,
                    Ok(Object::Stream(stream)) => &stream.dict,
                    _ => continue,
                },
                _ => continue,
            };
            // Only /Type /XObject with /Subtype /Image is of interest.
            let is_xobject = match dict.get(b"Type") {
                Ok(Object::Name(n)) => n.as_slice() == b"XObject",
                _ => false,
            };
            if !is_xobject {
                continue;
            }
            let is_image = match dict.get(b"Subtype") {
                Ok(Object::Name(n)) => n.as_slice() == b"Image",
                _ => false,
            };
            if !is_image {
                continue;
            }
            // Width/Height may themselves be indirect references. Images with
            // missing or non-positive dimensions are skipped, not rejected.
            let width = match dict.get(b"Width") {
                Ok(Object::Integer(v)) if *v > 0 => *v as u64,
                Ok(Object::Reference(id)) => match self.lopdf.get_object(*id) {
                    Ok(Object::Integer(v)) if *v > 0 => *v as u64,
                    _ => continue,
                },
                _ => continue,
            };
            let height = match dict.get(b"Height") {
                Ok(Object::Integer(v)) if *v > 0 => *v as u64,
                Ok(Object::Reference(id)) => match self.lopdf.get_object(*id) {
                    Ok(Object::Integer(v)) if *v > 0 => *v as u64,
                    _ => continue,
                },
                _ => continue,
            };
            // saturating_mul: absurd dimensions cap at u64::MAX instead of
            // wrapping, which still trips the limit check below.
            let pixels = width.saturating_mul(height);
            if pixels > limits.max_image_pixels {
                return Err(Error::ResourceLimitExceeded {
                    kind: crate::error::ResourceLimitKind::ImageTooLarge,
                    observed: pixels,
                    limit: limits.max_image_pixels,
                });
            }
        }
        Ok(())
    }
    /// Read-only access to the underlying lopdf document.
    pub(crate) fn lopdf(&self) -> &lopdf::Document {
        &self.lopdf
    }
    /// Mutable access to the underlying lopdf document. Callers that mutate
    /// through this must keep the engine in sync themselves (see
    /// `refresh_from_lopdf`).
    pub(crate) fn lopdf_mut(&mut self) -> &mut lopdf::Document {
        &mut self.lopdf
    }
    /// Build a `PdfDocument` by serialising a lopdf document and re-opening
    /// the bytes through the normal parse path.
    pub(crate) fn from_lopdf(mut lopdf_doc: lopdf::Document) -> Result<Self> {
        let mut buf = Vec::with_capacity(64 * 1024);
        lopdf_doc
            .save_to(&mut buf)
            .map_err(|source| Error::Io { source, path: None })?;
        Self::from_bytes(&buf)
    }
    /// Read-only snapshot of all AcroForm fields.
    pub fn form_fields(&self) -> Result<Vec<FormField>> {
        self.require_capability(Capability::AcroFormRead)?;
        Ok(crate::form::read_acroform_fields(&self.lopdf))
    }
    /// Mutable AcroForm editor; the document's license override is passed
    /// through so the editor can run its own capability checks.
    pub fn form_mut(&mut self) -> PdfFormMut<'_> {
        let license_override = self.license_key_override.as_deref();
        PdfFormMut::new(&mut self.lopdf, license_override)
    }
    /// Not yet implemented: always returns `MissingDependency`, after the
    /// capability check so licensing errors surface first.
    pub fn flatten_forms(&mut self) -> Result<()> {
        self.require_capability(Capability::AcroFormFlatten)?;
        Err(Error::MissingDependency {
            dep: "pdf-manip::flatten_forms",
            install_hint: "AcroForm flatten runtime tracked on #1223; lands in a 1.x MINOR. Use \
                pdf_manip::flatten_forms directly for now if you need the raw pipeline.",
        })
    }
    /// Apply a page decoration. Every current variant reports the missing
    /// watermark runtime, after the write-capability check so licensing
    /// errors take precedence.
    pub fn add_decoration(&mut self, decoration: PageDecoration) -> Result<()> {
        self.require_capability(Capability::PdfWrite)?;
        match decoration {
            PageDecoration::Watermark {
                text: _,
                options: _,
            } => Err(Error::MissingDependency {
                dep: "pdf-manip::watermark",
                install_hint:
                    "watermark runtime lands with Epic 2 #1223; consolidated surface is in place",
            }),
        }
    }
    /// Convenience wrapper building a watermark decoration.
    pub fn add_watermark(&mut self, text: &str, opts: WatermarkOptions) -> Result<()> {
        self.add_decoration(PageDecoration::watermark(text, opts))
    }
    /// Convert the document to DOCX and write it to `path`.
    /// Serialises the current (possibly mutated) state first via `to_bytes`.
    #[cfg(not(target_arch = "wasm32"))]
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(
            target = "pdfluent",
            skip(self),
            fields(path = %path.as_ref().display())
        )
    )]
    pub fn to_docx<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        self.require_capability(Capability::DocxExport)?;
        let pdf_bytes = self.to_bytes()?;
        let docx_bytes = pdf_docx::convert_pdf_bytes_to_docx(&pdf_bytes)
            .map_err(|e| internal_error(format!("docx conversion failed: {e}")))?;
        fs::write(path.as_ref(), docx_bytes).map_err(|source| Error::Io {
            source,
            path: Some(path.as_ref().to_path_buf()),
        })?;
        Ok(())
    }
    /// wasm32 stub: DOCX export requires the native converter.
    #[cfg(target_arch = "wasm32")]
    pub fn to_docx<P: AsRef<Path>>(&self, _path: P) -> Result<()> {
        Err(Error::UnsupportedOnWasm {
            operation: "to_docx",
        })
    }
    /// Render pages to image files on disk.
    ///
    /// `pattern` may contain a literal `{page}` placeholder; otherwise the
    /// page number is appended to the file stem (see `build_image_path`).
    #[cfg(not(target_arch = "wasm32"))]
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(
            target = "pdfluent",
            skip(self, opts),
            fields(pattern = %pattern.as_ref().display())
        )
    )]
    pub fn to_images<P: AsRef<Path>>(
        &self,
        pattern: P,
        opts: ToImagesOptions,
    ) -> Result<ToImagesReport> {
        use pdf_engine::render::RenderOptions;
        self.require_capability(Capability::RenderRaster)?;
        // Enforce image pixel caps up front so we fail before any file is written.
        self.check_image_pixel_limits()?;
        let total = self.engine.page_count();
        if total == 0 {
            return Ok(ToImagesReport { paths: Vec::new() });
        }
        // Validate the optional 1-based inclusive range; default is all pages.
        let (from, to) = match opts.pages {
            Some((f, t)) => {
                if f == 0 || t < f || t > total {
                    return Err(internal_error(format!(
                        "invalid page range {f}..={t} (document has {total} pages)",
                    )));
                }
                (f, t)
            }
            None => (1, total),
        };
        let render_opts = RenderOptions {
            dpi: opts.dpi as f64,
            ..Default::default()
        };
        let mut out_paths = Vec::with_capacity(to - from + 1);
        for page_idx_1b in from..=to {
            let rendered = self
                .engine
                // Engine pages are 0-based.
                .render_page(page_idx_1b - 1, &render_opts)
                .map_err(|e| {
                    use pdf_engine::EngineError;
                    // Preserve limit violations as typed errors; anything
                    // else becomes an internal error with page context.
                    if let EngineError::LimitExceeded(ref le) = e {
                        return Error::from(le.clone());
                    }
                    internal_error(format!("render page {page_idx_1b} failed: {e}"))
                })?;
            let path = build_image_path(pattern.as_ref(), page_idx_1b, opts.format.extension());
            encode_image(&rendered, opts.format, &path)?;
            out_paths.push(path);
        }
        Ok(ToImagesReport { paths: out_paths })
    }
    /// wasm32 stub: rasterisation requires the native renderer.
    #[cfg(target_arch = "wasm32")]
    pub fn to_images<P: AsRef<Path>>(
        &self,
        _pattern: P,
        _opts: ToImagesOptions,
    ) -> Result<ToImagesReport> {
        Err(Error::UnsupportedOnWasm {
            operation: "to_images",
        })
    }
    /// Run the selected optimisation passes in a fixed order (font
    /// subsetting, stream compression, stream deduplication, unused-object
    /// removal) and report what each pass achieved. The document is
    /// re-opened afterwards so the engine view matches the optimised bytes.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(target = "pdfluent", skip(self, opts))
    )]
    pub fn compress(&mut self, opts: CompressOptions) -> Result<CompressReport> {
        self.require_capability(Capability::PdfWrite)?;
        let mut report = CompressReport::default();
        if opts.subset_fonts {
            let subset = pdf_manip::font_subset::subset_fonts(&mut self.lopdf)
                .map_err(|e| internal_error(format!("font subsetting failed: {e:?}")))?;
            report.font_subset = Some(FontSubsetReport {
                fonts_processed: subset.fonts_processed,
                fonts_subsetted: subset.fonts_subsetted,
                bytes_saved: subset.bytes_saved,
            });
        }
        if opts.compress_streams {
            report.streams_compressed = pdf_manip::optimize::compress_streams(&mut self.lopdf)
                .map_err(|e| internal_error(format!("stream compression failed: {e:?}")))?;
        }
        if opts.deduplicate_streams {
            report.streams_deduplicated = pdf_manip::optimize::deduplicate_streams(&mut self.lopdf);
        }
        if opts.remove_unused {
            report.unused_removed = pdf_manip::optimize::remove_unused_objects(&mut self.lopdf);
        }
        self.refresh_from_lopdf()?;
        Ok(report)
    }
    /// Not yet implemented: returns `MissingDependency` after the capability check.
    pub fn linearize(&mut self) -> Result<()> {
        self.require_capability(Capability::PdfWrite)?;
        Err(Error::MissingDependency {
            dep: "pdf-manip::linearize",
            install_hint: "linearization not yet implemented; tracked as a 1.1 follow-up to #1224",
        })
    }
    /// Subset embedded fonts, then re-open so the engine sees the new bytes.
    pub fn subset_fonts(&mut self) -> Result<FontSubsetReport> {
        self.require_capability(Capability::PdfWrite)?;
        let subset = pdf_manip::font_subset::subset_fonts(&mut self.lopdf)
            .map_err(|e| internal_error(format!("font subsetting failed: {e:?}")))?;
        self.refresh_from_lopdf()?;
        Ok(FontSubsetReport {
            fonts_processed: subset.fonts_processed,
            fonts_subsetted: subset.fonts_subsetted,
            bytes_saved: subset.bytes_saved,
        })
    }
    /// Not yet implemented: returns `MissingDependency` after the capability check.
    pub fn embed_font(&mut self, _font_data: &[u8], _name: &str) -> Result<()> {
        self.require_capability(Capability::PdfWrite)?;
        Err(Error::MissingDependency {
            dep: "pdf-manip::embed_font",
            install_hint:
                "arbitrary font embedding not yet implemented; tracked as a 1.1 follow-up to #1224",
        })
    }
    /// Insert a raster image (JPEG/PNG) onto an existing page, then re-open
    /// so the engine view includes the new image XObject.
    pub fn insert_image(&mut self, img: ImageInsert) -> Result<ImageInsertReport> {
        self.require_capability(Capability::PdfWrite)?;
        let total = self.engine.page_count();
        if img.page == 0 || img.page > total {
            return Err(internal_error(format!(
                "page index {} out of range (document has {} pages)",
                img.page, total,
            )));
        }
        let format = match img.format {
            InsertImageFormat::Jpeg => pdf_manip::image_insert::ImageFormat::Jpeg,
            InsertImageFormat::Png => pdf_manip::image_insert::ImageFormat::Png,
        };
        // NOTE(review): `img.page` is 1-based here and is forwarded
        // unchanged as `page_index` — confirm pdf_manip also expects a
        // 1-based index.
        let insert = pdf_manip::image_insert::ImageInsert {
            image_data: img.bytes,
            format,
            x: img.x,
            y: img.y,
            width: img.width,
            height: img.height,
            page_index: img.page as u32,
            opacity: img.opacity,
        };
        let res = pdf_manip::image_insert::insert_image(&mut self.lopdf, &insert)
            .map_err(|e| internal_error(format!("image insertion failed: {e:?}")))?;
        self.refresh_from_lopdf()?;
        Ok(ImageInsertReport {
            pixel_width: res.pixel_width,
            pixel_height: res.pixel_height,
            resource_name: res.resource_name,
        })
    }
    /// Rotate a single page clockwise by the given quarter-turn amount.
    /// NOTE(review): only the lopdf side is mutated — there is no
    /// `refresh_from_lopdf` here, so the engine view (render/extract) keeps
    /// the old orientation until the next refresh. Confirm intended.
    pub fn rotate_page(&mut self, page: usize, rotation: Rotation) -> Result<()> {
        self.require_capability(Capability::PageOps)?;
        let total = self.engine.page_count();
        if page == 0 || page > total {
            return Err(internal_error(format!(
                "page index {page} out of range (document has {total} pages)",
            )));
        }
        let degrees: i64 = match rotation {
            Rotation::Clockwise90 => 90,
            Rotation::Clockwise180 => 180,
            Rotation::Clockwise270 => 270,
        };
        pdf_manip::pages::rotate_page(&mut self.lopdf, page as u32, degrees)?;
        Ok(())
    }
    /// Encrypt the document with the given passwords and permissions.
    /// An absent owner password falls back to the user password; an absent
    /// user password means an empty one.
    ///
    /// NOTE(review): the serialised output of `encrypt_and_save` is
    /// discarded into `io::sink()`, so only the in-place mutation of
    /// `self.lopdf` is kept — and, unlike `decrypt`, there is no
    /// `refresh_from_lopdf` here (re-parsing the now-encrypted bytes would
    /// require the password). The engine view therefore still reflects the
    /// pre-encryption document; confirm this is intended.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(target = "pdfluent", skip(self, opts))
    )]
    pub fn encrypt(&mut self, opts: EncryptOptions) -> Result<()> {
        self.require_capability(Capability::EncryptionWrite)?;
        let user_pw_bytes = opts.user_password.unwrap_or_default().into_bytes();
        let owner_pw_bytes = opts
            .owner_password
            .map(String::into_bytes)
            .unwrap_or_else(|| user_pw_bytes.clone());
        let config = pdf_manip::encrypt::EncryptConfig {
            user_password: user_pw_bytes,
            owner_password: owner_pw_bytes,
            algorithm: map_encryption_algorithm(opts.algorithm),
            permissions: map_permissions(opts.permissions),
        };
        let mut sink = std::io::sink();
        pdf_manip::encrypt::encrypt_and_save(&mut self.lopdf, &config, &mut sink)?;
        Ok(())
    }
    /// Decrypt in place with `password`, then re-open both representations
    /// from the decrypted bytes.
    #[cfg_attr(
        feature = "tracing",
        // `password` is deliberately NOT in fields — it's a secret.
        tracing::instrument(target = "pdfluent", skip(self, password))
    )]
    pub fn decrypt(&mut self, password: &str) -> Result<()> {
        self.require_capability(Capability::EncryptionRead)?;
        pdf_manip::encrypt::decrypt(&mut self.lopdf, password)?;
        self.refresh_from_lopdf()
    }
    /// Sign the current document bytes with `signer` and replace `self`
    /// with the parsed signed result. The public signer trait is bridged
    /// onto pdf_sign's trait via `PdfSignerAdapter`.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(target = "pdfluent", skip_all)
    )]
    pub fn sign(
        &mut self,
        signer: &dyn crate::signer::PdfSigner,
        opts: crate::signer::SignOptions,
    ) -> Result<()> {
        self.require_capability(Capability::DigitalSignatureSign)?;
        let pdf_bytes = self.to_bytes()?;
        let inner_opts = map_sign_options(&opts);
        let adapter = PdfSignerAdapter { inner: signer };
        let signed = pdf_sign::sign_pdf(&pdf_bytes, &adapter, &inner_opts)?;
        *self = Self::from_bytes(&signed)?;
        Ok(())
    }
pub fn signatures(&self) -> Result<Vec<crate::signer::SignatureInfo>> {
self.require_capability(Capability::DigitalSignatureVerify)?;
let pdf = self.engine.pdf();
let fields = pdf_sign::signature_fields(pdf);
let mut out = Vec::with_capacity(fields.len());
for f in fields {
out.push(crate::signer::SignatureInfo {
field_name: f.field_name.clone(),
signer_name: f.sig.signer_name().unwrap_or_default(),
timestamp: f.sig.signing_time(),
profile: None, });
}
Ok(out)
}
    /// Validate every signature field and fold the results into a report,
    /// translating pdf_sign's status enum into the public one.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(target = "pdfluent", skip(self))
    )]
    pub fn verify_signatures(&self) -> Result<crate::signer::SignatureValidationReport> {
        self.require_capability(Capability::DigitalSignatureVerify)?;
        let pdf = self.engine.pdf();
        let results = pdf_sign::validate_signatures(pdf);
        let validations = results
            .into_iter()
            .map(|r| crate::signer::SignatureValidation {
                info: crate::signer::SignatureInfo {
                    field_name: r.field_name,
                    signer_name: r.signer.unwrap_or_default(),
                    timestamp: r.timestamp,
                    profile: None,
                },
                status: match r.status {
                    pdf_sign::ValidationStatus::Valid => crate::signer::SignatureStatus::Valid,
                    pdf_sign::ValidationStatus::Invalid(reason) => {
                        crate::signer::SignatureStatus::Invalid { reason }
                    }
                    pdf_sign::ValidationStatus::Unknown(reason) => {
                        crate::signer::SignatureStatus::Unknown { reason }
                    }
                },
            })
            .collect();
        Ok(crate::signer::SignatureValidationReport::from_validations(
            validations,
        ))
    }
    /// Search for `text` and redact every match (solid black fill, no
    /// overlay text), then re-open so extraction no longer sees the content.
    #[cfg_attr(
        feature = "tracing",
        // `text` is the redaction query — frequently PII/secrets the
        // caller is trying to scrub. MUST be skipped so the tracing
        // span records only `text_len` (low-cardinality, non-secret).
        tracing::instrument(
            target = "pdfluent",
            skip(self, text, opts),
            fields(text_len = text.len())
        )
    )]
    pub fn redact(&mut self, text: &str, opts: crate::redact::RedactOptions) -> Result<()> {
        self.require_capability(Capability::Redaction)?;
        let search_opts = pdf_redact::RedactSearchOptions {
            case_sensitive: opts.case_sensitive,
            regex: opts.regex,
            // Fill is fixed to black; not configurable through RedactOptions.
            fill_color: [0.0, 0.0, 0.0],
            pages: opts
                .on_pages
                .as_ref()
                .map(|v| v.iter().map(|p| *p as u32).collect()),
            overlay_text: None,
        };
        pdf_redact::search_and_redact(&mut self.lopdf, text, &search_opts)?;
        self.refresh_from_lopdf()
    }
    /// Redact a rectangular region on one page (black fill), then re-open.
    /// NOTE(review): `page` is forwarded as-is with no range check — confirm
    /// whether pdf_redact expects it 0- or 1-based.
    pub fn redact_region(&mut self, page: usize, rect: [f64; 4]) -> Result<()> {
        self.require_capability(Capability::Redaction)?;
        let mut redactor = pdf_redact::Redactor::new();
        redactor.mark(pdf_redact::RedactionArea {
            page: page as u32,
            rect,
            fill_color: [0.0, 0.0, 0.0],
            overlay_text: None,
        });
        redactor.apply(&mut self.lopdf)?;
        self.refresh_from_lopdf()
    }
    /// Re-serialise `self.lopdf` and rebuild `self` from the fresh bytes so
    /// the engine and lopdf views stay in sync after a mutation.
    /// The clone exists because lopdf's `save_to` takes `&mut self` (and may
    /// mutate during serialisation); cloning keeps `self.lopdf` pristine if
    /// saving fails partway.
    fn refresh_from_lopdf(&mut self) -> Result<()> {
        let mut buf = Vec::with_capacity(64 * 1024);
        let mut clone = self.lopdf.clone();
        clone
            .save_to(&mut buf)
            .map_err(|source| Error::Io { source, path: None })?;
        *self = Self::from_bytes(&buf)?;
        Ok(())
    }
#[cfg(feature = "pdfa")]
pub fn validate_pdfa(
&self,
profile: crate::compliance::PdfAProfile,
) -> Result<crate::compliance::PdfAValidationReport> {
self.require_capability(Capability::PdfaValidate)?;
let raw = pdf_compliance::validate_pdfa(self.engine.pdf(), profile.into());
Ok(crate::compliance::report_from_compliance(raw, profile))
}
pub fn split_pages(&self) -> Result<Vec<PdfDocument>> {
self.require_capability(Capability::PageOps)?;
let split = pdf_manip::pages::split_per_page(&self.lopdf)?;
let mut out = Vec::with_capacity(split.len());
for lopdf_doc in split {
out.push(Self::from_lopdf(lopdf_doc)?);
}
Ok(out)
}
    /// Copy a 1-based page range (any `RangeBounds` form) into a new document.
    pub fn extract_pages<R: std::ops::RangeBounds<usize>>(&self, range: R) -> Result<PdfDocument> {
        self.require_capability(Capability::PageOps)?;
        let total = self.engine.page_count();
        let (start, end) = normalise_page_range(&range, total)?;
        let pages: Vec<u32> = (start..=end).map(|p| p as u32).collect();
        let lopdf_doc = pdf_manip::pages::extract_pages(&self.lopdf, &pages)?;
        Self::from_lopdf(lopdf_doc)
    }
    /// Save to `path` with default options (refuses to overwrite).
    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        self.save_with(path, SaveOptions::new())
    }
    /// Save to `path`. Unless `overwrite` is set, an existing target file is
    /// an error.
    /// NOTE(review): the exists-then-write sequence is racy (TOCTOU); the
    /// guard is a convenience, not a security boundary.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(
            target = "pdfluent",
            skip(self, opts),
            fields(path = %path.as_ref().display(), overwrite = opts.overwrite)
        )
    )]
    pub fn save_with<P: AsRef<Path>>(&self, path: P, opts: SaveOptions) -> Result<()> {
        self.require_capability(Capability::PdfWrite)?;
        let path_ref = path.as_ref();
        if !opts.overwrite && path_ref.exists() {
            return Err(Error::Io {
                source: std::io::Error::new(
                    std::io::ErrorKind::AlreadyExists,
                    "target file exists; pass `SaveOptions::new().with_overwrite(true)` to clobber",
                ),
                path: Some(path_ref.to_path_buf()),
            });
        }
        let bytes = self.to_bytes()?;
        fs::write(path_ref, bytes).map_err(|source| Error::Io {
            source,
            path: Some(path_ref.to_path_buf()),
        })?;
        Ok(())
    }
    /// Serialise the document to memory. Clones the lopdf document because
    /// lopdf's `save_to` needs `&mut` while this method takes `&self`.
    #[cfg_attr(
        feature = "tracing",
        tracing::instrument(target = "pdfluent", skip(self))
    )]
    pub fn to_bytes(&self) -> Result<Vec<u8>> {
        self.require_capability(Capability::PdfWrite)?;
        let mut buf = Vec::with_capacity(64 * 1024);
        let mut clone = self.lopdf.clone();
        clone
            .save_to(&mut buf)
            .map_err(|source| Error::Io { source, path: None })?;
        Ok(buf)
    }
    /// Serialise and write the whole document to `writer`.
    pub fn write_to<W: Write>(&self, mut writer: W) -> Result<()> {
        self.require_capability(Capability::PdfWrite)?;
        let bytes = self.to_bytes()?;
        writer
            .write_all(&bytes)
            .map_err(|source| Error::Io { source, path: None })?;
        Ok(())
    }
}
/// Derive the output path for a rendered page.
///
/// A literal `{page}` placeholder in `pattern` is substituted verbatim;
/// otherwise `<stem>_<page>.<ext>` is synthesised next to the pattern,
/// preferring the pattern's own extension over the format default `ext`.
#[cfg(not(target_arch = "wasm32"))]
fn build_image_path(pattern: &Path, page_1b: usize, ext: &str) -> std::path::PathBuf {
    use std::path::PathBuf;
    let raw = pattern.to_string_lossy();
    if raw.contains("{page}") {
        return PathBuf::from(raw.replace("{page}", &page_1b.to_string()));
    }
    let dir = match pattern.parent() {
        Some(parent) => parent.to_path_buf(),
        None => PathBuf::new(),
    };
    let stem = pattern
        .file_stem()
        .map_or_else(String::new, |s| s.to_string_lossy().into_owned());
    let suffix = pattern
        .extension()
        .map_or_else(|| ext.to_string(), |e| e.to_string_lossy().into_owned());
    dir.join(format!("{stem}_{page_1b}.{suffix}"))
}
/// Dispatch a rendered page to the right encoder, rejecting any pixel
/// format other than RGBA8 (the only layout the encoders below handle).
#[cfg(not(target_arch = "wasm32"))]
fn encode_image(
    page: &pdf_engine::render::RenderedPage,
    format: crate::parity::ImageFormat,
    path: &Path,
) -> Result<()> {
    use crate::parity::ImageFormat as Fmt;
    use pdf_engine::render::PixelFormat;
    if !matches!(page.pixel_format, PixelFormat::Rgba8) {
        return Err(internal_error(format!(
            "unexpected pixel format {:?} from renderer",
            page.pixel_format,
        )));
    }
    match format {
        Fmt::Png => encode_png(page.width, page.height, &page.pixels, path),
        Fmt::Jpeg => encode_jpeg(page.width, page.height, &page.pixels, path),
    }
}
/// Write RGBA8 pixels as an 8-bit RGBA PNG at `path`.
#[cfg(not(target_arch = "wasm32"))]
fn encode_png(width: u32, height: u32, rgba: &[u8], path: &Path) -> Result<()> {
    let file = fs::File::create(path).map_err(|source| Error::Io {
        source,
        path: Some(path.to_path_buf()),
    })?;
    // Buffered writer: the encoder issues many small writes.
    let w = std::io::BufWriter::new(file);
    let mut encoder = png::Encoder::new(w, width, height);
    encoder.set_color(png::ColorType::Rgba);
    encoder.set_depth(png::BitDepth::Eight);
    let mut writer = encoder
        .write_header()
        .map_err(|e| internal_error(format!("png header failed: {e}")))?;
    writer
        .write_image_data(rgba)
        .map_err(|e| internal_error(format!("png write failed: {e}")))?;
    Ok(())
}
/// Write RGBA8 pixels as a JPEG (quality 90) at `path`.
/// The alpha channel is simply dropped, not composited.
/// NOTE(review): transparent areas keep their raw RGB values — confirm the
/// renderer always emits opaque backgrounds.
#[cfg(not(target_arch = "wasm32"))]
fn encode_jpeg(width: u32, height: u32, rgba: &[u8], path: &Path) -> Result<()> {
    // Strip alpha: RGBA -> RGB.
    let mut rgb = Vec::with_capacity(rgba.len() / 4 * 3);
    for chunk in rgba.chunks_exact(4) {
        rgb.extend_from_slice(&chunk[..3]);
    }
    let file = fs::File::create(path).map_err(|source| Error::Io {
        source,
        path: Some(path.to_path_buf()),
    })?;
    let w = std::io::BufWriter::new(file);
    use image::ImageEncoder;
    let encoder = image::codecs::jpeg::JpegEncoder::new_with_quality(w, 90);
    encoder
        .write_image(&rgb, width, height, image::ExtendedColorType::Rgb8)
        .map_err(|e| internal_error(format!("jpeg encoding failed: {e}")))?;
    Ok(())
}
/// A parsed PDF version number (e.g. 1.7, 2.0).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PdfVersion {
    pub major: u8,
    pub minor: u8,
}
impl PdfVersion {
    /// Parse a version string.
    ///
    /// Accepts a bare version ("1.7"), a prefixed one ("PDF-1.7"), and —
    /// new — the raw header form ("%PDF-1.7") plus surrounding whitespace.
    /// Extra dotted components after the minor are ignored. Returns `None`
    /// when major or minor is missing or not a `u8`.
    pub fn parse(s: &str) -> Option<Self> {
        // Normalise header-style inputs before splitting on '.'.
        let normalised = s.trim().trim_start_matches('%').trim_start_matches("PDF-");
        let mut parts = normalised.split('.');
        let major: u8 = parts.next()?.parse().ok()?;
        let minor: u8 = parts.next()?.parse().ok()?;
        Some(Self { major, minor })
    }
}
impl std::fmt::Display for PdfVersion {
    /// Render as "major.minor", e.g. "1.7".
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}", self.major, self.minor)
    }
}
/// A positioned text block: extracted text plus its page-space bounding box.
#[derive(Debug, Clone)]
pub struct TextBlock {
    pub text: String,
    // [x_min, y_min, x_max, y_max], the union of all span boxes.
    pub bbox: [f64; 4],
    // 1-based page number.
    pub page: usize,
}
impl TextBlock {
    /// Convert an engine block, computing the bbox as the union of its spans.
    fn from_engine(engine_block: pdf_engine::TextBlock, page_1_based: usize) -> Self {
        // Start from inverted infinities so the first span initialises the box.
        let mut x_min = f64::INFINITY;
        let mut y_min = f64::INFINITY;
        let mut x_max = f64::NEG_INFINITY;
        let mut y_max = f64::NEG_INFINITY;
        for span in &engine_block.spans {
            x_min = x_min.min(span.x);
            y_min = y_min.min(span.y);
            x_max = x_max.max(span.x + span.width);
            y_max = y_max.max(span.y + span.height);
        }
        // A block with no spans leaves the infinities untouched; collapse to
        // a zero box rather than emitting non-finite coordinates.
        if !x_min.is_finite() {
            x_min = 0.0;
            y_min = 0.0;
            x_max = 0.0;
            y_max = 0.0;
        }
        let text = engine_block.text();
        Self {
            text,
            bbox: [x_min, y_min, x_max, y_max],
            page: page_1_based,
        }
    }
}
/// Borrowed view of a single page of a [`PdfDocument`].
pub struct Page<'a> {
    doc: &'a PdfDocument,
    // 0-based page index.
    index: usize,
}
impl Page<'_> {
    /// Extract this page's text.
    pub fn text(&self) -> Result<String> {
        self.doc.require_capability(Capability::TextExtract)?;
        let text = self.doc.engine.extract_text(self.index)?;
        Ok(text)
    }
    /// Crop-box width/height in points; (0.0, 0.0) when geometry is
    /// unavailable for this page.
    pub fn dimensions(&self) -> (f64, f64) {
        match self.doc.engine.page_geometry(self.index) {
            Ok(geom) => (geom.crop_box.width(), geom.crop_box.height()),
            Err(_) => (0.0, 0.0),
        }
    }
    /// 1-based page number.
    pub fn number(&self) -> usize {
        self.index + 1
    }
}
/// Borrowing iterator over a document's pages, yielded in order.
pub struct Pages<'a> {
    doc: &'a PdfDocument,
    next: usize,
}
impl<'a> Iterator for Pages<'a> {
    type Item = Page<'a>;
    fn next(&mut self) -> Option<Self::Item> {
        let index = self.next;
        // Re-reads the live page count each call rather than caching it.
        if index >= self.doc.engine.page_count() {
            return None;
        }
        self.next = index + 1;
        Some(Page {
            doc: self.doc,
            index,
        })
    }
}
/// Map the public encryption-algorithm enum onto pdf_manip's equivalent.
fn map_encryption_algorithm(
    alg: crate::encrypt::EncryptionAlgorithm,
) -> pdf_manip::encrypt::EncryptionAlgorithm {
    use crate::encrypt::EncryptionAlgorithm as Ours;
    use pdf_manip::encrypt::EncryptionAlgorithm as Theirs;
    match alg {
        Ours::Aes128 => Theirs::Aes128,
        Ours::Aes256 => Theirs::Aes256,
    }
}
/// Map the public permission flags onto pdf_manip's (differently named) fields.
fn map_permissions(perms: crate::encrypt::Permissions) -> pdf_manip::encrypt::Permissions {
    pdf_manip::encrypt::Permissions {
        print: perms.print,
        modify_contents: perms.modify,
        extract_content: perms.copy,
        modify_annotations: perms.annotate,
        fill_forms: perms.fill_forms,
        extract_for_accessibility: perms.extract_accessibility,
        assemble_document: perms.assemble,
        print_high_quality: perms.print_high_quality,
    }
}
/// Translate public signing options into pdf_sign's options.
///
/// Every PAdES profile currently maps to the same ETSI CAdES-detached
/// SubFilter; the arms are kept explicit so adding a profile variant forces
/// a compile-time review here.
fn map_sign_options(opts: &crate::signer::SignOptions) -> pdf_sign::SignOptions {
    let sub_filter = match opts.profile {
        crate::signer::PadesProfile::BasicSignature => pdf_sign::SubFilter::EtsiCadesDetached,
        crate::signer::PadesProfile::Timestamped => pdf_sign::SubFilter::EtsiCadesDetached,
        crate::signer::PadesProfile::LongTerm => pdf_sign::SubFilter::EtsiCadesDetached,
        crate::signer::PadesProfile::LongTermArchive => pdf_sign::SubFilter::EtsiCadesDetached,
    };
    pdf_sign::SignOptions {
        reason: opts.reason.clone(),
        location: opts.location.clone(),
        contact: opts.contact_info.clone(),
        field_name: opts.field_name.clone(),
        visible_rect: opts.visible_rect.map(|(page, rect)| (page as u32, rect)),
        sub_filter,
        certification: None,
        // Space reserved in the file for the CMS signature blob (8 KiB).
        placeholder_size: 8192,
    }
}
/// Bridges the public `PdfSigner` trait onto pdf_sign's signer trait.
struct PdfSignerAdapter<'a> {
    inner: &'a dyn crate::signer::PdfSigner,
}
impl<'a> pdf_sign::PdfSigner for PdfSignerAdapter<'a> {
    // Delegate, flattening the inner error into pdf_sign's error type.
    fn sign(&self, data: &[u8]) -> std::result::Result<Vec<u8>, pdf_sign::SignError> {
        self.inner
            .sign(data)
            .map_err(|e| pdf_sign::SignError::SigningFailed(e.to_string()))
    }
    fn certificate_chain_der(&self) -> &[Vec<u8>] {
        self.inner.certificate_chain()
    }
    fn digest_algorithm(&self) -> pdf_sign::DigestAlgorithm {
        // Fixed at SHA-256 regardless of the inner signer.
        pdf_sign::DigestAlgorithm::Sha256
    }
    fn signature_algorithm_oid(&self) -> &[u8] {
        // DER-encoded OID 1.2.840.113549.1.1.1 (rsaEncryption).
        // NOTE(review): hard-coded — wrong for ECDSA/Ed25519 signers;
        // confirm only RSA signers are expected here.
        &[0x2A, 0x86, 0x48, 0x86, 0xF7, 0x0D, 0x01, 0x01, 0x01]
    }
}
/// Convert an arbitrary `RangeBounds` over 1-based page numbers into a
/// validated inclusive `(start, end)` pair within `1..=total_pages`.
fn normalise_page_range<R: std::ops::RangeBounds<usize>>(
    range: &R,
    total_pages: usize,
) -> Result<(usize, usize)> {
    use std::ops::Bound;
    let start = match range.start_bound() {
        Bound::Unbounded => 1,
        Bound::Included(&n) => n,
        Bound::Excluded(&n) => n + 1,
    };
    let end = match range.end_bound() {
        Bound::Unbounded => total_pages,
        Bound::Included(&n) => n,
        // Exclusive end 0 saturates to 0, which fails the check below.
        Bound::Excluded(&n) => n.saturating_sub(1),
    };
    let in_bounds = start >= 1 && start <= total_pages && start <= end && end <= total_pages;
    if !in_bounds {
        return Err(internal_error(format!(
            "page range {start}..={end} is out of bounds (document has {total_pages} pages)",
        )));
    }
    Ok((start, end))
}
#[cfg(test)]
mod tests {
use super::{load_lopdf_from_shared_bytes, open_engine_from_shared_bytes, SaveOptions};
use std::sync::Arc;
    /// Smallest well-formed one-page PDF usable as a fixture: a catalog ->
    /// pages -> single 72x72 pt page holding an empty text content stream.
    fn minimal_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};
        let mut doc = Document::with_version("1.4");
        // Ids are reserved first so the objects can cross-reference.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // "BT ET" is an empty text object — valid content, draws nothing.
        let content = Stream::new(dictionary! {}, b"BT ET".to_vec());
        let content_id = doc.add_object(content);
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Page".to_vec()),
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![
                    Object::Integer(0),
                    Object::Integer(0),
                    Object::Integer(72),
                    Object::Integer(72),
                ]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Catalog".to_vec()),
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));
        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("build minimal fixture");
        bytes
    }
#[test]
fn from_bytes_with_shares_single_owned_input_buffer() {
let shared = Arc::new(minimal_pdf_bytes());
let shared_ptr = shared.as_slice().as_ptr();
let shared_len = shared.len();
let engine = open_engine_from_shared_bytes(shared.clone(), None, None)
.expect("open engine from shared bytes");
assert_eq!(
engine.pdf().data().as_ref().as_ptr(),
shared_ptr,
"pdf_engine should retain the same shared buffer rather than a second Vec clone",
);
assert_eq!(engine.pdf().data().as_ref().len(), shared_len);
let owners_after_engine = Arc::strong_count(&shared);
assert!(
owners_after_engine >= 2,
"pdf_engine should retain shared ownership of the input buffer",
);
let _lopdf = load_lopdf_from_shared_bytes(&shared, None)
.expect("load lopdf from borrowed shared bytes");
assert_eq!(
Arc::strong_count(&shared),
owners_after_engine,
"lopdf should parse from a borrowed slice of the same shared buffer",
);
}
#[test]
fn save_options_default_overwrite_is_false() {
assert!(!SaveOptions::default().overwrite);
}
#[cfg(feature = "pdfa")]
#[test]
fn validate_pdfa_returns_report_for_non_conforming_doc() {
use super::PdfDocument;
use crate::compliance::PdfAProfile;
let bytes = minimal_pdf_bytes();
let doc = PdfDocument::from_bytes(&bytes).expect("parse minimal fixture");
let report = doc
.validate_pdfa(PdfAProfile::A2b)
.expect("validate_pdfa should not return an error");
assert!(
!report.is_compliant(),
"a minimal synthetic PDF must not pass PDF/A-2b validation"
);
}
#[test]
fn extract_text_returns_ok_for_minimal_pdf() {
use super::PdfDocument;
let bytes = minimal_pdf_bytes();
let doc = PdfDocument::from_bytes(&bytes).expect("parse minimal fixture");
let text = doc.extract_text().expect("extract_text should not fail");
let _ = text; }
#[test]
fn extract_text_joins_pages_with_double_newline() {
use super::PdfDocument;
use lopdf::{dictionary, Document, Object, Stream};
let mut doc = Document::with_version("1.4");
let pages_id = doc.new_object_id();
let make_page = |doc: &mut Document, pages_id| {
let content = Stream::new(dictionary! {}, b"BT ET".to_vec());
let content_id = doc.add_object(content);
let page_id = doc.new_object_id();
doc.objects.insert(
page_id,
Object::Dictionary(dictionary! {
"Type" => Object::Name(b"Page".to_vec()),
"Parent" => Object::Reference(pages_id),
"MediaBox" => Object::Array(vec![
Object::Integer(0),
Object::Integer(0),
Object::Integer(72),
Object::Integer(72),
]),
"Contents" => Object::Reference(content_id),
}),
);
page_id
};
let page1_id = make_page(&mut doc, pages_id);
let page2_id = make_page(&mut doc, pages_id);
doc.objects.insert(
pages_id,
Object::Dictionary(dictionary! {
"Type" => Object::Name(b"Pages".to_vec()),
"Kids" => Object::Array(vec![
Object::Reference(page1_id),
Object::Reference(page2_id),
]),
"Count" => Object::Integer(2),
}),
);
let catalog_id = doc.new_object_id();
doc.objects.insert(
catalog_id,
Object::Dictionary(dictionary! {
"Type" => Object::Name(b"Catalog".to_vec()),
"Pages" => Object::Reference(pages_id),
}),
);
doc.trailer.set("Root", Object::Reference(catalog_id));
let mut bytes = Vec::new();
doc.save_to(&mut bytes).expect("build two-page fixture");
let pdf_doc = PdfDocument::from_bytes(&bytes).expect("parse two-page fixture");
assert_eq!(pdf_doc.page_count(), 2, "fixture must have 2 pages");
let text = pdf_doc
.extract_text()
.expect("extract_text on two-page doc");
assert_eq!(
text.matches("\n\n").count(),
1,
"two pages joined with '\\n\\n' must produce exactly one separator; got {text:?}",
);
}
#[test]
fn extract_text_capability_gate_error_is_well_formed() {
use crate::capability::Capability;
use crate::error::Error;
use crate::tier::Tier;
let err = Error::FeatureNotInTier {
capability: Capability::TextExtract,
current_tier: Tier::Trial,
required_tier: Tier::Developer,
};
assert_eq!(
err.code(),
"E-LICENSE-FEATURE-NOT-IN-TIER",
"stable error code must match RFC §5.4",
);
let rendered = format!("{err}");
assert!(
rendered.contains("TextExtract"),
"Display must mention the missing capability; got {rendered:?}",
);
}
}