use crate::error::{PdfError, Result};
use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfString};
use crate::parser::PdfReader;
use std::collections::HashMap;
use std::io::Cursor;
/// Fills AcroForm fields by appending an incremental update to an existing PDF.
///
/// The filler only borrows the base document, so it is cheap to copy around.
#[derive(Clone, Copy)]
pub struct IncrementalFormFiller<'a> {
    /// Complete bytes of the base PDF that the new revision is appended to.
    base_bytes: &'a [u8],
}

impl std::fmt::Debug for IncrementalFormFiller<'_> {
    /// Prints only the buffer length; dumping a whole PDF byte-by-byte would flood logs.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("IncrementalFormFiller")
            .field("base_len", &self.base_bytes.len())
            .finish()
    }
}
impl<'a> IncrementalFormFiller<'a> {
pub fn new(base_bytes: &'a [u8]) -> Self {
Self { base_bytes }
}
pub fn fill(&self, field_name: &str, value: &str) -> Result<Vec<u8>> {
self.fill_many(&[(field_name, value)])
}
pub fn fill_many(&self, fields: &[(&str, &str)]) -> Result<Vec<u8>> {
fill_many_impl(self.base_bytes, fields)
}
}
/// Recursion limit for `collect_fields`: reaching this depth is reported as an
/// invalid-structure error instead of descending further.
const MAX_FIELD_DEPTH: u8 = 32;
/// Builds a map from fully qualified field name to the field's object reference.
///
/// The catalog's `/AcroForm` entry may be an inline dictionary or a reference to
/// one. Its `/Fields` entry may be an inline array or a reference to an array;
/// only indirect references inside that array are followed, each one being
/// walked by `collect_fields`.
///
/// # Errors
/// Returns `PdfError::InvalidStructure` when the catalog cannot be read, when
/// `/AcroForm` is missing or not a dictionary, or when walking a field fails.
/// A `/Fields` entry that is missing, unresolvable or not an array yields an
/// empty map rather than an error.
fn resolve_acroform_fields(
    reader: &mut PdfReader<Cursor<&[u8]>>,
) -> Result<HashMap<String, (u32, u16)>> {
    let acroform_dict = {
        let catalog = reader
            .catalog()
            .map_err(|e| PdfError::InvalidStructure(format!("read catalog: {e}")))?
            .clone();
        match catalog.get("AcroForm") {
            Some(PdfObject::Reference(n, g)) => reader
                .get_object(*n, *g)
                .map_err(|e| PdfError::InvalidStructure(format!("resolve /AcroForm: {e}")))?
                .as_dict()
                .cloned()
                .ok_or_else(|| {
                    PdfError::InvalidStructure("/AcroForm is not a dictionary".to_string())
                })?,
            Some(PdfObject::Dictionary(d)) => d.clone(),
            _ => {
                return Err(PdfError::InvalidStructure(
                    "document has no /AcroForm".to_string(),
                ))
            }
        }
    };

    let field_refs: Vec<(u32, u16)> = match acroform_dict.get("Fields") {
        Some(PdfObject::Array(arr)) => arr.0.iter().filter_map(|o| o.as_reference()).collect(),
        // The array itself may live in its own indirect object; without this arm
        // such documents looked as if they had no fields at all.
        Some(PdfObject::Reference(n, g)) => match reader.get_object(*n, *g) {
            Ok(PdfObject::Array(arr)) => {
                arr.0.iter().filter_map(|o| o.as_reference()).collect()
            }
            _ => Vec::new(),
        },
        _ => Vec::new(),
    };

    let mut out = HashMap::new();
    for (n, g) in field_refs {
        collect_fields(reader, (n, g), "", &mut out, 0)?;
    }
    Ok(out)
}
/// Recursively walks one node of the field tree and records terminal fields.
///
/// * `node_ref` - object reference of the node to visit.
/// * `parent_prefix` - fully qualified name of the parent, or `""` at the root.
/// * `out` - receives `full name -> object reference` for every terminal field.
/// * `depth` - current recursion depth, checked against `MAX_FIELD_DEPTH`.
///
/// A node's full name is `parent_prefix.T`, or just `T` at the root, or the
/// parent's name when the node has no `/T`. A node is terminal when it has no
/// `/Kids`, or when none of its kids carries a `/T` entry; a terminal node is
/// recorded only if it has its own `/T`. Otherwise the walk descends into the kids.
///
/// `/T` values starting with the bytes `FE FF` are decoded as UTF-16BE; any
/// other value is read as lossy UTF-8, so non-ASCII single-byte encoded names
/// may still not round-trip.
///
/// # Errors
/// Returns `PdfError::InvalidStructure` when the depth limit is reached, when
/// the node cannot be resolved, or when it is not a dictionary.
fn collect_fields(
    reader: &mut PdfReader<Cursor<&[u8]>>,
    node_ref: (u32, u16),
    parent_prefix: &str,
    out: &mut HashMap<String, (u32, u16)>,
    depth: u8,
) -> Result<()> {
    /// Decodes a `/T` value: UTF-16BE after an `FE FF` marker, lossy UTF-8 otherwise.
    fn decode_partial_name(bytes: &[u8]) -> String {
        match bytes {
            [0xFE, 0xFF, utf16 @ ..] => {
                // A trailing odd byte cannot form a code unit and is dropped.
                let units: Vec<u16> = utf16
                    .chunks_exact(2)
                    .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                    .collect();
                String::from_utf16_lossy(&units)
            }
            _ => String::from_utf8_lossy(bytes).into_owned(),
        }
    }

    if depth >= MAX_FIELD_DEPTH {
        return Err(PdfError::InvalidStructure(
            "AcroForm field tree exceeds maximum depth".to_string(),
        ));
    }

    let node = reader
        .get_object(node_ref.0, node_ref.1)
        .map_err(|e| PdfError::InvalidStructure(format!("resolve field object: {e}")))?
        .as_dict()
        .cloned()
        .ok_or_else(|| {
            PdfError::InvalidStructure("field object is not a dictionary".to_string())
        })?;

    let partial = node
        .get("T")
        .and_then(|o| o.as_string())
        .map(|s| decode_partial_name(s.as_bytes()));
    let full_name = match (&partial, parent_prefix.is_empty()) {
        (Some(t), true) => t.clone(),
        (Some(t), false) => format!("{parent_prefix}.{t}"),
        (None, _) => parent_prefix.to_string(),
    };

    let kids: Vec<(u32, u16)> = match node.get("Kids") {
        Some(PdfObject::Array(arr)) => arr.0.iter().filter_map(|o| o.as_reference()).collect(),
        // The /Kids array may be stored as its own indirect object. Failing to
        // resolve it is treated like "no kids", matching the lenient lookup below.
        Some(PdfObject::Reference(n, g)) => match reader.get_object(*n, *g) {
            Ok(PdfObject::Array(arr)) => {
                arr.0.iter().filter_map(|o| o.as_reference()).collect()
            }
            _ => Vec::new(),
        },
        _ => Vec::new(),
    };

    // Kids without /T are not named sub-fields, so this node is the field
    // that receives the value.
    let kids_are_subfields = kids.iter().any(|(n, g)| {
        reader
            .get_object(*n, *g)
            .ok()
            .and_then(|o| o.as_dict())
            .map(|d| d.contains_key("T"))
            .unwrap_or(false)
    });

    if kids.is_empty() || !kids_are_subfields {
        if partial.is_some() {
            out.insert(full_name, node_ref);
        }
    } else {
        for kid in kids {
            collect_fields(reader, kid, &full_name, out, depth + 1)?;
        }
    }
    Ok(())
}
/// Produces a new PDF revision that sets `/V` on each named field.
///
/// The result is `base_bytes` followed by: one rewritten object per distinct
/// field, the rewritten `/AcroForm` dictionary with `/NeedAppearances true`,
/// a partial cross-reference section for those objects, and a trailer whose
/// `/Prev` points at the base cross-reference offset. When the base trailer has
/// an `/ID`, its first element is kept and a new second element is derived.
///
/// Values made only of printable ASCII, tab, CR and LF are written as-is; any
/// other value is written as UTF-16BE with a leading `FE FF` marker so that it
/// is a well-formed PDF text string.
///
/// # Errors
/// * `PdfError::InvalidStructure` - the base cannot be parsed, lacks `/Root`
///   or `/Size`, has no indirect `/AcroForm`, or a field object is malformed.
/// * `PdfError::PermissionDenied` - the base document is encrypted.
/// * `PdfError::FieldNotFound` - a requested name is not a known field.
fn fill_many_impl(base_bytes: &[u8], fields: &[(&str, &str)]) -> Result<Vec<u8>> {
    /// Encodes `value` as PDF text-string bytes (plain ASCII or BOM-prefixed UTF-16BE).
    fn encode_text_string(value: &str) -> Vec<u8> {
        let plain = value
            .bytes()
            .all(|b| matches!(b, b'\t' | b'\n' | b'\r' | 0x20..=0x7E));
        if plain {
            return value.as_bytes().to_vec();
        }
        let mut encoded = vec![0xFE, 0xFF];
        encoded.extend(value.encode_utf16().flat_map(u16::to_be_bytes));
        encoded
    }

    let mut reader = PdfReader::new(Cursor::new(base_bytes))
        .map_err(|e| PdfError::InvalidStructure(format!("parse base PDF: {e}")))?;
    if reader.is_encrypted() {
        return Err(PdfError::PermissionDenied(
            "incremental form fill is not supported on encrypted PDFs".to_string(),
        ));
    }

    // Everything the new trailer must carry over from the base revision.
    let base_startxref = reader.trailer().xref_offset;
    let base_root = reader
        .trailer()
        .root()
        .map_err(|e| PdfError::InvalidStructure(format!("base /Root: {e}")))?;
    let base_size = reader
        .trailer()
        .size()
        .map_err(|e| PdfError::InvalidStructure(format!("base /Size: {e}")))?;
    let base_id_first: Option<Vec<u8>> = first_id_bytes(reader.trailer().id());

    let (acro_ref, mut acro_dict) = resolve_acroform_object(&mut reader)?;
    let field_map = resolve_acroform_fields(&mut reader)?;

    let mut modified: Vec<(u32, u16, PdfDictionary)> = Vec::new();
    for (name, value) in fields {
        let (num, gen) = field_map
            .get(*name)
            .copied()
            .ok_or_else(|| PdfError::FieldNotFound((*name).to_string()))?;
        let mut field_dict = reader
            .get_object(num, gen)
            .map_err(|e| PdfError::InvalidStructure(format!("resolve field {name}: {e}")))?
            .as_dict()
            .cloned()
            .ok_or_else(|| {
                PdfError::InvalidStructure(format!("field {name} is not a dictionary"))
            })?;
        field_dict.insert(
            "V".to_string(),
            PdfObject::String(PdfString(encode_text_string(value))),
        );
        // The same field named twice is emitted once; the last value wins.
        match modified.iter_mut().find(|(n, g, _)| *n == num && *g == gen) {
            Some(slot) => slot.2 = field_dict,
            None => modified.push((num, gen, field_dict)),
        }
    }
    acro_dict.insert("NeedAppearances".to_string(), PdfObject::Boolean(true));

    let mut out = Vec::with_capacity(base_bytes.len() + 1024);
    out.extend_from_slice(base_bytes);
    // If the base ends right after `%%EOF` with no end-of-line, the first
    // appended object header would become part of that comment line.
    if !matches!(out.last().copied(), Some(b'\n' | b'\r')) {
        out.push(b'\n');
    }

    let mut changed: Vec<(u32, u16, u64)> = Vec::new();
    for (num, gen, dict) in &modified {
        let offset = out.len() as u64;
        write_indirect_object(&mut out, *num, *gen, dict)?;
        changed.push((*num, *gen, offset));
    }
    let acro_offset = out.len() as u64;
    write_indirect_object(&mut out, acro_ref.0, acro_ref.1, &acro_dict)?;
    changed.push((acro_ref.0, acro_ref.1, acro_offset));

    let xref_pos = out.len() as u64;
    let id_pair = base_id_first.map(|first| {
        let second = derive_revision_id(&first, fields, xref_pos);
        (first, second)
    });
    out.extend_from_slice(&write_partial_xref_section(&changed));
    out.extend_from_slice(&write_incremental_trailer(
        base_startxref,
        base_root,
        base_size,
        xref_pos,
        id_pair,
    ));
    Ok(out)
}
/// Returns the object reference and a copy of the `/AcroForm` dictionary.
///
/// # Errors
/// Returns `PdfError::InvalidStructure` when the catalog cannot be read, when
/// `/AcroForm` is not an indirect reference, when the reference cannot be
/// resolved, or when it does not resolve to a dictionary.
fn resolve_acroform_object(
    reader: &mut PdfReader<Cursor<&[u8]>>,
) -> Result<((u32, u16), PdfDictionary)> {
    let catalog = reader
        .catalog()
        .map_err(|e| PdfError::InvalidStructure(format!("read catalog: {e}")))?
        .clone();

    // Only an indirect /AcroForm can be rewritten under its own object number.
    let acro_ref = match catalog.get("AcroForm") {
        Some(PdfObject::Reference(n, g)) => (*n, *g),
        _ => {
            return Err(PdfError::InvalidStructure(
                "/AcroForm must be an indirect reference for incremental fill".to_string(),
            ))
        }
    };

    let resolved = reader
        .get_object(acro_ref.0, acro_ref.1)
        .map_err(|e| PdfError::InvalidStructure(format!("resolve /AcroForm: {e}")))?;
    let dict = match resolved.as_dict() {
        Some(found) => found.clone(),
        None => {
            return Err(PdfError::InvalidStructure(
                "/AcroForm is not a dictionary".to_string(),
            ))
        }
    };
    Ok((acro_ref, dict))
}
/// Extracts the bytes of the first element of a trailer `/ID` array.
///
/// Returns `None` when `id` is absent, is not an array, is empty, or its first
/// element is not a string.
fn first_id_bytes(id: Option<&PdfObject>) -> Option<Vec<u8>> {
    if let Some(PdfObject::Array(elements)) = id {
        let head = elements.0.first()?;
        let text = head.as_string()?;
        Some(text.as_bytes().to_vec())
    } else {
        None
    }
}
fn write_indirect_object(
out: &mut Vec<u8>,
num: u32,
gen: u16,
dict: &PdfDictionary,
) -> Result<()> {
out.extend_from_slice(format!("{num} {gen} obj\n").as_bytes());
write_object_value(out, &PdfObject::Dictionary(dict.clone()))?;
out.extend_from_slice(b"\nendobj\n");
Ok(())
}
/// Appends the PDF syntax for `obj` to `out`.
///
/// Arrays and dictionaries are written recursively, with array items separated
/// by a single space.
///
/// # Errors
/// Returns `PdfError::InvalidStructure` when a stream object is encountered.
/// Bytes written before the stream was reached stay in `out`.
fn write_object_value(out: &mut Vec<u8>, obj: &PdfObject) -> Result<()> {
    match obj {
        PdfObject::Stream(_) => {
            return Err(PdfError::InvalidStructure(
                "unexpected stream object in AcroForm field dictionary".to_string(),
            ));
        }
        PdfObject::Null => out.extend_from_slice(b"null"),
        PdfObject::Boolean(flag) => {
            let token: &[u8] = if *flag { b"true" } else { b"false" };
            out.extend_from_slice(token);
        }
        PdfObject::Integer(value) => {
            let digits = value.to_string();
            out.extend_from_slice(digits.as_bytes());
        }
        PdfObject::Real(value) => {
            let digits = format_real(*value);
            out.extend_from_slice(digits.as_bytes());
        }
        PdfObject::String(text) => write_literal_string(out, text.as_bytes()),
        PdfObject::Name(name) => write_name(out, name),
        PdfObject::Reference(num, gen) => {
            let reference = format!("{num} {gen} R");
            out.extend_from_slice(reference.as_bytes());
        }
        PdfObject::Array(items) => {
            out.push(b'[');
            let mut elements = items.0.iter();
            if let Some(head) = elements.next() {
                write_object_value(out, head)?;
                for element in elements {
                    out.push(b' ');
                    write_object_value(out, element)?;
                }
            }
            out.push(b']');
        }
        PdfObject::Dictionary(entries) => write_dict(out, entries)?,
    }
    Ok(())
}
fn write_dict(out: &mut Vec<u8>, dict: &PdfDictionary) -> Result<()> {
out.extend_from_slice(b"<< ");
let mut keys: Vec<&PdfName> = dict.0.keys().collect();
keys.sort_by(|a, b| a.0.cmp(&b.0));
for key in keys {
write_name(out, key);
out.extend_from_slice(b" ");
write_object_value(out, &dict.0[key])?;
out.extend_from_slice(b" ");
}
out.extend_from_slice(b">>");
Ok(())
}
/// Appends `name` to `out` as a PDF name token with a leading `/`.
///
/// ASCII letters, digits and the characters `+ - . _ @ $ : ; * ?` are copied
/// unchanged; every other byte is written as `#` followed by two uppercase
/// hexadecimal digits.
fn write_name(out: &mut Vec<u8>, name: &PdfName) {
    const PASS_THROUGH: &[u8] = b"+-._@$:;*?";
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    out.push(b'/');
    for byte in name.0.as_bytes().iter().copied() {
        if byte.is_ascii_alphanumeric() || PASS_THROUGH.contains(&byte) {
            out.push(byte);
            continue;
        }
        let high = HEX_DIGITS[usize::from(byte >> 4)];
        let low = HEX_DIGITS[usize::from(byte & 0x0F)];
        out.extend_from_slice(&[b'#', high, low]);
    }
}
/// Appends `bytes` to `out` as a parenthesised PDF literal string.
///
/// Parentheses and backslashes are backslash-escaped; LF, CR and tab use the
/// `\n`, `\r`, `\t` escapes; other bytes in `0x20..=0x7E` are copied; every
/// remaining byte is written as a three-digit octal escape.
fn write_literal_string(out: &mut Vec<u8>, bytes: &[u8]) {
    out.push(b'(');
    for &byte in bytes {
        // Second character of a two-byte `\x` escape, when one applies.
        let short_escape = match byte {
            b'(' | b')' | b'\\' => Some(byte),
            b'\n' => Some(b'n'),
            b'\r' => Some(b'r'),
            b'\t' => Some(b't'),
            _ => None,
        };
        if let Some(code) = short_escape {
            out.extend_from_slice(&[b'\\', code]);
        } else if (0x20..=0x7E).contains(&byte) {
            out.push(byte);
        } else {
            let octal = [
                b'\\',
                b'0' + (byte >> 6),
                b'0' + ((byte >> 3) & 0o7),
                b'0' + (byte & 0o7),
            ];
            out.extend_from_slice(&octal);
        }
    }
    out.push(b')');
}
/// Formats `f` as a PDF numeric token without exponent notation.
///
/// * Non-finite values are written as `0`.
/// * Integral values are written without a decimal point, including values
///   outside the `i64` range.
/// * Other values are written with at most six fractional digits and trailing
///   zeros removed; a value that rounds to zero is written as `0`, never `-0`.
fn format_real(f: f64) -> String {
    // 2^63: the first magnitude that no longer fits in an i64.
    const I64_LIMIT: f64 = 9_223_372_036_854_775_808.0;

    if !f.is_finite() {
        return "0".to_string();
    }
    if f == f.trunc() {
        if (-I64_LIMIT..I64_LIMIT).contains(&f) {
            return (f as i64).to_string();
        }
        // `as i64` would saturate here; the float formatter prints all digits.
        return format!("{f:.0}");
    }
    let mut s = format!("{f:.6}");
    while s.ends_with('0') {
        s.pop();
    }
    if s.ends_with('.') {
        s.pop();
    }
    if s == "-0" {
        s.remove(0);
    }
    s
}
/// Builds a classic `xref` section covering only the given objects.
///
/// `changed` holds `(object number, generation, byte offset)` triples in any
/// order. Entries are sorted by object number and grouped into subsections of
/// consecutive numbers, each introduced by a `first count` header line. Every
/// entry is an in-use (`n`) record with a ten-digit offset and a five-digit
/// generation. When an object number occurs more than once, only its last
/// occurrence in `changed` is written, so each object appears at most once.
fn write_partial_xref_section(changed: &[(u32, u16, u64)]) -> Vec<u8> {
    let mut entries = changed.to_vec();
    // The sort is stable, so duplicates keep their input order and the
    // dedup below can let the later one overwrite the earlier one.
    entries.sort_by_key(|&(num, _, _)| num);
    entries.dedup_by(|later, kept| {
        if later.0 == kept.0 {
            *kept = *later;
            true
        } else {
            false
        }
    });

    let mut out = Vec::new();
    out.extend_from_slice(b"xref\n");

    let mut rest = entries.as_slice();
    while let Some(first) = rest.first() {
        // Length of the leading run of consecutive object numbers.
        let run_len = 1 + rest
            .windows(2)
            .take_while(|pair| pair[0].0.checked_add(1) == Some(pair[1].0))
            .count();
        let (run, tail) = rest.split_at(run_len);
        out.extend_from_slice(format!("{} {}\n", first.0, run.len()).as_bytes());
        for &(_, generation, offset) in run {
            out.extend_from_slice(format!("{offset:010} {generation:05} n \n").as_bytes());
        }
        rest = tail;
    }
    out
}
/// Builds the trailer dictionary, `startxref` pointer and `%%EOF` marker.
///
/// * `base_prev_xref` - written as `/Prev`.
/// * `base_root` - object number and generation written as `/Root`.
/// * `base_size` - written as `/Size`.
/// * `new_xref_pos` - written on the line after `startxref`.
/// * `id_pair` - when present, written as `/ID` with both elements as
///   uppercase hexadecimal strings.
fn write_incremental_trailer(
    base_prev_xref: u64,
    base_root: (u32, u16),
    base_size: u32,
    new_xref_pos: u64,
    id_pair: Option<(Vec<u8>, Vec<u8>)>,
) -> Vec<u8> {
    /// Renders bytes as uppercase hexadecimal, two digits per byte.
    fn to_hex(bytes: &[u8]) -> String {
        bytes.iter().map(|b| format!("{b:02X}")).collect()
    }

    let (root_num, root_gen) = base_root;
    let mut text = String::from("trailer\n<< ");
    text.push_str(&format!("/Size {base_size} "));
    text.push_str(&format!("/Root {root_num} {root_gen} R "));
    text.push_str(&format!("/Prev {base_prev_xref} "));
    if let Some((first, second)) = id_pair {
        text.push_str(&format!("/ID [<{}> <{}>] ", to_hex(&first), to_hex(&second)));
    }
    text.push_str(">>\n");
    text.push_str("startxref\n");
    text.push_str(&format!("{new_xref_pos}\n"));
    text.push_str("%%EOF\n");
    text.into_bytes()
}
/// Derives the second `/ID` element for the new revision.
///
/// The result is the MD5 digest of: `first`, then every field name and value
/// each followed by a zero byte, then `xref_pos` as little-endian bytes. The
/// same inputs always give the same identifier.
fn derive_revision_id(first: &[u8], fields: &[(&str, &str)], xref_pos: u64) -> Vec<u8> {
    let mut material = first.to_vec();
    for &(name, value) in fields {
        // A zero byte ends each part so that ("ab", "c") and ("a", "bc") differ.
        for part in [name, value] {
            material.extend_from_slice(part.as_bytes());
            material.push(0);
        }
    }
    material.extend_from_slice(&xref_pos.to_le_bytes());
    let digest = md5::compute(&material);
    digest.0.to_vec()
}
// Unit tests for the byte-level writers; none of them parses a real PDF.
#[cfg(test)]
mod tests {
use super::*;
// Non-adjacent object numbers must each get their own `first count` subsection.
#[test]
fn partial_xref_groups_contiguous_subsections() {
let bytes = write_partial_xref_section(&[(5, 0, 1024), (7, 0, 2048)]);
let s = String::from_utf8(bytes).unwrap();
assert!(s.starts_with("xref\n"), "must start with xref keyword");
assert!(s.contains("5 1\n0000001024 00000 n \n"), "obj 5 subsection");
assert!(s.contains("7 1\n0000002048 00000 n \n"), "obj 7 subsection");
assert!(!s.contains("\n6 "), "gap object 6 must not appear");
}
// Unsorted but consecutive object numbers collapse into one sorted subsection.
#[test]
fn partial_xref_merges_adjacent_ids() {
let bytes = write_partial_xref_section(&[(7, 0, 2048), (5, 0, 1024), (6, 0, 1536)]);
let s = String::from_utf8(bytes).unwrap();
assert!(
s.contains("5 3\n"),
"contiguous ids form one subsection: {s}"
);
assert!(s.contains("0000001024 00000 n \n0000001536 00000 n \n0000002048 00000 n \n"));
}
// Without an id pair the trailer carries /Size, /Root and /Prev and ends with
// the startxref pointer and the %%EOF marker.
#[test]
fn incremental_trailer_carries_root_prev_size() {
let bytes = write_incremental_trailer(312, (1, 0), 8, 5000, None);
let s = String::from_utf8(bytes).unwrap();
assert!(s.contains("/Prev 312"), "must chain /Prev: {s}");
assert!(s.contains("/Root 1 0 R"), "must reuse base /Root");
assert!(s.contains("/Size 8"), "must carry /Size");
assert!(s.ends_with("startxref\n5000\n%%EOF\n"), "suffix: {s}");
assert!(!s.contains("/Info"), "no /Info in incremental trailer");
}
// A supplied id pair is emitted as two uppercase hex strings.
#[test]
fn incremental_trailer_emits_distinct_id_pair() {
let bytes = write_incremental_trailer(
10,
(2, 0),
5,
99,
Some((vec![0xDE, 0xAD], vec![0xBE, 0xEF])),
);
let s = String::from_utf8(bytes).unwrap();
assert!(s.contains("/ID [<DEAD> <BEEF>]"), "distinct id pair: {s}");
}
// The derived id is reproducible, depends on the field values, and is a
// 16-byte digest different from the permanent first element.
#[test]
fn revision_id_differs_from_permanent_and_is_deterministic() {
let first = vec![1u8, 2, 3, 4];
let a = derive_revision_id(&first, &[("name", "Ada")], 100);
let b = derive_revision_id(&first, &[("name", "Ada")], 100);
let c = derive_revision_id(&first, &[("name", "Grace")], 100);
assert_eq!(a, b, "same inputs -> same id (reproducible)");
assert_ne!(
a, first,
"second element must differ from the permanent one"
);
assert_ne!(a, c, "different content -> different revision id");
assert_eq!(a.len(), 16, "PDF /ID elements are 16 bytes");
}
// Parentheses and backslashes inside a literal string must be escaped.
#[test]
fn literal_string_escapes_reserved_bytes() {
let mut out = Vec::new();
write_literal_string(&mut out, b"a(b)c\\d");
assert_eq!(out, b"(a\\(b\\)c\\\\d)");
}
}