use std::collections::{HashMap, HashSet};
use std::io;
use std::path::Path;
/// Errors that can occur while opening or parsing a PDF file.
#[derive(Debug, PartialEq)]
pub enum PdfReadError {
    /// The data does not begin with a `%PDF-` header.
    NotAPdf,
    /// No `startxref` keyword with a usable offset near the end of the file.
    StartxrefNotFound,
    /// The cross-reference table is missing or could not be parsed.
    MalformedXref,
    /// The trailer dictionary is missing, unparsable, or lacks `/Root`.
    MalformedTrailer,
    /// The file uses PDF 1.5+ cross-reference streams, which this reader
    /// does not handle.
    XrefStreamNotSupported,
    /// The given object number could not be located or its body parsed.
    UnresolvableObject(u32),
    /// The catalog / page tree lacks required entries (`/Pages`, `/Count`, …).
    MalformedPageTree,
    /// An underlying I/O failure, carrying the error's display text.
    Io(String),
}
impl std::fmt::Display for PdfReadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
PdfReadError::NotAPdf => write!(f, "not a PDF file"),
PdfReadError::StartxrefNotFound => write!(f, "startxref not found"),
PdfReadError::MalformedXref => write!(f, "malformed or missing xref table"),
PdfReadError::MalformedTrailer => write!(f, "malformed or missing trailer"),
PdfReadError::XrefStreamNotSupported => {
write!(
f,
"cross-reference streams (PDF 1.5+) are not yet supported"
)
}
PdfReadError::UnresolvableObject(n) => write!(f, "cannot resolve object {}", n),
PdfReadError::MalformedPageTree => write!(f, "malformed page tree"),
PdfReadError::Io(msg) => write!(f, "I/O error: {}", msg),
}
}
}
impl std::error::Error for PdfReadError {}
impl From<io::Error> for PdfReadError {
fn from(e: io::Error) -> Self {
PdfReadError::Io(e.to_string())
}
}
/// Minimal read-only parser for classic (pre-1.5, table-based xref) PDFs.
///
/// Holds the whole file in memory together with the cross-reference map
/// parsed from it.
pub struct PdfReader {
    // Entire file contents.
    data: Vec<u8>,
    // Cross-reference map: object number -> byte offset into `data`.
    xref: HashMap<u32, usize>,
    // Version string taken from the `%PDF-` header (e.g. "1.4").
    version: String,
    // Page count read from the root /Pages node's /Count entry.
    page_count: usize,
    // Object number of the document catalog (trailer /Root).
    catalog_num: u32,
}
impl PdfReader {
    /// Reads and parses a PDF file from disk.
    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self, PdfReadError> {
        let bytes = std::fs::read(path.as_ref())?;
        Self::from_bytes(bytes)
    }

    /// Parses a PDF held entirely in memory: header version, `startxref`,
    /// xref table + trailer, then the page count from the page tree root.
    pub fn from_bytes(data: Vec<u8>) -> Result<Self, PdfReadError> {
        let version = parse_version(&data)?;
        let xref_offset = find_startxref(&data)?;
        let (xref, catalog_num) = parse_xref_and_trailer(&data, xref_offset)?;
        let page_count = resolve_page_count(&data, &xref, catalog_num)?;
        Ok(PdfReader {
            data,
            xref,
            version,
            page_count,
            catalog_num,
        })
    }

    /// Number of pages recorded in the root `/Pages` node.
    pub fn page_count(&self) -> usize {
        self.page_count
    }

    /// Version string from the `%PDF-` header (e.g. "1.4").
    pub fn pdf_version(&self) -> &str {
        &self.version
    }

    /// Object numbers of all `/Page` leaves, in page-tree order.
    #[allow(dead_code)]
    pub(crate) fn page_object_numbers(&self) -> Result<Vec<u32>, PdfReadError> {
        let catalog = resolve_dict(&self.data, &self.xref, self.catalog_num)?;
        // Missing or non-numeric /Pages both count as a malformed tree.
        let pages_num = catalog
            .get("Pages")
            .and_then(|r| r.parse::<u32>().ok())
            .ok_or(PdfReadError::MalformedPageTree)?;
        let mut pages = Vec::new();
        walk_page_tree(&self.data, &self.xref, pages_num, &mut pages)?;
        Ok(pages)
    }

    /// Transitive closure of objects reachable from `roots` via `N G R`
    /// indirect references found in each object's raw bytes.
    #[allow(dead_code)]
    pub(crate) fn collect_closure(&self, roots: &[u32]) -> Result<HashSet<u32>, PdfReadError> {
        let mut seen: HashSet<u32> = HashSet::new();
        let mut pending: Vec<u32> = roots.to_vec();
        while let Some(num) = pending.pop() {
            if !seen.insert(num) {
                continue; // already visited
            }
            // Objects that cannot be resolved are skipped, not fatal.
            let Ok(body) = self.raw_object_bytes(num) else { continue };
            pending.extend(
                extract_indirect_refs(body)
                    .into_iter()
                    .filter(|r| !seen.contains(r) && self.xref.contains_key(r)),
            );
        }
        Ok(seen)
    }

    /// Raw bytes of object `obj_num`, from its recorded offset through the
    /// trailing `endobj` keyword.
    #[allow(dead_code)]
    pub(crate) fn raw_object_bytes(&self, obj_num: u32) -> Result<&[u8], PdfReadError> {
        let err = || PdfReadError::UnresolvableObject(obj_num);
        let offset = *self.xref.get(&obj_num).ok_or_else(err)?;
        // `get` rejects offsets past the end without panicking.
        let tail = self.data.get(offset..).ok_or_else(err)?;
        let end = tail
            .windows(6)
            .position(|w| w == b"endobj")
            .ok_or_else(err)?;
        Ok(&tail[..end + 6])
    }
}
/// Extracts the version string from the `%PDF-x.y` header line.
///
/// Returns `NotAPdf` when the header is absent, too short, or the version
/// bytes are not valid UTF-8.
fn parse_version(data: &[u8]) -> Result<String, PdfReadError> {
    if data.len() < 8 {
        return Err(PdfReadError::NotAPdf);
    }
    let rest = data.strip_prefix(b"%PDF-").ok_or(PdfReadError::NotAPdf)?;
    // The version runs until the first newline, carriage return, or space.
    let len = rest
        .iter()
        .take_while(|b| !matches!(**b, b'\n' | b'\r' | b' '))
        .count();
    std::str::from_utf8(&rest[..len])
        .map(str::to_owned)
        .map_err(|_| PdfReadError::NotAPdf)
}
/// Locates the last `startxref` keyword near the end of the file and
/// returns the byte offset it points at.
fn find_startxref(data: &[u8]) -> Result<usize, PdfReadError> {
    const KEYWORD: &[u8] = b"startxref";
    // Only the final 1 KiB is scanned; `startxref` conventionally sits just
    // before %%EOF.
    let tail = &data[data.len().saturating_sub(1024)..];
    let pos = tail
        .windows(KEYWORD.len())
        .rposition(|w| w == KEYWORD)
        .ok_or(PdfReadError::StartxrefNotFound)?;
    let offset: usize = skip_whitespace_to_token(&tail[pos + KEYWORD.len()..])
        .and_then(|tok| tok.parse().ok())
        .ok_or(PdfReadError::StartxrefNotFound)?;
    // An offset at or past the end of the file cannot be valid.
    if offset < data.len() {
        Ok(offset)
    } else {
        Err(PdfReadError::StartxrefNotFound)
    }
}
/// Parses the cross-reference table at `xref_offset` plus the trailer,
/// returning the offset map and the catalog (`/Root`) object number.
fn parse_xref_and_trailer(
    data: &[u8],
    xref_offset: usize,
) -> Result<(HashMap<u32, usize>, u32), PdfReadError> {
    if xref_offset >= data.len() {
        return Err(PdfReadError::MalformedXref);
    }
    let section = &data[xref_offset..];
    // A classic table starts with the keyword "xref"; anything else here is
    // assumed to be a PDF 1.5+ cross-reference stream.
    if !skip_ascii_whitespace(section).starts_with(b"xref") {
        return Err(PdfReadError::XrefStreamNotSupported);
    }
    let table = parse_xref_table(section)?;
    let root = parse_trailer_root(data, xref_offset)?;
    Ok((table, root))
}
/// Parses a classic cross-reference table (`xref` keyword followed by one
/// or more subsections) into a map of object number -> byte offset.
///
/// Only in-use (`n`) entries are recorded; free (`f`) entries and object 0
/// are skipped. Returns `MalformedXref` on any structural violation.
fn parse_xref_table(section: &[u8]) -> Result<HashMap<u32, usize>, PdfReadError> {
    let mut map = HashMap::new();
    // Consume the leading "xref" keyword; what follows is a sequence of
    // subsections.
    let rest = skip_ascii_whitespace(consume_token(section, b"xref")?);
    let mut cursor = rest;
    loop {
        let trimmed = skip_ascii_whitespace(cursor);
        // The table ends at EOF or at the "trailer" keyword.
        if trimmed.is_empty() || trimmed.starts_with(b"trailer") {
            break;
        }
        // Subsection header: "<first-object-number> <entry-count>".
        let (first_obj_str, after_first) =
            next_token(trimmed).ok_or(PdfReadError::MalformedXref)?;
        let first_obj: u32 = first_obj_str
            .parse()
            .map_err(|_| PdfReadError::MalformedXref)?;
        let after_first = skip_ascii_whitespace(after_first);
        let (count_str, after_count) =
            next_token(after_first).ok_or(PdfReadError::MalformedXref)?;
        let count: usize = count_str.parse().map_err(|_| PdfReadError::MalformedXref)?;
        // Entries begin on the next line. Each entry is exactly 20 bytes:
        // 10-digit offset, space, 5-digit generation, space, status, 2-byte EOL.
        let entries_start = skip_line(after_count);
        let entry_size = 20;
        let entries_bytes = entries_start.len();
        if entries_bytes < count * entry_size {
            return Err(PdfReadError::MalformedXref);
        }
        for i in 0..count {
            let entry = &entries_start[i * entry_size..(i + 1) * entry_size];
            let offset_bytes = &entry[..10];
            // Byte 17 is the status flag: 'n' = in use, 'f' = free.
            let status = entry[17];
            if status == b'n' {
                let offset_str =
                    std::str::from_utf8(offset_bytes).map_err(|_| PdfReadError::MalformedXref)?;
                let offset: usize = offset_str
                    .parse()
                    .map_err(|_| PdfReadError::MalformedXref)?;
                let obj_num = first_obj + i as u32;
                // Object 0 heads the free list and is never a real object.
                if obj_num > 0 {
                    map.insert(obj_num, offset);
                }
            }
        }
        // Advance past this subsection's fixed-size entry block.
        cursor = &entries_start[count * entry_size..];
    }
    Ok(map)
}
/// Finds the `trailer` dictionary that follows the xref table and returns
/// the object number of the document catalog (`/Root N G R`).
///
/// # Errors
/// `MalformedTrailer` when the keyword, the dictionary, or the `/Root`
/// entry is missing or unparsable (or `xref_offset` is out of range).
fn parse_trailer_root(data: &[u8], xref_offset: usize) -> Result<u32, PdfReadError> {
    // Bounds-checked slice so a corrupt startxref offset cannot panic here.
    let section = data
        .get(xref_offset..)
        .ok_or(PdfReadError::MalformedTrailer)?;
    let pos = section
        .windows(7)
        .position(|w| w == b"trailer")
        .ok_or(PdfReadError::MalformedTrailer)?;
    // Fixed: this borrow had been corrupted into the mojibake `§ion[...]`
    // (an HTML-escaped `&section`), which does not compile.
    let after_trailer = skip_ascii_whitespace(&section[pos + 7..]);
    let dict = parse_dict_bytes(after_trailer).ok_or(PdfReadError::MalformedTrailer)?;
    let root_ref = dict.get("Root").ok_or(PdfReadError::MalformedTrailer)?;
    root_ref
        .parse::<u32>()
        .map_err(|_| PdfReadError::MalformedTrailer)
}
/// Follows catalog -> `/Pages` (page-tree root) -> `/Count` to obtain the
/// document's page count.
fn resolve_page_count(
    data: &[u8],
    xref: &HashMap<u32, usize>,
    catalog_obj_num: u32,
) -> Result<usize, PdfReadError> {
    let catalog = resolve_dict(data, xref, catalog_obj_num)?;
    // A missing or non-numeric entry both indicate a malformed page tree.
    let pages_num = catalog
        .get("Pages")
        .and_then(|v| v.parse::<u32>().ok())
        .ok_or(PdfReadError::MalformedPageTree)?;
    let pages = resolve_dict(data, xref, pages_num)?;
    pages
        .get("Count")
        .and_then(|v| v.parse::<usize>().ok())
        .ok_or(PdfReadError::MalformedPageTree)
}
/// Resolves `obj_num` through the xref map and parses the dictionary in its
/// body (`N G obj << ... >>`).
fn resolve_dict(
    data: &[u8],
    xref: &HashMap<u32, usize>,
    obj_num: u32,
) -> Result<HashMap<String, String>, PdfReadError> {
    let fail = || PdfReadError::UnresolvableObject(obj_num);
    let offset = *xref.get(&obj_num).ok_or_else(fail)?;
    if offset >= data.len() {
        return Err(fail());
    }
    // Step past the "N G obj" header, then parse the dictionary after it.
    let body = skip_obj_header(&data[offset..]).ok_or_else(fail)?;
    parse_dict_bytes(skip_ascii_whitespace(body)).ok_or_else(fail)
}
#[allow(dead_code)]
/// Depth-first walk of the page tree rooted at `node`, appending the object
/// number of every `/Page` leaf to `result` in tree order.
fn walk_page_tree(
    data: &[u8],
    xref: &HashMap<u32, usize>,
    node: u32,
    result: &mut Vec<u32>,
) -> Result<(), PdfReadError> {
    let dict = resolve_dict(data, xref, node)?;
    let node_type = dict.get("Type").map(String::as_str);
    if node_type == Some("/Page") {
        // Leaf page object.
        result.push(node);
    } else if matches!(node_type, Some("/Pages") | None) {
        // Interior node (or typeless node): recurse into its /Kids.
        for kid in resolve_kids(data, xref, node)? {
            walk_page_tree(data, xref, kid, result)?;
        }
    }
    // Any other /Type is silently ignored.
    Ok(())
}
#[allow(dead_code)]
/// Returns the object numbers in the `/Kids` array of page-tree node
/// `obj_num`.
fn resolve_kids(
    data: &[u8],
    xref: &HashMap<u32, usize>,
    obj_num: u32,
) -> Result<Vec<u32>, PdfReadError> {
    let missing = || PdfReadError::UnresolvableObject(obj_num);
    let offset = *xref.get(&obj_num).ok_or_else(missing)?;
    if offset >= data.len() {
        return Err(missing());
    }
    // Limit the scan to this object's body (up to and including "endobj",
    // or the rest of the file if the keyword is absent).
    let body = &data[offset..];
    let end = body
        .windows(6)
        .position(|w| w == b"endobj")
        .map_or(body.len(), |p| p + 6);
    let body = &body[..end];
    let after_header = skip_obj_header(body).ok_or_else(missing)?;
    let dict_bytes = skip_ascii_whitespace(after_header);
    // Locate the /Kids key, then parse the reference array that follows.
    let kids_at = dict_bytes
        .windows(5)
        .position(|w| w == b"/Kids")
        .ok_or(PdfReadError::MalformedPageTree)?;
    let array = skip_ascii_whitespace(&dict_bytes[kids_at + 5..]);
    if !array.starts_with(b"[") {
        return Err(PdfReadError::MalformedPageTree);
    }
    parse_ref_array(array).ok_or(PdfReadError::MalformedPageTree)
}
#[allow(dead_code)]
/// Parses an array of indirect references (`[1 0 R 2 0 R ...]`), returning
/// the object numbers. Parsing stops at the first token run that does not
/// match the `N G R` pattern; `None` means tokenization itself failed.
fn parse_ref_array(data: &[u8]) -> Option<Vec<u32>> {
    debug_assert!(data.starts_with(b"["));
    let close = data.iter().position(|&b| b == b']')?;
    let mut cursor = &data[1..close];
    let mut nums = Vec::new();
    loop {
        cursor = skip_ascii_whitespace(cursor);
        if cursor.is_empty() {
            break;
        }
        // Consume one "N G R" triple.
        let (num_tok, rest) = next_token(cursor)?;
        let Ok(num) = num_tok.parse::<u32>() else { break };
        let (_gen_tok, rest) = next_token(skip_ascii_whitespace(rest))?;
        let (r_tok, rest) = next_token(skip_ascii_whitespace(rest))?;
        if r_tok != "R" {
            break;
        }
        nums.push(num);
        cursor = rest;
    }
    Some(nums)
}
#[allow(dead_code)]
/// Heuristically extracts the object numbers of all indirect references
/// (`N G R` token triples) appearing anywhere in `data`.
fn extract_indirect_refs(data: &[u8]) -> HashSet<u32> {
    fn is_delim(b: u8) -> bool {
        b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'[' | b']')
    }
    // Tokenize into maximal runs of non-delimiter bytes.
    let tokens: Vec<&[u8]> = data
        .split(|&b| is_delim(b))
        .filter(|t| !t.is_empty())
        .collect();
    let all_digits = |t: &[u8]| !t.is_empty() && t.iter().all(u8::is_ascii_digit);
    let mut refs = HashSet::new();
    // A reference is any "digits digits R" triple of consecutive tokens.
    for w in tokens.windows(3) {
        if w[2] == b"R" && all_digits(w[0]) && all_digits(w[1]) {
            let parsed = std::str::from_utf8(w[0])
                .ok()
                .and_then(|s| s.parse::<u32>().ok());
            if let Some(n) = parsed {
                refs.insert(n);
            }
        }
    }
    refs
}
/// Parses a flat view of a PDF dictionary (`<< /Key value ... >>`) into
/// string key/value pairs.
///
/// Nested dictionaries, arrays, and literal strings appearing as values are
/// skipped and NOT captured. An indirect-reference value `N G R` is stored
/// as just the object number `N`. Returns `None` on malformed input.
fn parse_dict_bytes(data: &[u8]) -> Option<HashMap<String, String>> {
    let data = skip_ascii_whitespace(data);
    if !data.starts_with(b"<<") {
        return None;
    }
    let mut map = HashMap::new();
    let mut cursor = &data[2..];
    loop {
        cursor = skip_ascii_whitespace(cursor);
        // ">>" closes the dictionary.
        if cursor.starts_with(b">>") {
            break;
        }
        // Anything that is not a /Name key is skipped token-by-token.
        if !cursor.starts_with(b"/") {
            let (_, rest) = next_token(cursor)?;
            cursor = rest;
            continue;
        }
        // Key name without its leading slash.
        let (key, after_key) = next_token(&cursor[1..])?;
        cursor = skip_ascii_whitespace(after_key);
        if cursor.starts_with(b"<<") {
            // Nested dictionary value: skip it entirely.
            cursor = skip_nested_dict(cursor)?;
        } else if cursor.starts_with(b"[") {
            // Array value: skip it entirely.
            cursor = skip_array(cursor)?;
        } else if cursor.starts_with(b"(") {
            // Literal string value: skip it entirely.
            cursor = skip_literal_string(cursor)?;
        } else {
            // Scalar value. Peek ahead two tokens to detect an indirect
            // reference of the form "N G R".
            let (val, rest) = next_token(cursor)?;
            cursor = skip_ascii_whitespace(rest);
            if let Some((gen_str, after_gen)) = next_token(cursor) {
                let after_gen_ws = skip_ascii_whitespace(after_gen);
                if let Some((r_str, after_r)) = next_token(after_gen_ws) {
                    if r_str == "R"
                        && val.chars().all(|c| c.is_ascii_digit())
                        && gen_str.chars().all(|c| c.is_ascii_digit())
                    {
                        // Indirect reference: store the object number and
                        // consume all three tokens.
                        map.insert(key.to_string(), val.to_string());
                        cursor = after_r;
                        continue;
                    }
                }
                // Not a reference: store the scalar; the peeked tokens were
                // never consumed (cursor still points just past `val`).
                map.insert(key.to_string(), val.to_string());
            } else {
                // `val` was the last token before ">>".
                map.insert(key.to_string(), val.to_string());
            }
        }
    }
    Some(map)
}
/// Skips a dictionary starting at `<<`, tracking nesting depth, and returns
/// the remainder after the matching `>>`. `None` if unterminated.
fn skip_nested_dict(data: &[u8]) -> Option<&[u8]> {
    debug_assert!(data.starts_with(b"<<"));
    let mut depth = 0i32;
    let mut i = 0;
    loop {
        match data.get(i..i + 2) {
            Some(b"<<") => {
                depth += 1;
                i += 2;
            }
            Some(b">>") => {
                depth -= 1;
                i += 2;
                if depth == 0 {
                    return Some(&data[i..]);
                }
            }
            Some(_) => i += 1,
            // Fewer than two bytes remain: no matching close exists.
            None => return None,
        }
    }
}
/// Skips an array value starting at `[`, returning the remainder after the
/// matching `]`.
///
/// Tracks bracket depth so nested arrays (e.g. `[[1 2] [3 4]]`) are skipped
/// as a whole; the previous implementation stopped at the first `]`, which
/// cut nested arrays short. Behavior for non-nested arrays is unchanged.
/// Returns `None` when the array is unterminated. Note: `]` inside literal
/// strings is not special-cased (same as before).
fn skip_array(data: &[u8]) -> Option<&[u8]> {
    debug_assert!(data.starts_with(b"["));
    let mut depth = 0i32;
    for (i, &b) in data.iter().enumerate() {
        match b {
            b'[' => depth += 1,
            b']' => {
                depth -= 1;
                if depth == 0 {
                    return Some(&data[i + 1..]);
                }
            }
            _ => {}
        }
    }
    None
}
/// Skips a literal string starting at `(`, honoring `\` escapes and
/// balanced inner parentheses, and returns the remainder after the matching
/// `)`. `None` if unterminated.
fn skip_literal_string(data: &[u8]) -> Option<&[u8]> {
    debug_assert!(data.starts_with(b"("));
    let mut open = 1i32;
    let mut idx = 1;
    while let Some(&byte) = data.get(idx) {
        idx += 1;
        match byte {
            // A backslash escapes the following byte.
            b'\\' => idx += 1,
            b'(' => open += 1,
            b')' => {
                open -= 1;
                if open == 0 {
                    return Some(&data[idx..]);
                }
            }
            _ => {}
        }
    }
    None
}
/// Steps past an object header of the form `N G obj`, returning the bytes
/// after the `obj` keyword. `None` when the third token is not "obj".
fn skip_obj_header(data: &[u8]) -> Option<&[u8]> {
    let (_num, rest) = next_token(data)?;
    let (_gen, rest) = next_token(skip_ascii_whitespace(rest))?;
    let (keyword, rest) = next_token(skip_ascii_whitespace(rest))?;
    (keyword == "obj").then_some(rest)
}
/// Returns `data` with any leading ASCII whitespace removed.
fn skip_ascii_whitespace(data: &[u8]) -> &[u8] {
    let skip = data.iter().take_while(|b| b.is_ascii_whitespace()).count();
    &data[skip..]
}
/// Returns everything after the first `\n`, or an empty slice when no
/// newline is present (or the newline is the final byte).
fn skip_line(data: &[u8]) -> &[u8] {
    match data.iter().position(|&b| b == b'\n') {
        Some(nl) if nl + 1 < data.len() => &data[nl + 1..],
        _ => &data[data.len()..],
    }
}
/// Requires `token` (after optional leading whitespace) and returns the
/// bytes following it; `MalformedXref` if the keyword is absent.
fn consume_token<'a>(data: &'a [u8], token: &[u8]) -> Result<&'a [u8], PdfReadError> {
    skip_ascii_whitespace(data)
        .strip_prefix(token)
        .ok_or(PdfReadError::MalformedXref)
}
/// Returns the next whitespace-delimited token (as UTF-8) and the remaining
/// bytes. `<` and `>` also terminate a token; when one of them is the very
/// first byte it is returned as a one-byte token of its own.
fn next_token(data: &[u8]) -> Option<(&str, &[u8])> {
    let data = skip_ascii_whitespace(data);
    if data.is_empty() {
        return None;
    }
    let is_term = |b: u8| b.is_ascii_whitespace() || b == b'<' || b == b'>';
    let len = match data.iter().position(|&b| is_term(b)) {
        Some(0) => 1, // leading delimiter becomes its own token
        Some(n) => n,
        None => data.len(),
    };
    let token = std::str::from_utf8(&data[..len]).ok()?;
    Some((token, &data[len..]))
}
/// Convenience wrapper: the next token's text, ignoring the remainder.
fn skip_whitespace_to_token(data: &[u8]) -> Option<&str> {
    next_token(data).map(|(tok, _)| tok)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::PdfDocument;

    /// Builds an in-memory PDF with `n` blank 612x792 (US-Letter) pages via
    /// the crate's writer, so the reader can be exercised round-trip.
    fn make_pdf(n: usize) -> Vec<u8> {
        let mut doc = PdfDocument::new(Vec::new()).unwrap();
        for _ in 0..n {
            doc.begin_page(612.0, 792.0);
            doc.end_page().unwrap();
        }
        doc.end_document().unwrap()
    }

    #[test]
    fn page_object_numbers_count_matches_page_count() {
        // Includes the zero-page edge case.
        for n in [0, 1, 3, 10] {
            let bytes = make_pdf(n);
            let reader = PdfReader::from_bytes(bytes).unwrap();
            let nums = reader.page_object_numbers().unwrap();
            assert_eq!(nums.len(), n, "expected {n} page objects");
        }
    }

    #[test]
    fn page_object_numbers_are_unique_and_positive() {
        let bytes = make_pdf(5);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let nums = reader.page_object_numbers().unwrap();
        // Object number 0 is reserved for the xref free list.
        assert!(
            nums.iter().all(|&n| n > 0),
            "all object numbers must be > 0"
        );
        let unique: HashSet<_> = nums.iter().collect();
        assert_eq!(unique.len(), nums.len(), "object numbers must be unique");
    }

    #[test]
    fn page_object_numbers_order_is_stable() {
        // The walk is deterministic, so repeated calls must agree.
        let bytes = make_pdf(4);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let first = reader.page_object_numbers().unwrap();
        let second = reader.page_object_numbers().unwrap();
        assert_eq!(first, second);
    }

    #[test]
    fn collect_closure_contains_seed_objects() {
        let bytes = make_pdf(2);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let page_nums = reader.page_object_numbers().unwrap();
        let closure = reader.collect_closure(&page_nums).unwrap();
        for &n in &page_nums {
            assert!(closure.contains(&n), "closure must include seed object {n}");
        }
    }

    #[test]
    fn collect_closure_includes_dependencies() {
        // A page references at least its parent /Pages node, so the closure
        // must be strictly larger than the seed set.
        let bytes = make_pdf(1);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let page_nums = reader.page_object_numbers().unwrap();
        let closure = reader.collect_closure(&page_nums).unwrap();
        assert!(
            closure.len() > page_nums.len(),
            "closure must include objects beyond the page nodes"
        );
    }

    #[test]
    fn collect_closure_empty_roots_returns_empty() {
        let bytes = make_pdf(1);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let closure = reader.collect_closure(&[]).unwrap();
        assert!(closure.is_empty());
    }

    #[test]
    fn raw_object_bytes_starts_with_obj_header_and_ends_with_endobj() {
        let bytes = make_pdf(1);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        let page_num = reader.page_object_numbers().unwrap()[0];
        let raw = reader.raw_object_bytes(page_num).unwrap();
        assert!(raw.windows(4).any(|w| w == b" obj"), "must contain ' obj'");
        assert!(raw.ends_with(b"endobj"), "must end with 'endobj'");
    }

    #[test]
    fn raw_object_bytes_error_on_missing_object() {
        let bytes = make_pdf(1);
        let reader = PdfReader::from_bytes(bytes).unwrap();
        assert!(
            reader.raw_object_bytes(99999).is_err(),
            "non-existent object must return Err"
        );
    }

    #[test]
    fn parse_dict_bytes_handles_nested_resources_dict() {
        // The nested /Font dict must be skipped, not merged into the outer map.
        let dict = b"<< /Type /Page /Resources << /Font << >> >> >>";
        let map = parse_dict_bytes(dict).expect("should parse successfully");
        assert_eq!(
            map.get("Type").map(String::as_str),
            Some("/Page"),
            "Type must be /Page, not overwritten by the inner Pages object"
        );
    }

    #[test]
    fn skip_nested_dict_returns_after_matching_close() {
        let data = b"<< /K << >> >> tail";
        let rest = skip_nested_dict(data).expect("should find closing >>");
        assert_eq!(rest, b" tail");
    }
}