1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
use std::borrow::Cow;
use std::collections::HashSet;
use tracing::warn;
use zpdf_core::{Error, ObjectId, PdfObject, Result};
use zpdf_parser::PdfFile;
use crate::page::{PdfPage, MAX_PAGE_TREE_DEPTH};
/// The document's page index: the `/Pages` root reference together with the
/// flattened list of leaf page objects found when the catalog was built.
pub struct Catalog {
/// Root of the page tree as named by `/Root` → `/Pages`. Holds the
/// placeholder `ObjectId(0, 0)` when that reference could not be resolved
/// and the pages were recovered by a fallback scan instead.
pub pages_ref: ObjectId,
/// Number of pages actually collected (the length of the page list at
/// construction time), not the tree's declared `/Count`.
pub page_count: usize,
/// Leaf page object ids, in the order they were collected.
page_refs: Vec<ObjectId>,
}
impl Catalog {
pub fn from_trailer(file: &PdfFile) -> Result<Self> {
// /Count is advisory only; the guarded kid walk determines the real
// page list (broken kids are skipped, cycles and over-deep chains pruned).
let pages_ref = Self::resolve_pages_ref(file);
let mut page_refs = Vec::new();
let mut visited = HashSet::new();
if let Some(pages_ref) = pages_ref {
Self::collect_page_refs(file, pages_ref, &mut page_refs, &mut visited, 0)?;
}
// Fallback: the /Root or /Pages tree was missing, null, or yielded no
// leaves — but the page objects often physically exist in the file (a
// broken xref, a catalog stranded in an /ObjStm, a /Root aimed at the
// wrong object, or a tree pruned by the cycle/depth guards). Mainstream
// readers degrade to a whole-document scan for /Type /Page; do the same.
if page_refs.is_empty() {
warn!("page tree unreachable via /Pages; scanning all objects for /Type /Page");
page_refs = file.find_objects_by_type("Page");
}
// Last resort: fuzzed files often byte-flip or drop the page's /Type
// (e.g. it parses as a Stream, or the type name is corrupted). Accept
// any "page-shaped" dict — carries /MediaBox or /Contents, is not a
// page-tree node (/Kids) or catalog (/Pages). Only reached when the
// document is already otherwise unopenable, so the loose heuristic
// cannot regress healthy files.
if page_refs.is_empty() {
warn!("no /Type /Page objects; scanning for page-shaped dicts");
page_refs = Self::scan_page_like(file);
}
if page_refs.is_empty() {
return Err(Error::InvalidObject(
0,
"page tree contains no usable pages".into(),
));
}
Ok(Self {
pages_ref: pages_ref.unwrap_or(ObjectId(0, 0)),
page_count: page_refs.len(),
page_refs,
})
}
/// Whole-document scan for "page-shaped" dicts: a leaf carries `/MediaBox`
/// or `/Contents`, is not an interior page-tree node (`/Kids`) and not the
/// catalog (`/Pages`). Used only when `/Type /Page` matching already came up
/// empty, to recover pages whose `/Type` was corrupted or dropped.
fn scan_page_like(file: &PdfFile) -> Vec<ObjectId> {
file.all_object_ids()
.into_iter()
.filter(|&id| {
let Ok(obj) = file.resolve(id) else {
return false;
};
let Ok(dict) = obj.as_dict() else {
return false;
};
dict.get("Kids").is_none()
&& dict.get("Pages").is_none()
&& (dict.get("MediaBox").is_some() || dict.get("Contents").is_some())
})
.collect()
}
/// Follow the trailer's `/Root` to the catalog and return its `/Pages`
/// reference.
///
/// Every failure along the way — no `/Root` reference in the trailer, a root
/// that cannot be resolved, a root that is not a dict, or a dict without a
/// `/Pages` reference — yields `None` rather than an error, so the caller can
/// fall back to a whole-document page scan instead of failing the open.
fn resolve_pages_ref(file: &PdfFile) -> Option<ObjectId> {
    let Ok(root_id) = file.trailer.get_ref("Root") else {
        return None;
    };
    let Ok(root_obj) = file.resolve(root_id) else {
        return None;
    };
    let Ok(root_dict) = root_obj.as_dict() else {
        return None;
    };
    root_dict.get_ref("Pages").ok()
}
/// Depth-first walk of the page tree starting at `node_id`, appending every
/// leaf reached to `refs` in traversal order.
///
/// The walk is forgiving by design: anything malformed is logged with `warn!`
/// and skipped instead of failing the open.
///
/// * `visited` records every node the walk has entered; a node reached a
///   second time (a cycle, or a subtree listed twice) is pruned.
/// * `depth` is the distance from the node the walk started at; anything
///   deeper than `MAX_PAGE_TREE_DEPTH` is pruned.
///
/// A node counts as interior when its `/Type` is `Pages`, as a leaf when its
/// `/Type` is `Page`, and otherwise as interior exactly when it has a `/Kids`
/// key — `/Type` is formally required but missing or wrong in real files.
///
/// The body itself never produces an error; the `Result` exists so that
/// recursive calls can be chained with `?`.
fn collect_page_refs(
    file: &PdfFile,
    node_id: ObjectId,
    refs: &mut Vec<ObjectId>,
    visited: &mut HashSet<ObjectId>,
    depth: usize,
) -> Result<()> {
    // The depth guard runs before the node is marked, so a node pruned for
    // depth is not recorded as visited.
    if depth > MAX_PAGE_TREE_DEPTH {
        warn!("page tree deeper than {MAX_PAGE_TREE_DEPTH} at {node_id}; pruning subtree");
        return Ok(());
    }
    let first_visit = visited.insert(node_id);
    if !first_visit {
        warn!("page tree cycle: node {node_id} already visited; pruning");
        return Ok(());
    }

    let node = match file.resolve(node_id) {
        Err(e) => {
            warn!("failed to resolve page tree node {node_id}: {e}; skipping");
            return Ok(());
        }
        Ok(PdfObject::Null) => {
            warn!("page tree node {node_id} resolves to null; skipping");
            return Ok(());
        }
        Ok(resolved) => resolved,
    };

    let dict = match node.as_dict() {
        Ok(d) => d,
        Err(_) => {
            warn!(
                "page tree node {node_id} is {}, expected Dict; skipping",
                node.type_name()
            );
            return Ok(());
        }
    };

    // Trust an explicit Pages/Page type; for anything else let the presence
    // of /Kids decide.
    let declared_type = dict.get_name("Type").ok();
    let is_interior = match declared_type {
        Some("Pages") => true,
        Some("Page") => false,
        _ => dict.get("Kids").is_some(),
    };

    if !is_interior {
        refs.push(node_id);
        return Ok(());
    }

    // /Kids is either an inline array (borrowed from the node) or an indirect
    // reference to one (resolved into an owned array).
    let kids: Cow<'_, [PdfObject]> = match dict.get("Kids") {
        Some(PdfObject::Array(a)) => Cow::Borrowed(a.as_slice()),
        Some(PdfObject::Ref(r)) => {
            let Ok(PdfObject::Array(a)) = file.resolve(*r) else {
                warn!("pages node {node_id}: /Kids ref {r} is not an array; skipping");
                return Ok(());
            };
            Cow::Owned(a)
        }
        _ => {
            warn!("pages node {node_id} has no /Kids array; skipping");
            return Ok(());
        }
    };

    for kid in kids.iter() {
        if let PdfObject::Ref(r) = kid {
            Self::collect_page_refs(file, *r, refs, visited, depth + 1)?;
        } else if matches!(kid, PdfObject::Null) {
            warn!("pages node {node_id}: null kid; skipping");
        } else {
            warn!(
                "pages node {node_id}: kid is {}, expected Ref; skipping",
                kid.type_name()
            );
        }
    }

    Ok(())
}
pub fn get_page(&self, file: &PdfFile, index: usize) -> Result<PdfPage> {
let page_ref =
self.page_refs.get(index).copied().ok_or_else(|| {
Error::InvalidObject(0, format!("page index {index} out of range"))
})?;
PdfPage::from_object(file, page_ref)
}
}
#[cfg(test)]
mod tests {
    use crate::page::MAX_PAGE_TREE_DEPTH;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    /// A pages node that names itself among its own kids must not make the
    /// walk loop; the one real page is still found.
    #[test]
    fn kids_cycle_is_pruned() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R 2 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
    }

    /// `99 0 R` points at nothing (skipped whether resolving it errors or
    /// yields null) and the literal `null` kid is skipped outright, leaving
    /// only the one valid page despite `/Count 3`.
    #[test]
    fn dangling_and_null_kids_are_skipped() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [99 0 R 3 0 R null] /Count 3 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
        assert!(doc.page(0).is_ok());
    }

    /// Neither tree node declares `/Type`: the presence of `/Kids` separates
    /// the interior node from the leaf, and the leaf still inherits the
    /// `/MediaBox` set on the untyped interior node.
    #[test]
    fn missing_type_nodes_tolerated() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Kids [3 0 R] /Count 1 /MediaBox [0 0 200 200] >>",
            "<< /Parent 2 0 R >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).expect("page");
        assert_eq!(page.media_box.width(), 200.0);
    }

    /// A well-formed tree with no kids, in a file holding no page objects at
    /// all, cannot be opened.
    #[test]
    fn empty_page_tree_is_an_error() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [] /Count 0 >>",
        ];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// Object 1 (the `/Root` target) is the literal null object. No page tree
    /// can be reached from it, and the file holds nothing page-like for a
    /// whole-document scan to find, so the open is expected to fail.
    #[test]
    fn null_root_is_a_hard_error() {
        let objects = ["null"];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// A single-kid `Pages` chain longer than the depth guard: the kid walk
    /// must terminate (no hang, no stack overflow) with the leaf pruned. The
    /// document-level fallback then recovers the orphaned `/Type /Page` leaf
    /// through a whole-document scan, so the document still opens with it.
    #[test]
    fn overly_deep_page_tree_is_pruned() {
        let chain = MAX_PAGE_TREE_DEPTH + 10;
        // Object 1 is the catalog, objects 2..chain+1 each point at the next
        // object number, and the final object is the page leaf.
        let objects: Vec<String> =
            std::iter::once(String::from("<< /Type /Catalog /Pages 2 0 R >>"))
                .chain(
                    (0..chain)
                        .map(|i| format!("<< /Type /Pages /Kids [{} 0 R] /Count 1 >>", i + 3)),
                )
                .chain(std::iter::once(String::from(
                    "<< /Type /Page /MediaBox [0 0 10 10] >>",
                )))
                .collect();
        let refs: Vec<&str> = objects.iter().map(String::as_str).collect();
        let doc = PdfDocument::open(build_pdf(&refs)).expect("fallback recovers the pruned leaf");
        assert_eq!(doc.page_count(), 1);
    }
}