1use std::borrow::Cow;
2use std::collections::HashSet;
3
4use tracing::warn;
5use zpdf_core::{Error, ObjectId, PdfObject, Result};
6use zpdf_parser::PdfFile;
7
8use crate::page::{PdfPage, MAX_PAGE_TREE_DEPTH};
9
10pub struct Catalog {
11 pub pages_ref: ObjectId,
12 pub page_count: usize,
13 page_refs: Vec<ObjectId>,
14}
15
16impl Catalog {
17 pub fn from_trailer(file: &PdfFile) -> Result<Self> {
18 let pages_ref = Self::resolve_pages_ref(file);
21 let mut page_refs = Vec::new();
22 let mut visited = HashSet::new();
23 if let Some(pages_ref) = pages_ref {
24 Self::collect_page_refs(file, pages_ref, &mut page_refs, &mut visited, 0)?;
25 }
26
27 if page_refs.is_empty() {
33 warn!("page tree unreachable via /Pages; scanning all objects for /Type /Page");
34 page_refs = file.find_objects_by_type("Page");
35 }
36
37 if page_refs.is_empty() {
44 warn!("no /Type /Page objects; scanning for page-shaped dicts");
45 page_refs = Self::scan_page_like(file);
46 }
47
48 if page_refs.is_empty() {
49 return Err(Error::InvalidObject(
50 0,
51 "page tree contains no usable pages".into(),
52 ));
53 }
54
55 Ok(Self {
56 pages_ref: pages_ref.unwrap_or(ObjectId(0, 0)),
57 page_count: page_refs.len(),
58 page_refs,
59 })
60 }
61
62 fn scan_page_like(file: &PdfFile) -> Vec<ObjectId> {
67 file.all_object_ids()
68 .into_iter()
69 .filter(|&id| {
70 let Ok(obj) = file.resolve(id) else {
71 return false;
72 };
73 let Ok(dict) = obj.as_dict() else {
74 return false;
75 };
76 dict.get("Kids").is_none()
77 && dict.get("Pages").is_none()
78 && (dict.get("MediaBox").is_some() || dict.get("Contents").is_some())
79 })
80 .collect()
81 }
82
83 fn resolve_pages_ref(file: &PdfFile) -> Option<ObjectId> {
87 let root_ref = file.trailer.get_ref("Root").ok()?;
88 let root = file.resolve(root_ref).ok()?;
89 root.as_dict().ok()?.get_ref("Pages").ok()
90 }
91
92 fn collect_page_refs(
93 file: &PdfFile,
94 node_id: ObjectId,
95 refs: &mut Vec<ObjectId>,
96 visited: &mut HashSet<ObjectId>,
97 depth: usize,
98 ) -> Result<()> {
99 if depth > MAX_PAGE_TREE_DEPTH {
100 warn!("page tree deeper than {MAX_PAGE_TREE_DEPTH} at {node_id}; pruning subtree");
101 return Ok(());
102 }
103 if !visited.insert(node_id) {
104 warn!("page tree cycle: node {node_id} already visited; pruning");
105 return Ok(());
106 }
107
108 let node = match file.resolve(node_id) {
109 Ok(PdfObject::Null) => {
110 warn!("page tree node {node_id} resolves to null; skipping");
111 return Ok(());
112 }
113 Ok(obj) => obj,
114 Err(e) => {
115 warn!("failed to resolve page tree node {node_id}: {e}; skipping");
116 return Ok(());
117 }
118 };
119 let Ok(dict) = node.as_dict() else {
120 warn!(
121 "page tree node {node_id} is {}, expected Dict; skipping",
122 node.type_name()
123 );
124 return Ok(());
125 };
126
127 let is_pages = match dict.get_name("Type") {
130 Ok("Pages") => true,
131 Ok("Page") => false,
132 _ => dict.get("Kids").is_some(),
133 };
134
135 if is_pages {
136 let kids: Cow<'_, [PdfObject]> = match dict.get("Kids") {
138 Some(PdfObject::Array(a)) => Cow::Borrowed(a.as_slice()),
139 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
140 Ok(PdfObject::Array(a)) => Cow::Owned(a),
141 _ => {
142 warn!("pages node {node_id}: /Kids ref {r} is not an array; skipping");
143 return Ok(());
144 }
145 },
146 _ => {
147 warn!("pages node {node_id} has no /Kids array; skipping");
148 return Ok(());
149 }
150 };
151 for kid in kids.iter() {
152 match kid {
153 PdfObject::Ref(r) => {
154 Self::collect_page_refs(file, *r, refs, visited, depth + 1)?;
155 }
156 PdfObject::Null => {
157 warn!("pages node {node_id}: null kid; skipping");
158 }
159 other => {
160 warn!(
161 "pages node {node_id}: kid is {}, expected Ref; skipping",
162 other.type_name()
163 );
164 }
165 }
166 }
167 } else {
168 refs.push(node_id);
169 }
170 Ok(())
171 }
172
173 pub fn get_page(&self, file: &PdfFile, index: usize) -> Result<PdfPage> {
174 let page_ref =
175 self.page_refs.get(index).copied().ok_or_else(|| {
176 Error::InvalidObject(0, format!("page index {index} out of range"))
177 })?;
178
179 PdfPage::from_object(file, page_ref)
180 }
181}
182
#[cfg(test)]
mod tests {
    use crate::page::MAX_PAGE_TREE_DEPTH;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    // NOTE(review): the fixtures below assume `build_pdf` numbers the given
    // object bodies 1, 2, 3, ... in order and points the trailer's /Root at
    // object 1 — confirm against `test_util`.

    /// A /Pages node that lists itself among its kids yields its one real
    /// page exactly once.
    #[test]
    fn kids_cycle_is_pruned() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R 2 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
    }

    /// A kid reference to an object that is not in the fixture and a literal
    /// `null` kid are both ignored; the remaining page stays usable even
    /// though /Count claims three.
    #[test]
    fn dangling_and_null_kids_are_skipped() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [99 0 R 3 0 R null] /Count 3 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
        let first_page = doc.page(0);
        assert!(first_page.is_ok());
    }

    /// Nodes without /Type are classified by shape: the one with /Kids is an
    /// intermediate node, the one without is a page. The only /MediaBox in
    /// the fixture sits on the parent node, and the page is expected to
    /// report its width.
    #[test]
    fn missing_type_nodes_tolerated() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Kids [3 0 R] /Count 1 /MediaBox [0 0 200 200] >>",
            "<< /Parent 2 0 R >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).expect("page");
        assert_eq!(page.media_box.width(), 200.0);
    }

    /// A page tree with an empty /Kids array and no page objects anywhere
    /// cannot be opened.
    #[test]
    fn empty_page_tree_is_an_error() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [] /Count 0 >>",
        ];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// A file whose only object is `null` cannot be opened.
    #[test]
    fn null_root_is_a_hard_error() {
        let objects = ["null"];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// A chain of single-kid /Pages nodes longer than `MAX_PAGE_TREE_DEPTH`
    /// still opens with its one leaf page.
    #[test]
    fn overly_deep_page_tree_is_pruned() {
        let chain = MAX_PAGE_TREE_DEPTH + 10;
        // Layout: catalog first, then `chain` /Pages nodes each pointing at
        // the `i + 3` slot (the next object under the numbering assumed
        // above), then the leaf page.
        let objects: Vec<String> =
            std::iter::once(String::from("<< /Type /Catalog /Pages 2 0 R >>"))
                .chain(
                    (0..chain)
                        .map(|i| format!("<< /Type /Pages /Kids [{} 0 R] /Count 1 >>", i + 3)),
                )
                .chain(std::iter::once(String::from(
                    "<< /Type /Page /MediaBox [0 0 10 10] >>",
                )))
                .collect();
        let refs: Vec<&str> = objects.iter().map(String::as_str).collect();
        let doc = PdfDocument::open(build_pdf(&refs)).expect("fallback recovers the pruned leaf");
        assert_eq!(doc.page_count(), 1);
    }
}
261}