1use std::borrow::Cow;
2use std::collections::HashSet;
3
4use tracing::warn;
5use zpdf_core::{Error, ObjectId, PdfObject, Result};
6use zpdf_parser::PdfFile;
7
8use crate::page::{PdfPage, MAX_PAGE_TREE_DEPTH};
9
10pub struct Catalog {
11 pub pages_ref: ObjectId,
12 pub page_count: usize,
13 page_refs: Vec<ObjectId>,
14}
15
16impl Catalog {
17 pub fn from_trailer(file: &PdfFile) -> Result<Self> {
18 let root_ref = file.trailer.get_ref("Root")?;
19 let root = file.resolve(root_ref)?;
20 if root.is_null() {
23 return Err(Error::InvalidObject(
24 0,
25 format!("/Root {root_ref} resolves to null"),
26 ));
27 }
28 let root_dict = root.as_dict()?;
29
30 let pages_ref = root_dict.get_ref("Pages")?;
31
32 let mut page_refs = Vec::new();
35 let mut visited = HashSet::new();
36 Self::collect_page_refs(file, pages_ref, &mut page_refs, &mut visited, 0)?;
37
38 if page_refs.is_empty() {
39 return Err(Error::InvalidObject(
40 0,
41 "page tree contains no usable pages".into(),
42 ));
43 }
44
45 Ok(Self {
46 pages_ref,
47 page_count: page_refs.len(),
48 page_refs,
49 })
50 }
51
52 fn collect_page_refs(
53 file: &PdfFile,
54 node_id: ObjectId,
55 refs: &mut Vec<ObjectId>,
56 visited: &mut HashSet<ObjectId>,
57 depth: usize,
58 ) -> Result<()> {
59 if depth > MAX_PAGE_TREE_DEPTH {
60 warn!("page tree deeper than {MAX_PAGE_TREE_DEPTH} at {node_id}; pruning subtree");
61 return Ok(());
62 }
63 if !visited.insert(node_id) {
64 warn!("page tree cycle: node {node_id} already visited; pruning");
65 return Ok(());
66 }
67
68 let node = match file.resolve(node_id) {
69 Ok(PdfObject::Null) => {
70 warn!("page tree node {node_id} resolves to null; skipping");
71 return Ok(());
72 }
73 Ok(obj) => obj,
74 Err(e) => {
75 warn!("failed to resolve page tree node {node_id}: {e}; skipping");
76 return Ok(());
77 }
78 };
79 let Ok(dict) = node.as_dict() else {
80 warn!(
81 "page tree node {node_id} is {}, expected Dict; skipping",
82 node.type_name()
83 );
84 return Ok(());
85 };
86
87 let is_pages = match dict.get_name("Type") {
90 Ok("Pages") => true,
91 Ok("Page") => false,
92 _ => dict.get("Kids").is_some(),
93 };
94
95 if is_pages {
96 let kids: Cow<'_, [PdfObject]> = match dict.get("Kids") {
98 Some(PdfObject::Array(a)) => Cow::Borrowed(a.as_slice()),
99 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
100 Ok(PdfObject::Array(a)) => Cow::Owned(a),
101 _ => {
102 warn!("pages node {node_id}: /Kids ref {r} is not an array; skipping");
103 return Ok(());
104 }
105 },
106 _ => {
107 warn!("pages node {node_id} has no /Kids array; skipping");
108 return Ok(());
109 }
110 };
111 for kid in kids.iter() {
112 match kid {
113 PdfObject::Ref(r) => {
114 Self::collect_page_refs(file, *r, refs, visited, depth + 1)?;
115 }
116 PdfObject::Null => {
117 warn!("pages node {node_id}: null kid; skipping");
118 }
119 other => {
120 warn!(
121 "pages node {node_id}: kid is {}, expected Ref; skipping",
122 other.type_name()
123 );
124 }
125 }
126 }
127 } else {
128 refs.push(node_id);
129 }
130 Ok(())
131 }
132
133 pub fn get_page(&self, file: &PdfFile, index: usize) -> Result<PdfPage> {
134 let page_ref =
135 self.page_refs.get(index).copied().ok_or_else(|| {
136 Error::InvalidObject(0, format!("page index {index} out of range"))
137 })?;
138
139 PdfPage::from_object(file, page_ref)
140 }
141}
142
#[cfg(test)]
mod tests {
    use crate::page::MAX_PAGE_TREE_DEPTH;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    /// A pages node listing itself among its kids must not loop forever.
    #[test]
    fn kids_cycle_is_pruned() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R 2 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
    }

    /// Kids that point nowhere or are literal nulls are dropped; the one
    /// real page is still reachable.
    #[test]
    fn dangling_and_null_kids_are_skipped() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [99 0 R 3 0 R null] /Count 3 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);
        assert!(doc.page(0).is_ok());
    }

    /// Nodes without /Type are classified by whether they carry /Kids.
    #[test]
    fn missing_type_nodes_tolerated() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Kids [3 0 R] /Count 1 /MediaBox [0 0 200 200] >>",
            "<< /Parent 2 0 R >>",
        ];
        let doc = PdfDocument::open(build_pdf(&objects)).expect("open");
        assert_eq!(doc.page_count(), 1);

        let page = doc.page(0).expect("page");
        assert_eq!(page.media_box.width(), 200.0);
    }

    /// A page tree with no leaves cannot be opened.
    #[test]
    fn empty_page_tree_is_an_error() {
        let objects = [
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [] /Count 0 >>",
        ];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// A root object that is literally `null` is rejected outright.
    #[test]
    fn null_root_is_a_hard_error() {
        let objects = ["null"];
        let outcome = PdfDocument::open(build_pdf(&objects));
        assert!(outcome.is_err());
    }

    /// A chain of pages nodes longer than the depth limit is cut off, which
    /// leaves no reachable page and therefore fails to open.
    #[test]
    fn overly_deep_page_tree_is_pruned() {
        let chain = MAX_PAGE_TREE_DEPTH + 10;

        let mut objects: Vec<String> = Vec::with_capacity(chain + 2);
        objects.push("<< /Type /Catalog /Pages 2 0 R >>".to_owned());
        // Object numbers start at 1 for the catalog, so link `i` (object
        // `i + 2`) points at object `i + 3`.
        objects.extend(
            (0..chain).map(|i| format!("<< /Type /Pages /Kids [{} 0 R] /Count 1 >>", i + 3)),
        );
        objects.push("<< /Type /Page /MediaBox [0 0 10 10] >>".to_owned());

        let refs: Vec<&str> = objects.iter().map(String::as_str).collect();
        let outcome = PdfDocument::open(build_pdf(&refs));
        assert!(outcome.is_err());
    }
}