Skip to main content

zpdf_document/
catalog.rs

1use std::borrow::Cow;
2use std::collections::HashSet;
3
4use tracing::warn;
5use zpdf_core::{Error, ObjectId, PdfObject, Result};
6use zpdf_parser::PdfFile;
7
8use crate::page::{PdfPage, MAX_PAGE_TREE_DEPTH};
9
10pub struct Catalog {
11    pub pages_ref: ObjectId,
12    pub page_count: usize,
13    page_refs: Vec<ObjectId>,
14}
15
16impl Catalog {
17    pub fn from_trailer(file: &PdfFile) -> Result<Self> {
18        // /Count is advisory only; the guarded kid walk determines the real
19        // page list (broken kids are skipped, cycles and over-deep chains pruned).
20        let pages_ref = Self::resolve_pages_ref(file);
21        let mut page_refs = Vec::new();
22        let mut visited = HashSet::new();
23        if let Some(pages_ref) = pages_ref {
24            Self::collect_page_refs(file, pages_ref, &mut page_refs, &mut visited, 0)?;
25        }
26
27        // Fallback: the /Root or /Pages tree was missing, null, or yielded no
28        // leaves — but the page objects often physically exist in the file (a
29        // broken xref, a catalog stranded in an /ObjStm, a /Root aimed at the
30        // wrong object, or a tree pruned by the cycle/depth guards). Mainstream
31        // readers degrade to a whole-document scan for /Type /Page; do the same.
32        if page_refs.is_empty() {
33            warn!("page tree unreachable via /Pages; scanning all objects for /Type /Page");
34            page_refs = file.find_objects_by_type("Page");
35        }
36
37        // Last resort: fuzzed files often byte-flip or drop the page's /Type
38        // (e.g. it parses as a Stream, or the type name is corrupted). Accept
39        // any "page-shaped" dict — carries /MediaBox or /Contents, is not a
40        // page-tree node (/Kids) or catalog (/Pages). Only reached when the
41        // document is already otherwise unopenable, so the loose heuristic
42        // cannot regress healthy files.
43        if page_refs.is_empty() {
44            warn!("no /Type /Page objects; scanning for page-shaped dicts");
45            page_refs = Self::scan_page_like(file);
46        }
47
48        if page_refs.is_empty() {
49            return Err(Error::InvalidObject(
50                0,
51                "page tree contains no usable pages".into(),
52            ));
53        }
54
55        Ok(Self {
56            pages_ref: pages_ref.unwrap_or(ObjectId(0, 0)),
57            page_count: page_refs.len(),
58            page_refs,
59        })
60    }
61
62    /// Whole-document scan for "page-shaped" dicts: a leaf carries `/MediaBox`
63    /// or `/Contents`, is not an interior page-tree node (`/Kids`) and not the
64    /// catalog (`/Pages`). Used only when `/Type /Page` matching already came up
65    /// empty, to recover pages whose `/Type` was corrupted or dropped.
66    fn scan_page_like(file: &PdfFile) -> Vec<ObjectId> {
67        file.all_object_ids()
68            .into_iter()
69            .filter(|&id| {
70                let Ok(obj) = file.resolve(id) else {
71                    return false;
72                };
73                let Ok(dict) = obj.as_dict() else {
74                    return false;
75                };
76                dict.get("Kids").is_none()
77                    && dict.get("Pages").is_none()
78                    && (dict.get("MediaBox").is_some() || dict.get("Contents").is_some())
79            })
80            .collect()
81    }
82
83    /// Resolve `/Root` → `/Pages`, tolerating an absent/null/non-dict Root or a
84    /// missing /Pages by returning `None` (the caller then falls back to a
85    /// whole-document page scan instead of failing the open).
86    fn resolve_pages_ref(file: &PdfFile) -> Option<ObjectId> {
87        let root_ref = file.trailer.get_ref("Root").ok()?;
88        let root = file.resolve(root_ref).ok()?;
89        root.as_dict().ok()?.get_ref("Pages").ok()
90    }
91
92    fn collect_page_refs(
93        file: &PdfFile,
94        node_id: ObjectId,
95        refs: &mut Vec<ObjectId>,
96        visited: &mut HashSet<ObjectId>,
97        depth: usize,
98    ) -> Result<()> {
99        if depth > MAX_PAGE_TREE_DEPTH {
100            warn!("page tree deeper than {MAX_PAGE_TREE_DEPTH} at {node_id}; pruning subtree");
101            return Ok(());
102        }
103        if !visited.insert(node_id) {
104            warn!("page tree cycle: node {node_id} already visited; pruning");
105            return Ok(());
106        }
107
108        let node = match file.resolve(node_id) {
109            Ok(PdfObject::Null) => {
110                warn!("page tree node {node_id} resolves to null; skipping");
111                return Ok(());
112            }
113            Ok(obj) => obj,
114            Err(e) => {
115                warn!("failed to resolve page tree node {node_id}: {e}; skipping");
116                return Ok(());
117            }
118        };
119        let Ok(dict) = node.as_dict() else {
120            warn!(
121                "page tree node {node_id} is {}, expected Dict; skipping",
122                node.type_name()
123            );
124            return Ok(());
125        };
126
127        // /Type is formally required but missing or wrong in real-world files;
128        // fall back on the presence of /Kids to tell interior nodes from leaves.
129        let is_pages = match dict.get_name("Type") {
130            Ok("Pages") => true,
131            Ok("Page") => false,
132            _ => dict.get("Kids").is_some(),
133        };
134
135        if is_pages {
136            // /Kids may itself be an indirect ref to the array.
137            let kids: Cow<'_, [PdfObject]> = match dict.get("Kids") {
138                Some(PdfObject::Array(a)) => Cow::Borrowed(a.as_slice()),
139                Some(PdfObject::Ref(r)) => match file.resolve(*r) {
140                    Ok(PdfObject::Array(a)) => Cow::Owned(a),
141                    _ => {
142                        warn!("pages node {node_id}: /Kids ref {r} is not an array; skipping");
143                        return Ok(());
144                    }
145                },
146                _ => {
147                    warn!("pages node {node_id} has no /Kids array; skipping");
148                    return Ok(());
149                }
150            };
151            for kid in kids.iter() {
152                match kid {
153                    PdfObject::Ref(r) => {
154                        Self::collect_page_refs(file, *r, refs, visited, depth + 1)?;
155                    }
156                    PdfObject::Null => {
157                        warn!("pages node {node_id}: null kid; skipping");
158                    }
159                    other => {
160                        warn!(
161                            "pages node {node_id}: kid is {}, expected Ref; skipping",
162                            other.type_name()
163                        );
164                    }
165                }
166            }
167        } else {
168            refs.push(node_id);
169        }
170        Ok(())
171    }
172
173    pub fn get_page(&self, file: &PdfFile, index: usize) -> Result<PdfPage> {
174        let page_ref =
175            self.page_refs.get(index).copied().ok_or_else(|| {
176                Error::InvalidObject(0, format!("page index {index} out of range"))
177            })?;
178
179        PdfPage::from_object(file, page_ref)
180    }
181}
182
183#[cfg(test)]
184mod tests {
185    use crate::page::MAX_PAGE_TREE_DEPTH;
186    use crate::test_util::build_pdf;
187    use crate::PdfDocument;
188
189    #[test]
190    fn kids_cycle_is_pruned() {
191        // The pages node lists itself as a kid; the walk must terminate.
192        let doc = PdfDocument::open(build_pdf(&[
193            "<< /Type /Catalog /Pages 2 0 R >>",
194            "<< /Type /Pages /Kids [3 0 R 2 0 R] /Count 1 >>",
195            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
196        ]))
197        .expect("open");
198        assert_eq!(doc.page_count(), 1);
199    }
200
201    #[test]
202    fn dangling_and_null_kids_are_skipped() {
203        // 99 0 R is dangling (skipped whether resolve errors or returns Null);
204        // the literal null kid is skipped outright.
205        let doc = PdfDocument::open(build_pdf(&[
206            "<< /Type /Catalog /Pages 2 0 R >>",
207            "<< /Type /Pages /Kids [99 0 R 3 0 R null] /Count 3 >>",
208            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
209        ]))
210        .expect("open");
211        assert_eq!(doc.page_count(), 1);
212        assert!(doc.page(0).is_ok());
213    }
214
215    #[test]
216    fn missing_type_nodes_tolerated() {
217        // Neither tree node carries /Type; /Kids presence tells interior from
218        // leaf, and inheritance still works through the untyped interior node.
219        let doc = PdfDocument::open(build_pdf(&[
220            "<< /Type /Catalog /Pages 2 0 R >>",
221            "<< /Kids [3 0 R] /Count 1 /MediaBox [0 0 200 200] >>",
222            "<< /Parent 2 0 R >>",
223        ]))
224        .expect("open");
225        assert_eq!(doc.page_count(), 1);
226        let page = doc.page(0).expect("page");
227        assert_eq!(page.media_box.width(), 200.0);
228    }
229
230    #[test]
231    fn empty_page_tree_is_an_error() {
232        assert!(PdfDocument::open(build_pdf(&[
233            "<< /Type /Catalog /Pages 2 0 R >>",
234            "<< /Type /Pages /Kids [] /Count 0 >>",
235        ]))
236        .is_err());
237    }
238
239    #[test]
240    fn null_root_is_a_hard_error() {
241        // Object 1 (the /Root target) is the literal null object.
242        assert!(PdfDocument::open(build_pdf(&["null"])).is_err());
243    }
244
245    #[test]
246    fn overly_deep_page_tree_is_pruned() {
247        // A single-kid Pages chain deeper than the guard: the kid walk must
248        // terminate (no hang/stack overflow) with the leaf pruned. The
249        // document-level fallback then recovers the orphaned /Type /Page leaf
250        // via a whole-document scan, so the document still opens with that page.
251        let mut objects: Vec<String> = vec!["<< /Type /Catalog /Pages 2 0 R >>".into()];
252        let chain = MAX_PAGE_TREE_DEPTH + 10;
253        for i in 0..chain {
254            objects.push(format!("<< /Type /Pages /Kids [{} 0 R] /Count 1 >>", i + 3));
255        }
256        objects.push("<< /Type /Page /MediaBox [0 0 10 10] >>".into());
257        let refs: Vec<&str> = objects.iter().map(|s| s.as_str()).collect();
258        let doc = PdfDocument::open(build_pdf(&refs)).expect("fallback recovers the pruned leaf");
259        assert_eq!(doc.page_count(), 1);
260    }
261}