Skip to main content

zpdf_document/
catalog.rs

1use std::borrow::Cow;
2use std::collections::HashSet;
3
4use tracing::warn;
5use zpdf_core::{Error, ObjectId, PdfObject, Result};
6use zpdf_parser::PdfFile;
7
8use crate::page::{PdfPage, MAX_PAGE_TREE_DEPTH};
9
10pub struct Catalog {
11    pub pages_ref: ObjectId,
12    pub page_count: usize,
13    page_refs: Vec<ObjectId>,
14}
15
16impl Catalog {
17    pub fn from_trailer(file: &PdfFile) -> Result<Self> {
18        let root_ref = file.trailer.get_ref("Root")?;
19        let root = file.resolve(root_ref)?;
20        // A lenient resolver may report a dangling /Root as Null instead of an
21        // error; either way a document without a catalog is unusable.
22        if root.is_null() {
23            return Err(Error::InvalidObject(
24                0,
25                format!("/Root {root_ref} resolves to null"),
26            ));
27        }
28        let root_dict = root.as_dict()?;
29
30        let pages_ref = root_dict.get_ref("Pages")?;
31
32        // /Count is advisory only; the guarded kid walk determines the real
33        // page list (broken kids are skipped, cycles and over-deep chains pruned).
34        let mut page_refs = Vec::new();
35        let mut visited = HashSet::new();
36        Self::collect_page_refs(file, pages_ref, &mut page_refs, &mut visited, 0)?;
37
38        if page_refs.is_empty() {
39            return Err(Error::InvalidObject(
40                0,
41                "page tree contains no usable pages".into(),
42            ));
43        }
44
45        Ok(Self {
46            pages_ref,
47            page_count: page_refs.len(),
48            page_refs,
49        })
50    }
51
52    fn collect_page_refs(
53        file: &PdfFile,
54        node_id: ObjectId,
55        refs: &mut Vec<ObjectId>,
56        visited: &mut HashSet<ObjectId>,
57        depth: usize,
58    ) -> Result<()> {
59        if depth > MAX_PAGE_TREE_DEPTH {
60            warn!("page tree deeper than {MAX_PAGE_TREE_DEPTH} at {node_id}; pruning subtree");
61            return Ok(());
62        }
63        if !visited.insert(node_id) {
64            warn!("page tree cycle: node {node_id} already visited; pruning");
65            return Ok(());
66        }
67
68        let node = match file.resolve(node_id) {
69            Ok(PdfObject::Null) => {
70                warn!("page tree node {node_id} resolves to null; skipping");
71                return Ok(());
72            }
73            Ok(obj) => obj,
74            Err(e) => {
75                warn!("failed to resolve page tree node {node_id}: {e}; skipping");
76                return Ok(());
77            }
78        };
79        let Ok(dict) = node.as_dict() else {
80            warn!(
81                "page tree node {node_id} is {}, expected Dict; skipping",
82                node.type_name()
83            );
84            return Ok(());
85        };
86
87        // /Type is formally required but missing or wrong in real-world files;
88        // fall back on the presence of /Kids to tell interior nodes from leaves.
89        let is_pages = match dict.get_name("Type") {
90            Ok("Pages") => true,
91            Ok("Page") => false,
92            _ => dict.get("Kids").is_some(),
93        };
94
95        if is_pages {
96            // /Kids may itself be an indirect ref to the array.
97            let kids: Cow<'_, [PdfObject]> = match dict.get("Kids") {
98                Some(PdfObject::Array(a)) => Cow::Borrowed(a.as_slice()),
99                Some(PdfObject::Ref(r)) => match file.resolve(*r) {
100                    Ok(PdfObject::Array(a)) => Cow::Owned(a),
101                    _ => {
102                        warn!("pages node {node_id}: /Kids ref {r} is not an array; skipping");
103                        return Ok(());
104                    }
105                },
106                _ => {
107                    warn!("pages node {node_id} has no /Kids array; skipping");
108                    return Ok(());
109                }
110            };
111            for kid in kids.iter() {
112                match kid {
113                    PdfObject::Ref(r) => {
114                        Self::collect_page_refs(file, *r, refs, visited, depth + 1)?;
115                    }
116                    PdfObject::Null => {
117                        warn!("pages node {node_id}: null kid; skipping");
118                    }
119                    other => {
120                        warn!(
121                            "pages node {node_id}: kid is {}, expected Ref; skipping",
122                            other.type_name()
123                        );
124                    }
125                }
126            }
127        } else {
128            refs.push(node_id);
129        }
130        Ok(())
131    }
132
133    pub fn get_page(&self, file: &PdfFile, index: usize) -> Result<PdfPage> {
134        let page_ref =
135            self.page_refs.get(index).copied().ok_or_else(|| {
136                Error::InvalidObject(0, format!("page index {index} out of range"))
137            })?;
138
139        PdfPage::from_object(file, page_ref)
140    }
141}
142
#[cfg(test)]
mod tests {
    use crate::page::MAX_PAGE_TREE_DEPTH;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    /// A pages node that names itself among its own kids must not loop.
    #[test]
    fn kids_cycle_is_pruned() {
        // /Kids of object 2 contains `2 0 R`, i.e. a self-reference.
        let pdf = build_pdf(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R 2 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ]);
        let doc = PdfDocument::open(pdf).expect("open");
        assert_eq!(doc.page_count(), 1);
    }

    /// Unusable kids are dropped while the good page next to them survives.
    #[test]
    fn dangling_and_null_kids_are_skipped() {
        // `99 0 R` points at nothing (it is skipped both when resolving
        // errors and when it yields Null); the trailing `null` is a literal
        // null kid.
        let pdf = build_pdf(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [99 0 R 3 0 R null] /Count 3 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ]);
        let doc = PdfDocument::open(pdf).expect("open");
        assert_eq!(doc.page_count(), 1);
        assert!(doc.page(0).is_ok());
    }

    /// Tree nodes without /Type are classified by whether they have /Kids.
    #[test]
    fn missing_type_nodes_tolerated() {
        // Object 2 (has /Kids) acts as the interior node, object 3 as the
        // leaf; the leaf has no /MediaBox of its own, so the 200-wide box
        // must come from the untyped parent.
        let pdf = build_pdf(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Kids [3 0 R] /Count 1 /MediaBox [0 0 200 200] >>",
            "<< /Parent 2 0 R >>",
        ]);
        let doc = PdfDocument::open(pdf).expect("open");
        assert_eq!(doc.page_count(), 1);
        let page = doc.page(0).expect("page");
        assert_eq!(page.media_box.width(), 200.0);
    }

    /// A page tree with an empty /Kids array cannot be opened.
    #[test]
    fn empty_page_tree_is_an_error() {
        let pdf = build_pdf(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [] /Count 0 >>",
        ]);
        let opened = PdfDocument::open(pdf);
        assert!(opened.is_err());
    }

    /// A /Root that is the literal null object cannot be opened.
    #[test]
    fn null_root_is_a_hard_error() {
        // The only object in the file, object 1, is what /Root points at.
        let pdf = build_pdf(&["null"]);
        let opened = PdfDocument::open(pdf);
        assert!(opened.is_err());
    }

    /// A chain of single-kid pages nodes longer than the depth guard.
    #[test]
    fn overly_deep_page_tree_is_pruned() {
        // The lone leaf sits below the guard, so it is pruned together with
        // the tail of the chain; opening terminates and reports an error
        // because no page is left.
        let interior_nodes = MAX_PAGE_TREE_DEPTH + 10;

        let mut objects: Vec<String> = Vec::with_capacity(interior_nodes + 2);
        objects.push(String::from("<< /Type /Catalog /Pages 2 0 R >>"));
        // Interior node number `i` is object `i + 2`; its only kid is the
        // next object in the file, `i + 3`.
        objects.extend(
            (0..interior_nodes)
                .map(|i| format!("<< /Type /Pages /Kids [{} 0 R] /Count 1 >>", i + 3)),
        );
        objects.push(String::from("<< /Type /Page /MediaBox [0 0 10 10] >>"));

        let object_strs: Vec<&str> = objects.iter().map(String::as_str).collect();
        let opened = PdfDocument::open(build_pdf(&object_strs));
        assert!(opened.is_err());
    }
}