Skip to main content

justpdf_core/page/
mod.rs

1use crate::error::{JustPdfError, Result};
2use crate::object::{IndirectRef, PdfDict, PdfObject};
3use crate::parser::PdfDocument;
4
/// A rectangle defined by [llx, lly, urx, ury].
///
/// The four fields mirror the four entries of a PDF rectangle array in
/// order. Values are stored exactly as given: nothing in this type enforces
/// `llx <= urx` or `lly <= ury`, which is why `width()` and `height()` take
/// an absolute value.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Rect {
    /// First array entry (nominally the lower-left x coordinate).
    pub llx: f64,
    /// Second array entry (nominally the lower-left y coordinate).
    pub lly: f64,
    /// Third array entry (nominally the upper-right x coordinate).
    pub urx: f64,
    /// Fourth array entry (nominally the upper-right y coordinate).
    pub ury: f64,
}
13
14impl Rect {
15    pub fn width(&self) -> f64 {
16        (self.urx - self.llx).abs()
17    }
18
19    pub fn height(&self) -> f64 {
20        (self.ury - self.lly).abs()
21    }
22
23    /// Parse a Rect from a PDF array [llx, lly, urx, ury].
24    pub fn from_pdf_array(arr: &[PdfObject]) -> Option<Self> {
25        if arr.len() < 4 {
26            return None;
27        }
28        Some(Self {
29            llx: arr[0].as_f64()?,
30            lly: arr[1].as_f64()?,
31            urx: arr[2].as_f64()?,
32            ury: arr[3].as_f64()?,
33        })
34    }
35}
36
37impl std::fmt::Display for Rect {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        write!(f, "[{} {} {} {}]", self.llx, self.lly, self.urx, self.ury)
40    }
41}
42
/// Information about a single PDF page.
///
/// Instances are built by the page-tree walkers in this module, which merge
/// the page dictionary's own entries with the `MediaBox`, `CropBox`,
/// `Rotate` and `Resources` values carried down from ancestor `Pages` nodes.
#[derive(Debug, Clone)]
pub struct PageInfo {
    /// 0-based page index.
    pub index: usize,
    /// The indirect reference to this page object.
    pub page_ref: IndirectRef,
    /// MediaBox (required, possibly inherited). The walkers substitute
    /// `[0 0 612 792]` when neither the page nor an ancestor supplies a
    /// usable one.
    pub media_box: Rect,
    /// CropBox (optional, possibly inherited). `None` when no usable CropBox
    /// was found; the MediaBox fallback is NOT applied here, so callers that
    /// need the effective crop box should fall back to `media_box`.
    pub crop_box: Option<Rect>,
    /// BleedBox (optional). Read from the page's own dictionary only.
    pub bleed_box: Option<Rect>,
    /// TrimBox (optional). Read from the page's own dictionary only.
    pub trim_box: Option<Rect>,
    /// ArtBox (optional). Read from the page's own dictionary only.
    pub art_box: Option<Rect>,
    /// Page rotation in degrees (nominally 0, 90, 180 or 270). The `/Rotate`
    /// value, possibly inherited, is stored as found and is not normalized
    /// or validated by the walkers; 0 when absent.
    pub rotate: i64,
    /// Clone of the page dictionary's raw `/Contents` entry (stream
    /// reference or array of streams), or `None` if the key is absent.
    pub contents_ref: Option<PdfObject>,
    /// Clone of the `/Resources` entry from the page or, failing that, the
    /// value inherited from an ancestor `Pages` node.
    pub resources_ref: Option<PdfObject>,
}
67
68/// Walk the page tree and collect all pages in order.
69pub fn collect_pages(doc: &PdfDocument) -> Result<Vec<PageInfo>> {
70    let catalog_ref = doc
71        .catalog_ref()
72        .ok_or(JustPdfError::TrailerNotFound)?
73        .clone();
74
75    let catalog = doc.resolve(&catalog_ref)?;
76    let catalog_dict = catalog.as_dict().ok_or(JustPdfError::InvalidObject {
77        offset: 0,
78        detail: "catalog is not a dict".into(),
79    })?;
80
81    let pages_ref = catalog_dict
82        .get_ref(b"Pages")
83        .ok_or(JustPdfError::InvalidObject {
84            offset: 0,
85            detail: "catalog has no /Pages".into(),
86        })?
87        .clone();
88
89    let mut pages = Vec::new();
90    let inherited = InheritedAttrs::default();
91    walk_page_tree(doc, &pages_ref, &inherited, &mut pages)?;
92    Ok(pages)
93}
94
95/// Get the total page count from the Pages dict /Count.
96pub fn page_count(doc: &PdfDocument) -> Result<usize> {
97    let catalog_ref = doc
98        .catalog_ref()
99        .ok_or(JustPdfError::TrailerNotFound)?
100        .clone();
101
102    let catalog = doc.resolve(&catalog_ref)?;
103    let catalog_dict = catalog.as_dict().ok_or(JustPdfError::InvalidObject {
104        offset: 0,
105        detail: "catalog is not a dict".into(),
106    })?;
107
108    let pages_ref = catalog_dict
109        .get_ref(b"Pages")
110        .ok_or(JustPdfError::InvalidObject {
111            offset: 0,
112            detail: "catalog has no /Pages".into(),
113        })?
114        .clone();
115
116    let pages_obj = doc.resolve(&pages_ref)?;
117    let pages_dict = pages_obj.as_dict().ok_or(JustPdfError::InvalidObject {
118        offset: 0,
119        detail: "Pages is not a dict".into(),
120    })?;
121
122    Ok(pages_dict.get_i64(b"Count").unwrap_or(0) as usize)
123}
124
125/// Get a single page by 0-based index without collecting all pages.
126///
127/// This walks the page tree, counting pages as it goes, and returns the
128/// `PageInfo` for the requested page as soon as it is found.  For documents
129/// with many pages this avoids allocating and resolving every page object when
130/// only a single page is needed.
131pub fn get_page(doc: &PdfDocument, index: usize) -> Result<PageInfo> {
132    let catalog_ref = doc
133        .catalog_ref()
134        .ok_or(JustPdfError::TrailerNotFound)?
135        .clone();
136
137    let catalog = doc.resolve(&catalog_ref)?;
138    let catalog_dict = catalog.as_dict().ok_or(JustPdfError::InvalidObject {
139        offset: 0,
140        detail: "catalog is not a dict".into(),
141    })?;
142
143    let pages_ref = catalog_dict
144        .get_ref(b"Pages")
145        .ok_or(JustPdfError::InvalidObject {
146            offset: 0,
147            detail: "catalog has no /Pages".into(),
148        })?
149        .clone();
150
151    // Optional: fast-path bounds check via /Count.
152    let pages_obj = doc.resolve(&pages_ref)?;
153    let pages_dict = pages_obj.as_dict().ok_or(JustPdfError::InvalidObject {
154        offset: 0,
155        detail: "Pages is not a dict".into(),
156    })?;
157    let count = pages_dict.get_i64(b"Count").unwrap_or(0) as usize;
158    if index >= count {
159        return Err(JustPdfError::InvalidObject {
160            offset: 0,
161            detail: format!(
162                "page index {index} out of range (document has {count} pages)"
163            ),
164        });
165    }
166
167    let inherited = InheritedAttrs::default();
168    let mut counter: usize = 0;
169    walk_page_tree_find(doc, &pages_ref, &inherited, index, &mut counter)
170        .and_then(|opt| {
171            opt.ok_or(JustPdfError::InvalidObject {
172                offset: 0,
173                detail: format!("page index {index} not found in page tree"),
174            })
175        })
176}
177
178/// Recursively walk the page tree looking for the page at `target` index.
179/// `counter` tracks how many leaf pages have been seen so far.
180/// Returns `Ok(Some(page))` as soon as the target page is found, or
181/// `Ok(None)` after exhausting the subtree without finding it.
182fn walk_page_tree_find(
183    doc: &PdfDocument,
184    node_ref: &IndirectRef,
185    inherited: &InheritedAttrs,
186    target: usize,
187    counter: &mut usize,
188) -> Result<Option<PageInfo>> {
189    let node_obj = doc.resolve(node_ref)?;
190    let dict = node_obj.as_dict().ok_or(JustPdfError::InvalidObject {
191        offset: 0,
192        detail: "page tree node is not a dict".into(),
193    })?;
194
195    let node_type = dict.get_name(b"Type").unwrap_or(b"");
196
197    match node_type {
198        b"Pages" => {
199            // Pruning: if this subtree's /Count means the target lies beyond
200            // it, skip the entire subtree.
201            let subtree_count = dict.get_i64(b"Count").unwrap_or(0) as usize;
202            if *counter + subtree_count <= target {
203                *counter += subtree_count;
204                return Ok(None);
205            }
206
207            let updated = inherited.with_overrides(dict);
208            if let Some(kids) = dict.get_array(b"Kids") {
209                let kid_refs: Vec<IndirectRef> = kids
210                    .iter()
211                    .filter_map(|obj| obj.as_reference().cloned())
212                    .collect();
213
214                for kid_ref in kid_refs {
215                    if let Some(page) =
216                        walk_page_tree_find(doc, &kid_ref, &updated, target, counter)?
217                    {
218                        return Ok(Some(page));
219                    }
220                }
221            }
222            Ok(None)
223        }
224        _ if node_type == b"Page"
225            || dict.contains_key(b"MediaBox")
226            || inherited.media_box.is_some() =>
227        {
228            let current_index = *counter;
229            *counter += 1;
230
231            if current_index != target {
232                return Ok(None);
233            }
234
235            let updated = inherited.with_overrides(dict);
236
237            let media_box = updated.media_box.unwrap_or(Rect {
238                llx: 0.0,
239                lly: 0.0,
240                urx: 612.0,
241                ury: 792.0,
242            });
243
244            Ok(Some(PageInfo {
245                index: current_index,
246                page_ref: node_ref.clone(),
247                media_box,
248                crop_box: updated
249                    .crop_box
250                    .or_else(|| dict.get_array(b"CropBox").and_then(Rect::from_pdf_array)),
251                bleed_box: dict.get_array(b"BleedBox").and_then(Rect::from_pdf_array),
252                trim_box: dict.get_array(b"TrimBox").and_then(Rect::from_pdf_array),
253                art_box: dict.get_array(b"ArtBox").and_then(Rect::from_pdf_array),
254                rotate: updated.rotate.unwrap_or(0),
255                contents_ref: dict.get(b"Contents").cloned(),
256                resources_ref: updated
257                    .resources
258                    .or_else(|| dict.get(b"Resources").cloned()),
259            }))
260        }
261        _ => Ok(None),
262    }
263}
264
/// Attributes that can be inherited from parent Pages nodes.
///
/// Every field starts as `None` (via `Default`) and is filled in or replaced
/// by `with_overrides` as the walkers descend through the page tree.
#[derive(Debug, Clone, Default)]
struct InheritedAttrs {
    /// Latest usable `/MediaBox` seen on the path from the root.
    media_box: Option<Rect>,
    /// Latest usable `/CropBox` seen on the path from the root.
    crop_box: Option<Rect>,
    /// Latest `/Rotate` value seen on the path from the root.
    rotate: Option<i64>,
    /// Clone of the latest `/Resources` entry seen on the path from the root.
    resources: Option<PdfObject>,
}
273
274impl InheritedAttrs {
275    /// Create a child copy with overrides from a Pages/Page dict.
276    fn with_overrides(&self, dict: &PdfDict) -> Self {
277        let mut child = self.clone();
278
279        if let Some(arr) = dict.get_array(b"MediaBox")
280            && let Some(rect) = Rect::from_pdf_array(arr)
281        {
282            child.media_box = Some(rect);
283        }
284        if let Some(arr) = dict.get_array(b"CropBox")
285            && let Some(rect) = Rect::from_pdf_array(arr)
286        {
287            child.crop_box = Some(rect);
288        }
289        if let Some(r) = dict.get_i64(b"Rotate") {
290            child.rotate = Some(r);
291        }
292        if dict.get(b"Resources").is_some() {
293            child.resources = dict.get(b"Resources").cloned();
294        }
295
296        child
297    }
298}
299
300/// Recursively walk the page tree.
301fn walk_page_tree(
302    doc: &PdfDocument,
303    node_ref: &IndirectRef,
304    inherited: &InheritedAttrs,
305    pages: &mut Vec<PageInfo>,
306) -> Result<()> {
307    let node_obj = doc.resolve(node_ref)?;
308    let dict = node_obj.as_dict().ok_or(JustPdfError::InvalidObject {
309        offset: 0,
310        detail: "page tree node is not a dict".into(),
311    })?;
312
313    let node_type = dict.get_name(b"Type").unwrap_or(b"");
314
315    match node_type {
316        b"Pages" => {
317            let updated = inherited.with_overrides(dict);
318            if let Some(kids) = dict.get_array(b"Kids") {
319                let kid_refs: Vec<IndirectRef> = kids
320                    .iter()
321                    .filter_map(|obj| obj.as_reference().cloned())
322                    .collect();
323
324                for kid_ref in kid_refs {
325                    walk_page_tree(doc, &kid_ref, &updated, pages)?;
326                }
327            }
328        }
329        _ if node_type == b"Page"
330            || dict.contains_key(b"MediaBox")
331            || inherited.media_box.is_some() =>
332        {
333            let updated = inherited.with_overrides(dict);
334
335            let media_box = updated.media_box.unwrap_or(Rect {
336                llx: 0.0,
337                lly: 0.0,
338                urx: 612.0,
339                ury: 792.0,
340            });
341
342            let page_info = PageInfo {
343                index: pages.len(),
344                page_ref: node_ref.clone(),
345                media_box,
346                crop_box: updated
347                    .crop_box
348                    .or_else(|| dict.get_array(b"CropBox").and_then(Rect::from_pdf_array)),
349                bleed_box: dict.get_array(b"BleedBox").and_then(Rect::from_pdf_array),
350                trim_box: dict.get_array(b"TrimBox").and_then(Rect::from_pdf_array),
351                art_box: dict.get_array(b"ArtBox").and_then(Rect::from_pdf_array),
352                rotate: updated.rotate.unwrap_or(0),
353                contents_ref: dict.get(b"Contents").cloned(),
354                resources_ref: updated
355                    .resources
356                    .or_else(|| dict.get(b"Resources").cloned()),
357            };
358
359            pages.push(page_info);
360        }
361        _ => {
362            // Unknown type, skip
363        }
364    }
365
366    Ok(())
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    /// Integer entries are accepted and surface as `f64` coordinates.
    #[test]
    fn test_rect_from_array() {
        let entries = [0, 0, 612, 792].map(PdfObject::Integer);
        let parsed = Rect::from_pdf_array(&entries).unwrap();
        assert_eq!(parsed.llx, 0.0);
        assert_eq!(parsed.ury, 792.0);
        assert_eq!(parsed.width(), 612.0);
        assert_eq!(parsed.height(), 792.0);
    }

    /// Real entries are accepted and the width reflects a non-zero origin.
    #[test]
    fn test_rect_from_real_array() {
        let entries = [10.5, 20.5, 595.0, 842.0].map(PdfObject::Real);
        let parsed = Rect::from_pdf_array(&entries).unwrap();
        assert_eq!(parsed.llx, 10.5);
        assert_eq!(parsed.width(), 584.5);
    }

    /// Fewer than four entries cannot form a rectangle.
    #[test]
    fn test_rect_too_short() {
        let entries = [0, 0].map(PdfObject::Integer);
        assert!(Rect::from_pdf_array(&entries).is_none());
    }

    /// `Display` renders the four coordinates space-separated in brackets.
    #[test]
    fn test_rect_display() {
        let letter = Rect {
            llx: 0.0,
            lly: 0.0,
            urx: 612.0,
            ury: 792.0,
        };
        let rendered = letter.to_string();
        assert_eq!(rendered, "[0 0 612 792]");
    }
}