Skip to main content

edgeparse_core/pdf/
page_info.rs

1//! Page geometry extraction — MediaBox, CropBox, Rotation.
2
3use lopdf::{Document, Object};
4
5use crate::models::bbox::BoundingBox;
6
7/// Physical page information extracted from the PDF.
8#[derive(Debug, Clone)]
9pub struct PageInfo {
10    /// Zero-based page index.
11    pub index: usize,
12    /// One-based page number.
13    pub page_number: u32,
14    /// MediaBox — the full physical page size.
15    pub media_box: BoundingBox,
16    /// CropBox — the visible area (defaults to MediaBox if absent).
17    pub crop_box: BoundingBox,
18    /// Page rotation in degrees (0, 90, 180, 270).
19    pub rotation: i64,
20    /// Page width in points.
21    pub width: f64,
22    /// Page height in points.
23    pub height: f64,
24}
25
26/// Extract page info for all pages in the document.
27pub fn extract_page_info(doc: &Document) -> Vec<PageInfo> {
28    let pages = doc.get_pages();
29    let mut infos = Vec::with_capacity(pages.len());
30
31    let mut sorted_pages: Vec<_> = pages.into_iter().collect();
32    sorted_pages.sort_by_key(|&(num, _)| num);
33
34    for (idx, (page_num, page_id)) in sorted_pages.into_iter().enumerate() {
35        let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict().cloned()) {
36            Ok(d) => d,
37            Err(_) => continue,
38        };
39
40        let media_box = extract_rect(doc, &page_dict, b"MediaBox")
41            .unwrap_or_else(|| BoundingBox::new(None, 0.0, 0.0, 612.0, 792.0)); // US Letter default
42
43        let crop_box =
44            extract_rect(doc, &page_dict, b"CropBox").unwrap_or_else(|| media_box.clone());
45
46        let rotation = page_dict
47            .get(b"Rotate")
48            .ok()
49            .and_then(|o| resolve_integer(doc, o))
50            .unwrap_or(0);
51
52        let width = media_box.width();
53        let height = media_box.height();
54
55        infos.push(PageInfo {
56            index: idx,
57            page_number: page_num,
58            media_box,
59            crop_box,
60            rotation,
61            width,
62            height,
63        });
64    }
65
66    infos
67}
68
69/// Extract a rectangle array [llx, lly, urx, ury] from a page dictionary,
70/// walking up the page tree if not found on the page itself.
71fn extract_rect(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<BoundingBox> {
72    if let Ok(obj) = dict.get(key) {
73        if let Some(bbox) = parse_rect_array(doc, obj) {
74            return Some(bbox);
75        }
76    }
77    // Try parent
78    if let Ok(parent_ref) = dict.get(b"Parent") {
79        if let Ok((_, parent_obj)) = doc.dereference(parent_ref) {
80            if let Ok(parent_dict) = parent_obj.as_dict() {
81                return extract_rect(doc, parent_dict, key);
82            }
83        }
84    }
85    None
86}
87
88fn parse_rect_array(doc: &Document, obj: &Object) -> Option<BoundingBox> {
89    let arr = match obj {
90        Object::Array(a) => a.clone(),
91        Object::Reference(id) => doc
92            .get_object(*id)
93            .ok()
94            .and_then(|o| o.as_array().ok().cloned())?,
95        _ => return None,
96    };
97
98    if arr.len() < 4 {
99        return None;
100    }
101
102    let vals: Vec<f64> = arr.iter().filter_map(|o| resolve_number(doc, o)).collect();
103
104    if vals.len() < 4 {
105        return None;
106    }
107
108    Some(BoundingBox::new(None, vals[0], vals[1], vals[2], vals[3]))
109}
110
111fn resolve_number(doc: &Document, obj: &Object) -> Option<f64> {
112    match obj {
113        Object::Real(f) => Some(*f),
114        Object::Integer(i) => Some(*i as f64),
115        Object::Reference(id) => doc
116            .get_object(*id)
117            .ok()
118            .and_then(|o| resolve_number(doc, o)),
119        _ => None,
120    }
121}
122
123fn resolve_integer(doc: &Document, obj: &Object) -> Option<i64> {
124    match obj {
125        Object::Integer(i) => Some(*i),
126        Object::Reference(id) => doc
127            .get_object(*id)
128            .ok()
129            .and_then(|o| resolve_integer(doc, o)),
130        _ => None,
131    }
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `PageInfo` for the first page (index 0, number 1) whose
    /// MediaBox and CropBox are both created from `(0, 0, w, h)`.
    fn first_page(w: f64, h: f64, rotation: i64) -> PageInfo {
        let full_page = BoundingBox::new(None, 0.0, 0.0, w, h);
        PageInfo {
            index: 0,
            page_number: 1,
            media_box: full_page.clone(),
            crop_box: full_page,
            rotation,
            width: w,
            height: h,
        }
    }

    /// A freshly created document has no pages, so nothing is extracted.
    #[test]
    fn test_empty_document() {
        let empty = Document::new();
        assert!(extract_page_info(&empty).is_empty());
    }

    /// An unrotated US Letter page keeps the values it was built with.
    #[test]
    fn test_page_info_defaults() {
        let letter = first_page(612.0, 792.0, 0);
        assert_eq!(letter.width, 612.0);
        assert_eq!(letter.height, 792.0);
        assert_eq!(letter.rotation, 0);
    }

    /// The rotation field stores the value it was given.
    #[test]
    fn test_rotated_page() {
        let a4 = first_page(595.0, 842.0, 90);
        assert_eq!(a4.rotation, 90);
    }
}