oxidize_pdf/parser/
page_tree.rs

1//! PDF Page Tree Parser
2//!
3//! This module handles navigation and extraction of pages from the PDF page tree structure.
4//! The page tree is a hierarchical structure that organizes pages in a PDF document,
5//! allowing for efficient access and inheritance of properties from parent nodes.
6//!
7//! # Overview
8//!
9//! The PDF page tree consists of:
10//! - **Page Tree Nodes**: Internal nodes that can contain other nodes or pages
11//! - **Page Objects**: Leaf nodes representing individual pages
12//! - **Inherited Properties**: Resources, MediaBox, CropBox, and Rotate can be inherited from parent nodes
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
18//!
19//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
20//! // Open a PDF document
21//! let reader = PdfReader::open("document.pdf")?;
22//! let document = PdfDocument::new(reader);
23//!
24//! // Get a specific page
25//! let page = document.get_page(0)?;
26//!
27//! // Access page properties
28//! println!("Page size: {}x{} points", page.width(), page.height());
29//! println!("Rotation: {}°", page.rotation);
30//!
31//! // Get page resources
32//! if let Some(resources) = page.get_resources() {
33//!     println!("Page has resources");
34//! }
35//! # Ok(())
36//! # }
37//! ```
38
39use super::document::PdfDocument;
40use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfStream};
41use super::reader::PdfReader;
42use super::{ParseError, ParseResult};
43use std::collections::{HashMap, HashSet};
44use std::io::{Read, Seek};
45
46/// Represents a single page in the PDF with all its properties and resources.
47///
48/// A `ParsedPage` contains all the information needed to render or analyze a PDF page,
49/// including its dimensions, content streams, resources, and inherited properties from
50/// parent page tree nodes.
51///
52/// # Fields
53///
54/// * `obj_ref` - Object reference (object number, generation number) pointing to this page in the PDF
55/// * `dict` - Complete page dictionary containing all page-specific entries
56/// * `inherited_resources` - Resources inherited from parent page tree nodes
57/// * `media_box` - Page dimensions in PDF units [llx, lly, urx, ury]
58/// * `crop_box` - Optional visible area of the page
59/// * `rotation` - Page rotation in degrees (0, 90, 180, or 270)
60///
61/// # Example
62///
63/// ```rust,no_run
64/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
65///
66/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
67/// let reader = PdfReader::open("document.pdf")?;
68/// let document = PdfDocument::new(reader);
69/// let page = document.get_page(0)?;
70///
71/// // Access page properties
72/// let (obj_num, gen_num) = page.obj_ref;
73/// println!("Page object: {} {} R", obj_num, gen_num);
74///
75/// // Get page dimensions
76/// let [llx, lly, urx, ury] = page.media_box;
77/// println!("MediaBox: ({}, {}) to ({}, {})", llx, lly, urx, ury);
78///
79/// // Check for content
80/// if let Some(contents) = page.dict.get("Contents") {
81///     println!("Page has content streams");
82/// }
83/// # Ok(())
84/// # }
85/// ```
86#[derive(Debug, Clone)]
87pub struct ParsedPage {
88    /// Object reference to this page in the form (object_number, generation_number).
89    /// This uniquely identifies the page object in the PDF file.
90    pub obj_ref: (u32, u16),
91
92    /// Page dictionary containing all page-specific entries like Contents, Resources, etc.
93    /// This is the raw PDF dictionary for the page object.
94    pub dict: PdfDictionary,
95
96    /// Resources inherited from parent page tree nodes.
97    /// These are automatically merged during page tree traversal.
98    pub inherited_resources: Option<PdfDictionary>,
99
100    /// MediaBox defining the page dimensions in PDF units (typically points).
101    /// Format: [lower_left_x, lower_left_y, upper_right_x, upper_right_y]
102    pub media_box: [f64; 4],
103
104    /// CropBox defining the visible area of the page.
105    /// If None, the entire MediaBox is visible.
106    pub crop_box: Option<[f64; 4]>,
107
108    /// Page rotation in degrees. Valid values are 0, 90, 180, or 270.
109    /// The rotation is applied clockwise.
110    pub rotation: i32,
111
112    /// Annotations array containing references to annotation objects.
113    /// This is parsed from the page's /Annots entry.
114    pub annotations: Option<PdfArray>,
115}
116
/// Upper bound on the number of leaf pages collected into a flat page index.
/// Guards against running out of memory when a malicious file declares an
/// absurd `/Count` (e.g., 9,999,999,999).
const MAX_PAGES: usize = 100_000;
120
121/// Page tree navigator
122pub struct PageTree {
123    /// Total number of pages
124    page_count: u32,
125    /// Cached pages by index
126    pages: HashMap<u32, ParsedPage>,
127    /// Root pages dictionary (for navigation)
128    #[allow(dead_code)]
129    pages_dict: Option<PdfDictionary>,
130    /// Flat index of page object references, built once during initialization.
131    /// Each entry is (obj_num, gen_num) for a leaf Page node.
132    page_refs: Vec<(u32, u16)>,
133}
134
135impl PageTree {
136    /// Create a new page tree navigator
137    pub fn new(page_count: u32) -> Self {
138        Self {
139            page_count,
140            pages: HashMap::new(),
141            pages_dict: None,
142            page_refs: Vec::new(),
143        }
144    }
145
146    /// Create a new page tree navigator with pages dictionary
147    pub fn new_with_pages_dict(page_count: u32, pages_dict: PdfDictionary) -> Self {
148        Self {
149            page_count,
150            pages: HashMap::new(),
151            pages_dict: Some(pages_dict),
152            page_refs: Vec::new(),
153        }
154    }
155
156    /// Create a new page tree navigator with a pre-built flat index.
157    /// The page_count is derived from the actual number of leaf pages found.
158    pub fn new_with_flat_index(pages_dict: PdfDictionary, page_refs: Vec<(u32, u16)>) -> Self {
159        let page_count = page_refs.len() as u32;
160        Self {
161            page_count,
162            pages: HashMap::new(),
163            pages_dict: Some(pages_dict),
164            page_refs,
165        }
166    }
167
168    /// Get a cached page by index (0-based)
169    pub fn get_cached_page(&self, index: u32) -> Option<&ParsedPage> {
170        self.pages.get(&index)
171    }
172
173    /// Cache a page
174    pub fn cache_page(&mut self, index: u32, page: ParsedPage) {
175        self.pages.insert(index, page);
176    }
177
178    /// Clear all cached pages
179    pub fn clear_cache(&mut self) {
180        self.pages.clear();
181    }
182
183    /// Get the total page count
184    pub fn page_count(&self) -> u32 {
185        self.page_count
186    }
187
188    /// Get a page object reference from the flat index by page index (0-based).
189    pub fn get_page_ref(&self, index: u32) -> Option<(u32, u16)> {
190        self.page_refs.get(index as usize).copied()
191    }
192
193    /// Flatten the page tree into a `Vec<(u32, u16)>` of leaf Page object references.
194    ///
195    /// This walks the tree iteratively using an explicit stack, with:
196    /// - **Cycle detection**: `HashSet<(u32, u16)>` prevents infinite loops from circular refs
197    /// - **Page cap**: Stops at `MAX_PAGES` to prevent OOM from absurd `/Count` values
198    /// - **Type inference**: Handles missing `/Type` keys by checking for `/Kids`, `/Contents`, `/MediaBox`
199    pub fn flatten_page_tree<R: Read + Seek>(
200        reader: &mut PdfReader<R>,
201        pages_dict: &PdfDictionary,
202    ) -> ParseResult<Vec<(u32, u16)>> {
203        let mut page_refs: Vec<(u32, u16)> = Vec::new();
204        let mut visited: HashSet<(u32, u16)> = HashSet::new();
205
206        // Work stack: each entry is an object reference to process
207        let mut stack: Vec<(u32, u16)> = Vec::new();
208
209        // Seed from root Kids array
210        if let Some(kids) = pages_dict.get("Kids").and_then(|k| k.as_array()) {
211            // Push in reverse so first kid is processed first (LIFO stack)
212            for kid_obj in kids.0.iter().rev() {
213                if let Some(kid_ref) = kid_obj.as_reference() {
214                    stack.push(kid_ref);
215                }
216            }
217        }
218
219        while let Some(obj_ref) = stack.pop() {
220            if page_refs.len() >= MAX_PAGES {
221                tracing::warn!("Page tree exceeds {} leaves, truncating", MAX_PAGES);
222                break;
223            }
224
225            // Cycle detection
226            if !visited.insert(obj_ref) {
227                tracing::warn!(
228                    "Cycle detected at {} {} R in page tree, skipping",
229                    obj_ref.0,
230                    obj_ref.1
231                );
232                continue;
233            }
234
235            // Resolve the object
236            let obj = match reader.get_object(obj_ref.0, obj_ref.1) {
237                Ok(o) => o,
238                Err(e) => {
239                    tracing::warn!(
240                        "Failed to resolve page tree node {} {} R: {}",
241                        obj_ref.0,
242                        obj_ref.1,
243                        e
244                    );
245                    continue;
246                }
247            };
248
249            let dict = match obj.as_dict() {
250                Some(d) => d,
251                None => {
252                    // Check if it's a stream with a dict (some PDFs embed page data in streams)
253                    if let Some(stream) = obj.as_stream() {
254                        &stream.dict
255                    } else {
256                        continue; // Skip non-dict/non-stream nodes
257                    }
258                }
259            };
260
261            // Determine node type
262            let node_type = dict.get_type().or_else(|| {
263                if dict.contains_key("Kids") {
264                    Some("Pages")
265                } else if dict.contains_key("Contents") || dict.contains_key("MediaBox") {
266                    Some("Page")
267                } else {
268                    None
269                }
270            });
271
272            match node_type {
273                Some("Page") => {
274                    page_refs.push(obj_ref);
275                }
276                Some("Pages") => {
277                    if let Some(kids) = dict.get("Kids").and_then(|k| k.as_array()) {
278                        // Push in reverse for correct order
279                        for kid_obj in kids.0.iter().rev() {
280                            if let Some(kid_ref) = kid_obj.as_reference() {
281                                stack.push(kid_ref);
282                            }
283                        }
284                    }
285                }
286                _ => {
287                    // Unknown type — treat as Page if it has page-like attributes
288                    if dict.contains_key("MediaBox") || dict.contains_key("Contents") {
289                        page_refs.push(obj_ref);
290                    }
291                    // Otherwise silently skip
292                }
293            }
294        }
295
296        Ok(page_refs)
297    }
298
299    /// Load a specific page by traversing the page tree
300    ///
301    /// Note: This method is currently not fully implemented due to architectural constraints
302    /// with recursive page tree traversal and borrow checker issues.
303    #[allow(dead_code)]
304    fn load_page_at_index<R: Read + Seek>(
305        &self,
306        reader: &mut PdfReader<R>,
307        node: &PdfDictionary,
308        node_ref: (u32, u16),
309        target_index: u32,
310        inherited: Option<&PdfDictionary>,
311    ) -> ParseResult<ParsedPage> {
312        let node_type = node
313            .get_type()
314            .or_else(|| {
315                // If Type is missing, try to infer from content
316                if node.contains_key("Kids") && node.contains_key("Count") {
317                    Some("Pages")
318                } else if node.contains_key("Contents") || node.contains_key("MediaBox") {
319                    Some("Page")
320                } else {
321                    None
322                }
323            })
324            .or_else(|| {
325                // If Type is missing and we have lenient parsing, try to infer
326                let lenient_syntax = reader.options().lenient_syntax;
327                let collect_warnings = reader.options().collect_warnings;
328
329                if lenient_syntax || collect_warnings {
330                    // If it has Kids, it's likely a Pages node
331                    if node.contains_key("Kids") {
332                        if collect_warnings {
333                            tracing::debug!(
334                                "Warning: Inferred Type=Pages for object {} {} R (missing Type field, has Kids)",
335                                node_ref.0, node_ref.1
336                            );
337                        }
338                        Some("Pages")
339                    }
340                    // If it has Contents or MediaBox but no Kids, it's likely a Page
341                    else if node.contains_key("Contents")
342                        || (node.contains_key("MediaBox") && !node.contains_key("Kids"))
343                    {
344                        if collect_warnings {
345                            tracing::debug!(
346                                "Warning: Inferred Type=Page for object {} {} R (missing Type field, has Contents/MediaBox)",
347                                node_ref.0, node_ref.1
348                            );
349                        }
350                        Some("Page")
351                    } else {
352                        None
353                    }
354                } else {
355                    None
356                }
357            })
358            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
359
360        match node_type {
361            "Pages" => {
362                // This is a page tree node
363                let kids = node
364                    .get("Kids")
365                    .and_then(|obj| obj.as_array())
366                    .or_else(|| {
367                        // If Kids is missing and we have lenient parsing, use empty array
368                        if reader.options().lenient_syntax {
369                            if reader.options().collect_warnings {
370                                tracing::debug!(
371                                    "Warning: Missing Kids array in Pages node, using empty array"
372                                );
373                            }
374                            Some(&super::objects::EMPTY_PDF_ARRAY)
375                        } else {
376                            None
377                        }
378                    })
379                    .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
380
381                // Merge inherited attributes
382                let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
383
384                // Inheritable attributes: Resources, MediaBox, CropBox, Rotate
385                if let Some(resources) = node.get("Resources") {
386                    if !merged_inherited.contains_key("Resources") {
387                        merged_inherited.insert("Resources".to_string(), resources.clone());
388                    }
389                }
390                if let Some(media_box) = node.get("MediaBox") {
391                    if !merged_inherited.contains_key("MediaBox") {
392                        merged_inherited.insert("MediaBox".to_string(), media_box.clone());
393                    }
394                }
395                if let Some(crop_box) = node.get("CropBox") {
396                    if !merged_inherited.contains_key("CropBox") {
397                        merged_inherited.insert("CropBox".to_string(), crop_box.clone());
398                    }
399                }
400                if let Some(rotate) = node.get("Rotate") {
401                    if !merged_inherited.contains_key("Rotate") {
402                        merged_inherited.insert("Rotate".to_string(), rotate.clone());
403                    }
404                }
405
406                // Find which kid contains our target page
407                let mut current_index = 0;
408                for kid_ref in &kids.0 {
409                    let kid_ref =
410                        kid_ref
411                            .as_reference()
412                            .ok_or_else(|| ParseError::SyntaxError {
413                                position: 0,
414                                message: "Kids array must contain references".to_string(),
415                            })?;
416
417                    // Get the kid object info first
418                    let (_kid_type, count, is_target) = {
419                        // Cache parse options to avoid borrow checker issues
420                        let lenient_syntax = reader.options().lenient_syntax;
421                        let collect_warnings = reader.options().collect_warnings;
422
423                        let kid_obj = reader.get_object(kid_ref.0, kid_ref.1)?;
424                        let kid_dict =
425                            kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
426                                position: 0,
427                                message: "Page tree node must be a dictionary".to_string(),
428                            })?;
429
430                        let kid_type = kid_dict
431                            .get_type()
432                            .or_else(|| {
433                                // If Type is missing, try to infer from content
434                                if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
435                                    Some("Pages")
436                                } else if kid_dict.contains_key("Contents")
437                                    || kid_dict.contains_key("MediaBox")
438                                {
439                                    Some("Page")
440                                } else {
441                                    None
442                                }
443                            })
444                            .or_else(|| {
445                                // Additional inference for reconstructed/corrupted objects
446                                if lenient_syntax || collect_warnings {
447                                    // If it has Kids, it's likely a Pages node
448                                    if kid_dict.contains_key("Kids") {
449                                        if collect_warnings {
450                                            tracing::debug!(
451                                                "Warning: Inferred Type=Pages for object {} 0 R (missing Type field, has Kids)",
452                                                kid_ref.0
453                                            );
454                                        }
455                                        Some("Pages")
456                                    }
457                                    // If it has Contents or MediaBox but no Kids, it's likely a Page
458                                    else if kid_dict.contains_key("Contents")
459                                        || (kid_dict.contains_key("MediaBox") && !kid_dict.contains_key("Kids"))
460                                    {
461                                        if collect_warnings {
462                                            tracing::debug!(
463                                                "Warning: Inferred Type=Page for object {} 0 R (missing Type field, has Contents/MediaBox)",
464                                                kid_ref.0
465                                            );
466                                        }
467                                        Some("Page")
468                                    } else {
469                                        None
470                                    }
471                                } else {
472                                    None
473                                }
474                            })
475                            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
476
477                        let count = if kid_type == "Pages" {
478                            // This is another page tree node
479                            if let Some(count_obj) = kid_dict.get("Count") {
480                                count_obj.as_integer().unwrap_or(0) as u32
481                            } else {
482                                // Missing Count - use size of Kids array as approximation
483                                if let Some(nested_kids_obj) = kid_dict.get("Kids") {
484                                    if let Some(nested_kids_array) = nested_kids_obj.as_array() {
485                                        // Use array length as page count approximation
486                                        nested_kids_array.0.len() as u32
487                                    } else {
488                                        1 // Default if Kids is not an array
489                                    }
490                                } else {
491                                    1 // Default if no Kids array
492                                }
493                            }
494                        } else {
495                            // This is a page
496                            1
497                        };
498
499                        let is_target = target_index < current_index + count;
500                        (kid_type.to_string(), count, is_target)
501                    };
502
503                    if is_target {
504                        // Found the right subtree/page
505                        // Due to borrow checker constraints with recursive calls,
506                        // we return a placeholder page for now.
507                        // A proper implementation would require refactoring the page tree
508                        // traversal to use an iterative approach instead of recursion.
509
510                        return Ok(ParsedPage {
511                            obj_ref: kid_ref,
512                            dict: PdfDictionary::new(),
513                            inherited_resources: Some(merged_inherited.clone()),
514                            media_box: [0.0, 0.0, 612.0, 792.0],
515                            crop_box: None,
516                            rotation: 0,
517                            annotations: None,
518                        });
519                    }
520
521                    current_index += count;
522                }
523
524                Err(ParseError::SyntaxError {
525                    position: 0,
526                    message: "Page not found in tree".to_string(),
527                })
528            }
529            "Page" => {
530                // This is a page object
531                if target_index != 0 {
532                    return Err(ParseError::SyntaxError {
533                        position: 0,
534                        message: "Page index mismatch".to_string(),
535                    });
536                }
537
538                // Use the object reference passed as parameter
539                let obj_ref = node_ref;
540
541                // Extract page attributes
542                let media_box =
543                    Self::get_rectangle(node, inherited, "MediaBox")?.unwrap_or_else(|| {
544                        // Use default Letter size if MediaBox is missing
545                        #[cfg(debug_assertions)]
546                        tracing::debug!(
547                            "Warning: Page {} {} R missing MediaBox, using default Letter size",
548                            obj_ref.0,
549                            obj_ref.1
550                        );
551                        [0.0, 0.0, 612.0, 792.0]
552                    });
553
554                let crop_box = Self::get_rectangle(node, inherited, "CropBox")?;
555
556                let rotation = Self::get_integer(node, inherited, "Rotate")?.unwrap_or(0) as i32;
557
558                // Get resources
559                let inherited_resources = if let Some(inherited) = inherited {
560                    inherited
561                        .get("Resources")
562                        .and_then(|r| r.as_dict())
563                        .cloned()
564                } else {
565                    None
566                };
567
568                // Get annotations if present
569                let annotations = node.get("Annots").and_then(|obj| obj.as_array()).cloned();
570
571                Ok(ParsedPage {
572                    obj_ref,
573                    dict: node.clone(),
574                    inherited_resources,
575                    media_box,
576                    crop_box,
577                    rotation,
578                    annotations,
579                })
580            }
581            _ => Err(ParseError::SyntaxError {
582                position: 0,
583                message: format!("Invalid page tree node type: {node_type}"),
584            }),
585        }
586    }
587
588    /// Get a rectangle value, checking both node and inherited dictionaries
589    #[allow(dead_code)]
590    fn get_rectangle(
591        node: &PdfDictionary,
592        inherited: Option<&PdfDictionary>,
593        key: &str,
594    ) -> ParseResult<Option<[f64; 4]>> {
595        let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
596
597        if let Some(array) = array.and_then(|obj| obj.as_array()) {
598            if array.len() != 4 {
599                return Err(ParseError::SyntaxError {
600                    position: 0,
601                    message: format!("{key} must have 4 elements"),
602                });
603            }
604
605            // Safe: array length is guaranteed to be 4 after validation above
606            let rect = [
607                array.0[0].as_real().unwrap_or(0.0),
608                array.0[1].as_real().unwrap_or(0.0),
609                array.0[2].as_real().unwrap_or(0.0),
610                array.0[3].as_real().unwrap_or(0.0),
611            ];
612
613            Ok(Some(rect))
614        } else {
615            Ok(None)
616        }
617    }
618
619    /// Get an integer value, checking both node and inherited dictionaries
620    #[allow(dead_code)]
621    fn get_integer(
622        node: &PdfDictionary,
623        inherited: Option<&PdfDictionary>,
624        key: &str,
625    ) -> ParseResult<Option<i64>> {
626        let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
627
628        Ok(value.and_then(|obj| obj.as_integer()))
629    }
630}
631
632impl ParsedPage {
633    /// Get the effective page width accounting for rotation.
634    ///
635    /// The width is calculated from the MediaBox and adjusted based on the page rotation.
636    /// For 90° or 270° rotations, the width and height are swapped.
637    ///
638    /// # Returns
639    ///
640    /// The page width in PDF units (typically points, where 1 point = 1/72 inch)
641    ///
642    /// # Example
643    ///
644    /// ```rust,no_run
645    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
646    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
647    /// # let reader = PdfReader::open("document.pdf")?;
648    /// # let document = PdfDocument::new(reader);
649    /// let page = document.get_page(0)?;
650    /// let width_pts = page.width();
651    /// let width_inches = width_pts / 72.0;
652    /// let width_mm = width_pts * 25.4 / 72.0;
653    /// println!("Page width: {} points ({:.2} inches, {:.2} mm)", width_pts, width_inches, width_mm);
654    /// # Ok(())
655    /// # }
656    /// ```
657    pub fn width(&self) -> f64 {
658        match self.rotation {
659            90 | 270 => self.media_box[3] - self.media_box[1],
660            _ => self.media_box[2] - self.media_box[0],
661        }
662    }
663
664    /// Get the effective page height accounting for rotation.
665    ///
666    /// The height is calculated from the MediaBox and adjusted based on the page rotation.
667    /// For 90° or 270° rotations, the width and height are swapped.
668    ///
669    /// # Returns
670    ///
671    /// The page height in PDF units (typically points, where 1 point = 1/72 inch)
672    ///
673    /// # Example
674    ///
675    /// ```rust,no_run
676    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
677    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
678    /// # let reader = PdfReader::open("document.pdf")?;
679    /// # let document = PdfDocument::new(reader);
680    /// let page = document.get_page(0)?;
681    /// println!("Page dimensions: {}x{} points", page.width(), page.height());
682    /// if page.rotation != 0 {
683    ///     println!("Page is rotated {} degrees", page.rotation);
684    /// }
685    /// # Ok(())
686    /// # }
687    /// ```
688    pub fn height(&self) -> f64 {
689        match self.rotation {
690            90 | 270 => self.media_box[2] - self.media_box[0],
691            _ => self.media_box[3] - self.media_box[1],
692        }
693    }
694
695    /// Get the content streams for this page using a PdfReader.
696    ///
697    /// Content streams contain the actual drawing instructions (operators) that render
698    /// text, graphics, and images on the page. A page may have multiple content streams
699    /// which are concatenated during rendering.
700    ///
701    /// # Arguments
702    ///
703    /// * `reader` - Mutable reference to the PDF reader
704    ///
705    /// # Returns
706    ///
707    /// A vector of decompressed content stream data. Each vector contains the raw bytes
708    /// of a content stream ready for parsing.
709    ///
710    /// # Errors
711    ///
712    /// Returns an error if:
713    /// - The Contents entry is malformed
714    /// - Stream decompression fails
715    /// - Referenced objects cannot be resolved
716    ///
717    /// # Example
718    ///
719    /// ```rust,no_run
720    /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
721    /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
722    /// let streams = page.content_streams(reader)?;
723    /// for (i, stream) in streams.iter().enumerate() {
724    ///     println!("Content stream {}: {} bytes", i, stream.len());
725    /// }
726    /// # Ok(())
727    /// # }
728    /// ```
729    pub fn content_streams<R: Read + Seek>(
730        &self,
731        reader: &mut PdfReader<R>,
732    ) -> ParseResult<Vec<Vec<u8>>> {
733        let mut streams = Vec::new();
734
735        if let Some(contents) = self.dict.get("Contents") {
736            // First resolve contents to check its type
737            let contents_type = match contents {
738                PdfObject::Reference(obj_num, gen_num) => {
739                    let resolved = reader.get_object(*obj_num, *gen_num)?;
740                    match resolved {
741                        PdfObject::Stream(_) => "stream",
742                        PdfObject::Array(_) => "array",
743                        _ => "other",
744                    }
745                }
746                PdfObject::Stream(_) => "stream",
747                PdfObject::Array(_) => "array",
748                _ => "other",
749            };
750
751            let options = reader.options().clone();
752            match contents_type {
753                "stream" => {
754                    let resolved = reader.resolve(contents)?;
755                    if let PdfObject::Stream(stream) = resolved {
756                        streams.push(stream.decode(&options)?);
757                    }
758                }
759                "array" => {
760                    // Get array references first
761                    let refs: Vec<(u32, u16)> = {
762                        let resolved = reader.resolve(contents)?;
763                        if let PdfObject::Array(array) = resolved {
764                            array
765                                .0
766                                .iter()
767                                .filter_map(|obj| {
768                                    if let PdfObject::Reference(num, gen) = obj {
769                                        Some((*num, *gen))
770                                    } else {
771                                        None
772                                    }
773                                })
774                                .collect()
775                        } else {
776                            Vec::new()
777                        }
778                    };
779
780                    // Now resolve each reference
781                    for (obj_num, gen_num) in refs {
782                        let obj = reader.get_object(obj_num, gen_num)?;
783                        if let PdfObject::Stream(stream) = obj {
784                            streams.push(stream.decode(&options)?);
785                        }
786                    }
787                }
788                _ => {
789                    return Err(ParseError::SyntaxError {
790                        position: 0,
791                        message: "Contents must be a stream or array of streams".to_string(),
792                    })
793                }
794            }
795        }
796
797        Ok(streams)
798    }
799
800    /// Get content streams using PdfDocument (recommended method).
801    ///
802    /// This is the preferred method for accessing content streams as it uses the
803    /// document's caching and resource management capabilities.
804    ///
805    /// # Arguments
806    ///
807    /// * `document` - Reference to the PDF document
808    ///
809    /// # Returns
810    ///
811    /// A vector of decompressed content stream data ready for parsing with `ContentParser`.
812    ///
813    /// # Example
814    ///
815    /// ```rust,no_run
816    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
817    /// # use oxidize_pdf::parser::content::ContentParser;
818    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
819    /// let reader = PdfReader::open("document.pdf")?;
820    /// let document = PdfDocument::new(reader);
821    /// let page = document.get_page(0)?;
822    ///
823    /// // Get content streams
824    /// let streams = page.content_streams_with_document(&document)?;
825    ///
826    /// // Parse each stream
827    /// for stream_data in streams {
828    ///     let operations = ContentParser::parse_content(&stream_data)?;
829    ///     println!("Stream has {} operations", operations.len());
830    /// }
831    /// # Ok(())
832    /// # }
833    /// ```
834    pub fn content_streams_with_document<R: Read + Seek>(
835        &self,
836        document: &PdfDocument<R>,
837    ) -> ParseResult<Vec<Vec<u8>>> {
838        document.get_page_content_streams(self)
839    }
840
841    /// Get the effective resources for this page (including inherited).
842    ///
843    /// Resources include fonts, images (XObjects), color spaces, patterns, and other
844    /// assets needed to render the page. This method returns page-specific resources
845    /// if present, otherwise falls back to inherited resources from parent nodes.
846    ///
847    /// # Returns
848    ///
849    /// The Resources dictionary if available, or None if the page has no resources.
850    ///
851    /// # Resource Categories
852    ///
853    /// The Resources dictionary may contain:
854    /// - `Font` - Font definitions used by text operators
855    /// - `XObject` - External objects (images, form XObjects)
856    /// - `ColorSpace` - Color space definitions
857    /// - `Pattern` - Pattern definitions for fills
858    /// - `Shading` - Shading dictionaries
859    /// - `ExtGState` - Graphics state parameter dictionaries
860    /// - `Properties` - Property list dictionaries
861    ///
862    /// # Example
863    ///
864    /// ```rust,no_run
865    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
866    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
867    /// # let reader = PdfReader::open("document.pdf")?;
868    /// # let document = PdfDocument::new(reader);
869    /// # let page = document.get_page(0)?;
870    /// if let Some(resources) = page.get_resources() {
871    ///     // Check for fonts
872    ///     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
873    ///         println!("Page uses {} fonts", fonts.0.len());
874    ///     }
875    ///     
876    ///     // Check for images
877    ///     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
878    ///         println!("Page has {} XObjects", xobjects.0.len());
879    ///     }
880    /// }
881    /// # Ok(())
882    /// # }
883    /// ```
884    pub fn get_contents(&self) -> Option<&PdfObject> {
885        self.dict.get("Contents")
886    }
887
888    pub fn get_resources(&self) -> Option<&PdfDictionary> {
889        self.dict
890            .get("Resources")
891            .and_then(|r| r.as_dict())
892            .or(self.inherited_resources.as_ref())
893    }
894
895    /// Clone this page with all inherited resources merged into the page dictionary.
896    ///
897    /// This is useful when extracting a page for separate processing or when you need
898    /// a self-contained page object with all resources explicitly included.
899    ///
900    /// # Returns
901    ///
902    /// A cloned page with inherited resources merged into the Resources entry
903    /// of the page dictionary.
904    ///
905    /// # Example
906    ///
907    /// ```rust,no_run
908    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
909    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
910    /// # let reader = PdfReader::open("document.pdf")?;
911    /// # let document = PdfDocument::new(reader);
912    /// # let page = document.get_page(0)?;
913    /// // Get a self-contained page with all resources
914    /// let standalone_page = page.clone_with_resources();
915    ///
916    /// // The cloned page now has all resources in its dictionary
917    /// assert!(standalone_page.dict.contains_key("Resources"));
918    /// # Ok(())
919    /// # }
920    /// ```
921    pub fn clone_with_resources(&self) -> Self {
922        let mut cloned = self.clone();
923
924        // Merge inherited resources into the page dictionary if needed
925        if let Some(inherited) = &self.inherited_resources {
926            if !cloned.dict.contains_key("Resources") {
927                cloned.dict.insert(
928                    "Resources".to_string(),
929                    PdfObject::Dictionary(inherited.clone()),
930                );
931            }
932        }
933
934        cloned
935    }
936
937    /// Get the annotations array for this page.
938    ///
939    /// Returns a reference to the annotations array if present.
940    /// Each element in the array is typically a reference to an annotation dictionary.
941    ///
942    /// # Example
943    ///
944    /// ```rust,no_run
945    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
946    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
947    /// # let reader = PdfReader::open("document.pdf")?;
948    /// # let document = PdfDocument::new(reader);
949    /// # let page = document.get_page(0)?;
950    /// if let Some(annots) = page.get_annotations() {
951    ///     println!("Page has {} annotations", annots.len());
952    /// }
953    /// # Ok(())
954    /// # }
955    /// ```
956    pub fn get_annotations(&self) -> Option<&PdfArray> {
957        self.annotations.as_ref()
958    }
959
960    /// Check if the page has annotations.
961    ///
962    /// # Returns
963    ///
964    /// `true` if the page has an annotations array with at least one annotation,
965    /// `false` otherwise.
966    ///
967    /// # Example
968    ///
969    /// ```rust,no_run
970    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
971    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
972    /// # let reader = PdfReader::open("document.pdf")?;
973    /// # let document = PdfDocument::new(reader);
974    /// # let page = document.get_page(0)?;
975    /// if page.has_annotations() {
976    ///     println!("This page contains annotations");
977    /// }
978    /// # Ok(())
979    /// # }
980    /// ```
981    pub fn has_annotations(&self) -> bool {
982        self.annotations
983            .as_ref()
984            .map(|arr| !arr.is_empty())
985            .unwrap_or(false)
986    }
987
988    /// Get all objects referenced by this page (for extraction or analysis).
989    ///
990    /// This method recursively collects all objects referenced by the page, including:
991    /// - Content streams
992    /// - Resources (fonts, images, etc.)
993    /// - Nested objects within resources
994    ///
995    /// This is useful for extracting a complete page with all its dependencies or
996    /// for analyzing the object graph of a page.
997    ///
998    /// # Arguments
999    ///
1000    /// * `reader` - Mutable reference to the PDF reader
1001    ///
1002    /// # Returns
1003    ///
1004    /// A HashMap mapping object references (obj_num, gen_num) to their resolved objects.
1005    ///
1006    /// # Example
1007    ///
1008    /// ```rust,no_run
1009    /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
1010    /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
1011    /// let referenced_objects = page.get_referenced_objects(reader)?;
1012    ///
1013    /// println!("Page references {} objects", referenced_objects.len());
1014    /// for ((obj_num, gen_num), obj) in &referenced_objects {
1015    ///     println!("  {} {} R: {:?}", obj_num, gen_num, obj);
1016    /// }
1017    /// # Ok(())
1018    /// # }
1019    /// ```
1020    pub fn get_referenced_objects<R: Read + Seek>(
1021        &self,
1022        reader: &mut PdfReader<R>,
1023    ) -> ParseResult<HashMap<(u32, u16), PdfObject>> {
1024        let mut objects = HashMap::new();
1025        let mut to_process = Vec::new();
1026
1027        // Start with Contents
1028        if let Some(contents) = self.dict.get("Contents") {
1029            Self::collect_references(contents, &mut to_process);
1030        }
1031
1032        // Add Resources
1033        if let Some(resources) = self.get_resources() {
1034            for value in resources.0.values() {
1035                Self::collect_references(value, &mut to_process);
1036            }
1037        }
1038
1039        // Process all references
1040        while let Some((obj_num, gen_num)) = to_process.pop() {
1041            if let std::collections::hash_map::Entry::Vacant(e) = objects.entry((obj_num, gen_num))
1042            {
1043                let obj = reader.get_object(obj_num, gen_num)?;
1044
1045                // Collect nested references
1046                Self::collect_references_from_object(obj, &mut to_process);
1047
1048                e.insert(obj.clone());
1049            }
1050        }
1051
1052        Ok(objects)
1053    }
1054
1055    /// Collect object references from a PDF object
1056    fn collect_references(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
1057        match obj {
1058            PdfObject::Reference(obj_num, gen_num) => {
1059                refs.push((*obj_num, *gen_num));
1060            }
1061            PdfObject::Array(array) => {
1062                for item in &array.0 {
1063                    Self::collect_references(item, refs);
1064                }
1065            }
1066            PdfObject::Dictionary(dict) => {
1067                for value in dict.0.values() {
1068                    Self::collect_references(value, refs);
1069                }
1070            }
1071            _ => {}
1072        }
1073    }
1074
1075    /// Collect references from an object (after resolution)
1076    fn collect_references_from_object(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
1077        match obj {
1078            PdfObject::Array(array) => {
1079                for item in &array.0 {
1080                    Self::collect_references(item, refs);
1081                }
1082            }
1083            PdfObject::Dictionary(dict) | PdfObject::Stream(PdfStream { dict, .. }) => {
1084                for value in dict.0.values() {
1085                    Self::collect_references(value, refs);
1086                }
1087            }
1088            _ => {}
1089        }
1090    }
1091}
1092
#[cfg(test)]
mod tests {
    use super::super::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
    use super::*;
    use std::collections::HashMap;

    /// Dictionary holding only `/Type /Page`; the starting point of both fixtures.
    fn page_type_dict() -> PdfDictionary {
        let mut entries = HashMap::new();
        entries.insert(
            PdfName("Type".to_string()),
            PdfObject::Name(PdfName("Page".to_string())),
        );
        PdfDictionary(entries)
    }

    /// Minimal A4 page (object 3 0) with a parent reference and nothing optional set.
    fn create_test_page() -> ParsedPage {
        let mut dict = page_type_dict();
        dict.0
            .insert(PdfName("Parent".to_string()), PdfObject::Reference(2, 0));

        ParsedPage {
            obj_ref: (3, 0),
            dict,
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: None,
            rotation: 0,
            inherited_resources: None,
            annotations: None,
        }
    }

    /// A4 page (object 4 0) rotated by 90 degrees, with inherited resources that
    /// contain an empty `Font` dictionary, a crop box and an empty annotations array.
    fn create_test_page_with_resources() -> ParsedPage {
        let font_entry = (
            PdfName("Font".to_string()),
            PdfObject::Dictionary(PdfDictionary(HashMap::new())),
        );
        let inherited: HashMap<PdfName, PdfObject> = std::iter::once(font_entry).collect();

        ParsedPage {
            obj_ref: (4, 0),
            dict: page_type_dict(),
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: Some([10.0, 10.0, 585.0, 832.0]),
            rotation: 90,
            inherited_resources: Some(PdfDictionary(inherited)),
            annotations: Some(PdfArray(vec![])),
        }
    }

    #[test]
    fn test_page_tree_new() {
        let fresh = PageTree::new(10);

        assert_eq!(fresh.page_count, 10);
        assert_eq!(fresh.pages.len(), 0);
        assert!(fresh.pages_dict.is_none());
    }

    #[test]
    fn test_page_tree_new_with_pages_dict() {
        let fresh = PageTree::new_with_pages_dict(5, PdfDictionary(HashMap::new()));

        assert_eq!(fresh.page_count, 5);
        assert_eq!(fresh.pages.len(), 0);
        assert!(fresh.pages_dict.is_some());
    }

    #[test]
    fn test_get_cached_page_empty() {
        let tree = PageTree::new(10);

        for index in [0, 5].iter().copied() {
            assert!(tree.get_cached_page(index).is_none());
        }
    }

    #[test]
    fn test_cache_and_get_page() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());

        let cached_page = tree
            .get_cached_page(0)
            .expect("page 0 was cached just above");
        assert_eq!(cached_page.obj_ref, (3, 0));
        assert_eq!(cached_page.media_box, [0.0, 0.0, 595.0, 842.0]);
    }

    #[test]
    fn test_cache_multiple_pages() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());
        tree.cache_page(1, create_test_page_with_resources());

        // Only the two cached slots are populated.
        assert!(tree.get_cached_page(0).is_some());
        assert!(tree.get_cached_page(1).is_some());
        assert!(tree.get_cached_page(2).is_none());

        // Each slot holds the page that was stored in it.
        let plain = tree.get_cached_page(0).unwrap();
        assert_eq!(plain.rotation, 0);

        let rotated = tree.get_cached_page(1).unwrap();
        assert_eq!(rotated.rotation, 90);
    }

    #[test]
    fn test_get_page_count() {
        assert_eq!(PageTree::new(25).page_count, 25);
    }

    #[test]
    fn test_clear_cache() {
        let mut tree = PageTree::new(10);
        let template = create_test_page();

        tree.cache_page(0, template.clone());
        tree.cache_page(1, template);
        assert_eq!(tree.pages.len(), 2);

        tree.clear_cache();

        assert_eq!(tree.pages.len(), 0);
        for index in [0, 1].iter().copied() {
            assert!(tree.get_cached_page(index).is_none());
        }
    }

    #[test]
    fn test_parsed_page_properties() {
        let page = create_test_page_with_resources();

        assert_eq!(page.obj_ref, (4, 0));
        assert_eq!(page.rotation, 90);
        assert!(page.inherited_resources.is_some());
        assert!(page.crop_box.is_some());
        assert!(page.annotations.is_some());

        assert_eq!(page.crop_box.unwrap(), [10.0, 10.0, 585.0, 832.0]);
    }

    #[test]
    fn test_parsed_page_creation() {
        let empty_dict = PdfDictionary::new();
        let page = ParsedPage {
            obj_ref: (1, 0),
            dict: empty_dict.clone(),
            inherited_resources: None,
            media_box: [0.0, 0.0, 612.0, 792.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        };

        // Every field must read back exactly as it was set.
        assert_eq!(page.obj_ref, (1, 0));
        assert_eq!(page.dict, empty_dict);
        assert!(page.inherited_resources.is_none());
        assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]); // US Letter
        assert!(page.crop_box.is_none());
        assert_eq!(page.rotation, 0);
        assert!(page.annotations.is_none());
    }

    #[test]
    fn test_parsed_page_width_height() {
        let mut page = create_test_page();

        // Unrotated A4.
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);

        // Quarter turns swap width and height.
        for quarter_turn in [90, 270].iter().copied() {
            page.rotation = quarter_turn;
            assert_eq!(page.width(), 842.0);
            assert_eq!(page.height(), 595.0);
        }

        // A half turn leaves the dimensions as they are.
        page.rotation = 180;
        assert_eq!(page.width(), 595.0);
        assert_eq!(page.height(), 842.0);
    }

    #[test]
    fn test_parsed_page_get_resources() {
        let page = create_test_page_with_resources();

        let effective = page
            .get_resources()
            .expect("fixture provides inherited resources");
        assert!(effective.contains_key("Font"));
    }

    #[test]
    fn test_parsed_page_get_contents() {
        let mut page = create_test_page();
        page.dict
            .insert("Contents".to_string(), PdfObject::Reference(10, 0));

        let contents = page.get_contents();

        assert!(contents.is_some());
        assert_eq!(contents, Some(&PdfObject::Reference(10, 0)));
    }

    #[test]
    fn test_parsed_page_get_annotations() {
        let page = create_test_page_with_resources();

        let annots = page
            .get_annotations()
            .expect("fixture provides an annotations array");
        assert_eq!(annots.0.len(), 0);
    }

    #[test]
    fn test_parsed_page_inherited_resources() {
        let mut parent_resources = PdfDictionary::new();
        parent_resources.insert(
            "Font".to_string(),
            PdfObject::Dictionary(PdfDictionary::new()),
        );

        // Set the inherited resources directly on the page.
        let mut page = create_test_page();
        page.inherited_resources = Some(parent_resources.clone());

        assert!(page.inherited_resources.is_some());
        assert_eq!(page.inherited_resources, Some(parent_resources));
    }

    #[test]
    fn test_parsed_page_with_crop_box() {
        let mut page = create_test_page();
        page.crop_box = Some([50.0, 50.0, 545.0, 792.0]);

        // The crop box reads back component by component.
        let [left, bottom, right, top] = page.crop_box.unwrap();
        assert_eq!(left, 50.0);
        assert_eq!(bottom, 50.0);
        assert_eq!(right, 545.0);
        assert_eq!(top, 792.0);
    }

    #[test]
    fn test_page_tree_cache_overflow() {
        let mut tree = PageTree::new(100);

        // Store a large batch of pages ...
        (0..50).for_each(|index| tree.cache_page(index, create_test_page()));

        // ... and expect every one of them to still be retrievable.
        for index in 0..50 {
            assert!(tree.get_cached_page(index).is_some());
        }
    }

    #[test]
    fn test_page_tree_update_cached_page() {
        let mut tree = PageTree::new(10);

        tree.cache_page(0, create_test_page());
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 0);

        // Caching under the same index replaces the earlier entry.
        let mut replacement = create_test_page();
        replacement.rotation = 180;
        tree.cache_page(0, replacement);
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 180);
    }

    #[test]
    fn test_parsed_page_clone() {
        let original = create_test_page_with_resources();
        let copy = original.clone();

        assert_eq!(original.obj_ref, copy.obj_ref);
        assert_eq!(original.dict, copy.dict);
        assert_eq!(original.inherited_resources, copy.inherited_resources);
        assert_eq!(original.media_box, copy.media_box);
        assert_eq!(original.crop_box, copy.crop_box);
        assert_eq!(original.rotation, copy.rotation);
        assert_eq!(original.annotations, copy.annotations);
    }

    #[test]
    fn test_page_tree_get_page_bounds() {
        let tree = PageTree::new(100);

        // Nothing is cached, so lookups inside the page count (0, 99), just past
        // it (100) and far beyond it (u32::MAX) must all come back empty.
        for index in [0, 99, 100, u32::MAX].iter().copied() {
            assert!(tree.get_cached_page(index).is_none());
        }
    }
}
1402
// Further tests for this module are loaded from the sibling file
// `page_tree_tests.rs`; the module is only compiled for test builds.
#[cfg(test)]
#[path = "page_tree_tests.rs"]
mod page_tree_tests;
oxidize_pdf/parser/page_tree.rs

oxidize_pdf/parser/
page_tree.rs