oxidize_pdf/parser/
page_tree.rs

1//! PDF Page Tree Parser
2//!
3//! This module handles navigation and extraction of pages from the PDF page tree structure.
4//! The page tree is a hierarchical structure that organizes pages in a PDF document,
5//! allowing for efficient access and inheritance of properties from parent nodes.
6//!
7//! # Overview
8//!
9//! The PDF page tree consists of:
10//! - **Page Tree Nodes**: Internal nodes that can contain other nodes or pages
11//! - **Page Objects**: Leaf nodes representing individual pages
12//! - **Inherited Properties**: Resources, MediaBox, CropBox, and Rotate can be inherited from parent nodes
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
18//!
19//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
20//! // Open a PDF document
21//! let reader = PdfReader::open("document.pdf")?;
22//! let document = PdfDocument::new(reader);
23//!
24//! // Get a specific page
25//! let page = document.get_page(0)?;
26//!
27//! // Access page properties
28//! println!("Page size: {}x{} points", page.width(), page.height());
29//! println!("Rotation: {}°", page.rotation);
30//!
31//! // Get page resources
32//! if let Some(resources) = page.get_resources() {
33//!     println!("Page has resources");
34//! }
35//! # Ok(())
36//! # }
37//! ```
38
39use super::document::PdfDocument;
40use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfStream};
41use super::reader::PdfReader;
42use super::{ParseError, ParseResult};
43use std::collections::HashMap;
44use std::io::{Read, Seek};
45
46/// Represents a single page in the PDF with all its properties and resources.
47///
48/// A `ParsedPage` contains all the information needed to render or analyze a PDF page,
49/// including its dimensions, content streams, resources, and inherited properties from
50/// parent page tree nodes.
51///
52/// # Fields
53///
54/// * `obj_ref` - Object reference (object number, generation number) pointing to this page in the PDF
55/// * `dict` - Complete page dictionary containing all page-specific entries
56/// * `inherited_resources` - Resources inherited from parent page tree nodes
57/// * `media_box` - Page dimensions in PDF units [llx, lly, urx, ury]
58/// * `crop_box` - Optional visible area of the page
59/// * `rotation` - Page rotation in degrees (0, 90, 180, or 270)
60///
61/// # Example
62///
63/// ```rust,no_run
64/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
65///
66/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
67/// let reader = PdfReader::open("document.pdf")?;
68/// let document = PdfDocument::new(reader);
69/// let page = document.get_page(0)?;
70///
71/// // Access page properties
72/// let (obj_num, gen_num) = page.obj_ref;
73/// println!("Page object: {} {} R", obj_num, gen_num);
74///
75/// // Get page dimensions
76/// let [llx, lly, urx, ury] = page.media_box;
77/// println!("MediaBox: ({}, {}) to ({}, {})", llx, lly, urx, ury);
78///
79/// // Check for content
80/// if let Some(contents) = page.dict.get("Contents") {
81///     println!("Page has content streams");
82/// }
83/// # Ok(())
84/// # }
85/// ```
86#[derive(Debug, Clone)]
87pub struct ParsedPage {
88    /// Object reference to this page in the form (object_number, generation_number).
89    /// This uniquely identifies the page object in the PDF file.
90    pub obj_ref: (u32, u16),
91
92    /// Page dictionary containing all page-specific entries like Contents, Resources, etc.
93    /// This is the raw PDF dictionary for the page object.
94    pub dict: PdfDictionary,
95
96    /// Resources inherited from parent page tree nodes.
97    /// These are automatically merged during page tree traversal.
98    pub inherited_resources: Option<PdfDictionary>,
99
100    /// MediaBox defining the page dimensions in PDF units (typically points).
101    /// Format: [lower_left_x, lower_left_y, upper_right_x, upper_right_y]
102    pub media_box: [f64; 4],
103
104    /// CropBox defining the visible area of the page.
105    /// If None, the entire MediaBox is visible.
106    pub crop_box: Option<[f64; 4]>,
107
108    /// Page rotation in degrees. Valid values are 0, 90, 180, or 270.
109    /// The rotation is applied clockwise.
110    pub rotation: i32,
111
112    /// Annotations array containing references to annotation objects.
113    /// This is parsed from the page's /Annots entry.
114    pub annotations: Option<PdfArray>,
115}
116
117/// Page tree navigator
118pub struct PageTree {
119    /// Total number of pages
120    page_count: u32,
121    /// Cached pages by index
122    pages: HashMap<u32, ParsedPage>,
123    /// Root pages dictionary (for navigation)
124    #[allow(dead_code)]
125    pages_dict: Option<PdfDictionary>,
126}
127
128impl PageTree {
129    /// Create a new page tree navigator
130    pub fn new(page_count: u32) -> Self {
131        Self {
132            page_count,
133            pages: HashMap::new(),
134            pages_dict: None,
135        }
136    }
137
138    /// Create a new page tree navigator with pages dictionary
139    pub fn new_with_pages_dict(page_count: u32, pages_dict: PdfDictionary) -> Self {
140        Self {
141            page_count,
142            pages: HashMap::new(),
143            pages_dict: Some(pages_dict),
144        }
145    }
146
147    /// Get a cached page by index (0-based)
148    pub fn get_cached_page(&self, index: u32) -> Option<&ParsedPage> {
149        self.pages.get(&index)
150    }
151
152    /// Cache a page
153    pub fn cache_page(&mut self, index: u32, page: ParsedPage) {
154        self.pages.insert(index, page);
155    }
156
157    /// Clear all cached pages
158    pub fn clear_cache(&mut self) {
159        self.pages.clear();
160    }
161
162    /// Get the total page count
163    pub fn page_count(&self) -> u32 {
164        self.page_count
165    }
166
167    /// Load a specific page by traversing the page tree
168    ///
169    /// Note: This method is currently not fully implemented due to architectural constraints
170    /// with recursive page tree traversal and borrow checker issues.
171    #[allow(dead_code)]
172    fn load_page_at_index<R: Read + Seek>(
173        &self,
174        reader: &mut PdfReader<R>,
175        node: &PdfDictionary,
176        node_ref: (u32, u16),
177        target_index: u32,
178        inherited: Option<&PdfDictionary>,
179    ) -> ParseResult<ParsedPage> {
180        let node_type = node
181            .get_type()
182            .or_else(|| {
183                // If Type is missing, try to infer from content
184                if node.contains_key("Kids") && node.contains_key("Count") {
185                    Some("Pages")
186                } else if node.contains_key("Contents") || node.contains_key("MediaBox") {
187                    Some("Page")
188                } else {
189                    None
190                }
191            })
192            .or_else(|| {
193                // If Type is missing and we have lenient parsing, try to infer
194                let lenient_syntax = reader.options().lenient_syntax;
195                let collect_warnings = reader.options().collect_warnings;
196
197                if lenient_syntax || collect_warnings {
198                    // If it has Kids, it's likely a Pages node
199                    if node.contains_key("Kids") {
200                        if collect_warnings {
201                            tracing::debug!(
202                                "Warning: Inferred Type=Pages for object {} {} R (missing Type field, has Kids)",
203                                node_ref.0, node_ref.1
204                            );
205                        }
206                        Some("Pages")
207                    }
208                    // If it has Contents or MediaBox but no Kids, it's likely a Page
209                    else if node.contains_key("Contents")
210                        || (node.contains_key("MediaBox") && !node.contains_key("Kids"))
211                    {
212                        if collect_warnings {
213                            tracing::debug!(
214                                "Warning: Inferred Type=Page for object {} {} R (missing Type field, has Contents/MediaBox)",
215                                node_ref.0, node_ref.1
216                            );
217                        }
218                        Some("Page")
219                    } else {
220                        None
221                    }
222                } else {
223                    None
224                }
225            })
226            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
227
228        match node_type {
229            "Pages" => {
230                // This is a page tree node
231                let kids = node
232                    .get("Kids")
233                    .and_then(|obj| obj.as_array())
234                    .or_else(|| {
235                        // If Kids is missing and we have lenient parsing, use empty array
236                        if reader.options().lenient_syntax {
237                            if reader.options().collect_warnings {
238                                tracing::debug!(
239                                    "Warning: Missing Kids array in Pages node, using empty array"
240                                );
241                            }
242                            Some(&super::objects::EMPTY_PDF_ARRAY)
243                        } else {
244                            None
245                        }
246                    })
247                    .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
248
249                // Merge inherited attributes
250                let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
251
252                // Inheritable attributes: Resources, MediaBox, CropBox, Rotate
253                if let Some(resources) = node.get("Resources") {
254                    if !merged_inherited.contains_key("Resources") {
255                        merged_inherited.insert("Resources".to_string(), resources.clone());
256                    }
257                }
258                if let Some(media_box) = node.get("MediaBox") {
259                    if !merged_inherited.contains_key("MediaBox") {
260                        merged_inherited.insert("MediaBox".to_string(), media_box.clone());
261                    }
262                }
263                if let Some(crop_box) = node.get("CropBox") {
264                    if !merged_inherited.contains_key("CropBox") {
265                        merged_inherited.insert("CropBox".to_string(), crop_box.clone());
266                    }
267                }
268                if let Some(rotate) = node.get("Rotate") {
269                    if !merged_inherited.contains_key("Rotate") {
270                        merged_inherited.insert("Rotate".to_string(), rotate.clone());
271                    }
272                }
273
274                // Find which kid contains our target page
275                let mut current_index = 0;
276                for kid_ref in &kids.0 {
277                    let kid_ref =
278                        kid_ref
279                            .as_reference()
280                            .ok_or_else(|| ParseError::SyntaxError {
281                                position: 0,
282                                message: "Kids array must contain references".to_string(),
283                            })?;
284
285                    // Get the kid object info first
286                    let (_kid_type, count, is_target) = {
287                        // Cache parse options to avoid borrow checker issues
288                        let lenient_syntax = reader.options().lenient_syntax;
289                        let collect_warnings = reader.options().collect_warnings;
290
291                        let kid_obj = reader.get_object(kid_ref.0, kid_ref.1)?;
292                        let kid_dict =
293                            kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
294                                position: 0,
295                                message: "Page tree node must be a dictionary".to_string(),
296                            })?;
297
298                        let kid_type = kid_dict
299                            .get_type()
300                            .or_else(|| {
301                                // If Type is missing, try to infer from content
302                                if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
303                                    Some("Pages")
304                                } else if kid_dict.contains_key("Contents")
305                                    || kid_dict.contains_key("MediaBox")
306                                {
307                                    Some("Page")
308                                } else {
309                                    None
310                                }
311                            })
312                            .or_else(|| {
313                                // Additional inference for reconstructed/corrupted objects
314                                if lenient_syntax || collect_warnings {
315                                    // If it has Kids, it's likely a Pages node
316                                    if kid_dict.contains_key("Kids") {
317                                        if collect_warnings {
318                                            tracing::debug!(
319                                                "Warning: Inferred Type=Pages for object {} 0 R (missing Type field, has Kids)",
320                                                kid_ref.0
321                                            );
322                                        }
323                                        Some("Pages")
324                                    }
325                                    // If it has Contents or MediaBox but no Kids, it's likely a Page
326                                    else if kid_dict.contains_key("Contents")
327                                        || (kid_dict.contains_key("MediaBox") && !kid_dict.contains_key("Kids"))
328                                    {
329                                        if collect_warnings {
330                                            tracing::debug!(
331                                                "Warning: Inferred Type=Page for object {} 0 R (missing Type field, has Contents/MediaBox)",
332                                                kid_ref.0
333                                            );
334                                        }
335                                        Some("Page")
336                                    } else {
337                                        None
338                                    }
339                                } else {
340                                    None
341                                }
342                            })
343                            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
344
345                        let count = if kid_type == "Pages" {
346                            // This is another page tree node
347                            if let Some(count_obj) = kid_dict.get("Count") {
348                                count_obj.as_integer().unwrap_or(0) as u32
349                            } else {
350                                // Missing Count - use size of Kids array as approximation
351                                if let Some(nested_kids_obj) = kid_dict.get("Kids") {
352                                    if let Some(nested_kids_array) = nested_kids_obj.as_array() {
353                                        // Use array length as page count approximation
354                                        nested_kids_array.0.len() as u32
355                                    } else {
356                                        1 // Default if Kids is not an array
357                                    }
358                                } else {
359                                    1 // Default if no Kids array
360                                }
361                            }
362                        } else {
363                            // This is a page
364                            1
365                        };
366
367                        let is_target = target_index < current_index + count;
368                        (kid_type.to_string(), count, is_target)
369                    };
370
371                    if is_target {
372                        // Found the right subtree/page
373                        // Due to borrow checker constraints with recursive calls,
374                        // we return a placeholder page for now.
375                        // A proper implementation would require refactoring the page tree
376                        // traversal to use an iterative approach instead of recursion.
377
378                        return Ok(ParsedPage {
379                            obj_ref: kid_ref,
380                            dict: PdfDictionary::new(),
381                            inherited_resources: Some(merged_inherited.clone()),
382                            media_box: [0.0, 0.0, 612.0, 792.0],
383                            crop_box: None,
384                            rotation: 0,
385                            annotations: None,
386                        });
387                    }
388
389                    current_index += count;
390                }
391
392                Err(ParseError::SyntaxError {
393                    position: 0,
394                    message: "Page not found in tree".to_string(),
395                })
396            }
397            "Page" => {
398                // This is a page object
399                if target_index != 0 {
400                    return Err(ParseError::SyntaxError {
401                        position: 0,
402                        message: "Page index mismatch".to_string(),
403                    });
404                }
405
406                // Use the object reference passed as parameter
407                let obj_ref = node_ref;
408
409                // Extract page attributes
410                let media_box =
411                    Self::get_rectangle(node, inherited, "MediaBox")?.unwrap_or_else(|| {
412                        // Use default Letter size if MediaBox is missing
413                        #[cfg(debug_assertions)]
414                        tracing::debug!(
415                            "Warning: Page {} {} R missing MediaBox, using default Letter size",
416                            obj_ref.0,
417                            obj_ref.1
418                        );
419                        [0.0, 0.0, 612.0, 792.0]
420                    });
421
422                let crop_box = Self::get_rectangle(node, inherited, "CropBox")?;
423
424                let rotation = Self::get_integer(node, inherited, "Rotate")?.unwrap_or(0) as i32;
425
426                // Get resources
427                let inherited_resources = if let Some(inherited) = inherited {
428                    inherited
429                        .get("Resources")
430                        .and_then(|r| r.as_dict())
431                        .cloned()
432                } else {
433                    None
434                };
435
436                // Get annotations if present
437                let annotations = node.get("Annots").and_then(|obj| obj.as_array()).cloned();
438
439                Ok(ParsedPage {
440                    obj_ref,
441                    dict: node.clone(),
442                    inherited_resources,
443                    media_box,
444                    crop_box,
445                    rotation,
446                    annotations,
447                })
448            }
449            _ => Err(ParseError::SyntaxError {
450                position: 0,
451                message: format!("Invalid page tree node type: {node_type}"),
452            }),
453        }
454    }
455
456    /// Get a rectangle value, checking both node and inherited dictionaries
457    #[allow(dead_code)]
458    fn get_rectangle(
459        node: &PdfDictionary,
460        inherited: Option<&PdfDictionary>,
461        key: &str,
462    ) -> ParseResult<Option<[f64; 4]>> {
463        let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
464
465        if let Some(array) = array.and_then(|obj| obj.as_array()) {
466            if array.len() != 4 {
467                return Err(ParseError::SyntaxError {
468                    position: 0,
469                    message: format!("{key} must have 4 elements"),
470                });
471            }
472
473            // Safe: array length is guaranteed to be 4 after validation above
474            let rect = [
475                array.0[0].as_real().unwrap_or(0.0),
476                array.0[1].as_real().unwrap_or(0.0),
477                array.0[2].as_real().unwrap_or(0.0),
478                array.0[3].as_real().unwrap_or(0.0),
479            ];
480
481            Ok(Some(rect))
482        } else {
483            Ok(None)
484        }
485    }
486
487    /// Get an integer value, checking both node and inherited dictionaries
488    #[allow(dead_code)]
489    fn get_integer(
490        node: &PdfDictionary,
491        inherited: Option<&PdfDictionary>,
492        key: &str,
493    ) -> ParseResult<Option<i64>> {
494        let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
495
496        Ok(value.and_then(|obj| obj.as_integer()))
497    }
498}
499
500impl ParsedPage {
501    /// Get the effective page width accounting for rotation.
502    ///
503    /// The width is calculated from the MediaBox and adjusted based on the page rotation.
504    /// For 90° or 270° rotations, the width and height are swapped.
505    ///
506    /// # Returns
507    ///
508    /// The page width in PDF units (typically points, where 1 point = 1/72 inch)
509    ///
510    /// # Example
511    ///
512    /// ```rust,no_run
513    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
514    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
515    /// # let reader = PdfReader::open("document.pdf")?;
516    /// # let document = PdfDocument::new(reader);
517    /// let page = document.get_page(0)?;
518    /// let width_pts = page.width();
519    /// let width_inches = width_pts / 72.0;
520    /// let width_mm = width_pts * 25.4 / 72.0;
521    /// println!("Page width: {} points ({:.2} inches, {:.2} mm)", width_pts, width_inches, width_mm);
522    /// # Ok(())
523    /// # }
524    /// ```
525    pub fn width(&self) -> f64 {
526        match self.rotation {
527            90 | 270 => self.media_box[3] - self.media_box[1],
528            _ => self.media_box[2] - self.media_box[0],
529        }
530    }
531
532    /// Get the effective page height accounting for rotation.
533    ///
534    /// The height is calculated from the MediaBox and adjusted based on the page rotation.
535    /// For 90° or 270° rotations, the width and height are swapped.
536    ///
537    /// # Returns
538    ///
539    /// The page height in PDF units (typically points, where 1 point = 1/72 inch)
540    ///
541    /// # Example
542    ///
543    /// ```rust,no_run
544    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
545    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
546    /// # let reader = PdfReader::open("document.pdf")?;
547    /// # let document = PdfDocument::new(reader);
548    /// let page = document.get_page(0)?;
549    /// println!("Page dimensions: {}x{} points", page.width(), page.height());
550    /// if page.rotation != 0 {
551    ///     println!("Page is rotated {} degrees", page.rotation);
552    /// }
553    /// # Ok(())
554    /// # }
555    /// ```
556    pub fn height(&self) -> f64 {
557        match self.rotation {
558            90 | 270 => self.media_box[2] - self.media_box[0],
559            _ => self.media_box[3] - self.media_box[1],
560        }
561    }
562
563    /// Get the content streams for this page using a PdfReader.
564    ///
565    /// Content streams contain the actual drawing instructions (operators) that render
566    /// text, graphics, and images on the page. A page may have multiple content streams
567    /// which are concatenated during rendering.
568    ///
569    /// # Arguments
570    ///
571    /// * `reader` - Mutable reference to the PDF reader
572    ///
573    /// # Returns
574    ///
575    /// A vector of decompressed content stream data. Each vector contains the raw bytes
576    /// of a content stream ready for parsing.
577    ///
578    /// # Errors
579    ///
580    /// Returns an error if:
581    /// - The Contents entry is malformed
582    /// - Stream decompression fails
583    /// - Referenced objects cannot be resolved
584    ///
585    /// # Example
586    ///
587    /// ```rust,no_run
588    /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
589    /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
590    /// let streams = page.content_streams(reader)?;
591    /// for (i, stream) in streams.iter().enumerate() {
592    ///     println!("Content stream {}: {} bytes", i, stream.len());
593    /// }
594    /// # Ok(())
595    /// # }
596    /// ```
597    pub fn content_streams<R: Read + Seek>(
598        &self,
599        reader: &mut PdfReader<R>,
600    ) -> ParseResult<Vec<Vec<u8>>> {
601        let mut streams = Vec::new();
602
603        if let Some(contents) = self.dict.get("Contents") {
604            // First resolve contents to check its type
605            let contents_type = match contents {
606                PdfObject::Reference(obj_num, gen_num) => {
607                    let resolved = reader.get_object(*obj_num, *gen_num)?;
608                    match resolved {
609                        PdfObject::Stream(_) => "stream",
610                        PdfObject::Array(_) => "array",
611                        _ => "other",
612                    }
613                }
614                PdfObject::Stream(_) => "stream",
615                PdfObject::Array(_) => "array",
616                _ => "other",
617            };
618
619            let options = reader.options().clone();
620            match contents_type {
621                "stream" => {
622                    let resolved = reader.resolve(contents)?;
623                    if let PdfObject::Stream(stream) = resolved {
624                        streams.push(stream.decode(&options)?);
625                    }
626                }
627                "array" => {
628                    // Get array references first
629                    let refs: Vec<(u32, u16)> = {
630                        let resolved = reader.resolve(contents)?;
631                        if let PdfObject::Array(array) = resolved {
632                            array
633                                .0
634                                .iter()
635                                .filter_map(|obj| {
636                                    if let PdfObject::Reference(num, gen) = obj {
637                                        Some((*num, *gen))
638                                    } else {
639                                        None
640                                    }
641                                })
642                                .collect()
643                        } else {
644                            Vec::new()
645                        }
646                    };
647
648                    // Now resolve each reference
649                    for (obj_num, gen_num) in refs {
650                        let obj = reader.get_object(obj_num, gen_num)?;
651                        if let PdfObject::Stream(stream) = obj {
652                            streams.push(stream.decode(&options)?);
653                        }
654                    }
655                }
656                _ => {
657                    return Err(ParseError::SyntaxError {
658                        position: 0,
659                        message: "Contents must be a stream or array of streams".to_string(),
660                    })
661                }
662            }
663        }
664
665        Ok(streams)
666    }
667
668    /// Get content streams using PdfDocument (recommended method).
669    ///
670    /// This is the preferred method for accessing content streams as it uses the
671    /// document's caching and resource management capabilities.
672    ///
673    /// # Arguments
674    ///
675    /// * `document` - Reference to the PDF document
676    ///
677    /// # Returns
678    ///
679    /// A vector of decompressed content stream data ready for parsing with `ContentParser`.
680    ///
681    /// # Example
682    ///
683    /// ```rust,no_run
684    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
685    /// # use oxidize_pdf::parser::content::ContentParser;
686    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
687    /// let reader = PdfReader::open("document.pdf")?;
688    /// let document = PdfDocument::new(reader);
689    /// let page = document.get_page(0)?;
690    ///
691    /// // Get content streams
692    /// let streams = page.content_streams_with_document(&document)?;
693    ///
694    /// // Parse each stream
695    /// for stream_data in streams {
696    ///     let operations = ContentParser::parse_content(&stream_data)?;
697    ///     println!("Stream has {} operations", operations.len());
698    /// }
699    /// # Ok(())
700    /// # }
701    /// ```
702    pub fn content_streams_with_document<R: Read + Seek>(
703        &self,
704        document: &PdfDocument<R>,
705    ) -> ParseResult<Vec<Vec<u8>>> {
706        document.get_page_content_streams(self)
707    }
708
709    /// Get the effective resources for this page (including inherited).
710    ///
711    /// Resources include fonts, images (XObjects), color spaces, patterns, and other
712    /// assets needed to render the page. This method returns page-specific resources
713    /// if present, otherwise falls back to inherited resources from parent nodes.
714    ///
715    /// # Returns
716    ///
717    /// The Resources dictionary if available, or None if the page has no resources.
718    ///
719    /// # Resource Categories
720    ///
721    /// The Resources dictionary may contain:
722    /// - `Font` - Font definitions used by text operators
723    /// - `XObject` - External objects (images, form XObjects)
724    /// - `ColorSpace` - Color space definitions
725    /// - `Pattern` - Pattern definitions for fills
726    /// - `Shading` - Shading dictionaries
727    /// - `ExtGState` - Graphics state parameter dictionaries
728    /// - `Properties` - Property list dictionaries
729    ///
730    /// # Example
731    ///
732    /// ```rust,no_run
733    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
734    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
735    /// # let reader = PdfReader::open("document.pdf")?;
736    /// # let document = PdfDocument::new(reader);
737    /// # let page = document.get_page(0)?;
738    /// if let Some(resources) = page.get_resources() {
739    ///     // Check for fonts
740    ///     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
741    ///         println!("Page uses {} fonts", fonts.0.len());
742    ///     }
743    ///     
744    ///     // Check for images
745    ///     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
746    ///         println!("Page has {} XObjects", xobjects.0.len());
747    ///     }
748    /// }
749    /// # Ok(())
750    /// # }
751    /// ```
752    pub fn get_contents(&self) -> Option<&PdfObject> {
753        self.dict.get("Contents")
754    }
755
756    pub fn get_resources(&self) -> Option<&PdfDictionary> {
757        self.dict
758            .get("Resources")
759            .and_then(|r| r.as_dict())
760            .or(self.inherited_resources.as_ref())
761    }
762
763    /// Clone this page with all inherited resources merged into the page dictionary.
764    ///
765    /// This is useful when extracting a page for separate processing or when you need
766    /// a self-contained page object with all resources explicitly included.
767    ///
768    /// # Returns
769    ///
770    /// A cloned page with inherited resources merged into the Resources entry
771    /// of the page dictionary.
772    ///
773    /// # Example
774    ///
775    /// ```rust,no_run
776    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
777    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
778    /// # let reader = PdfReader::open("document.pdf")?;
779    /// # let document = PdfDocument::new(reader);
780    /// # let page = document.get_page(0)?;
781    /// // Get a self-contained page with all resources
782    /// let standalone_page = page.clone_with_resources();
783    ///
784    /// // The cloned page now has all resources in its dictionary
785    /// assert!(standalone_page.dict.contains_key("Resources"));
786    /// # Ok(())
787    /// # }
788    /// ```
789    pub fn clone_with_resources(&self) -> Self {
790        let mut cloned = self.clone();
791
792        // Merge inherited resources into the page dictionary if needed
793        if let Some(inherited) = &self.inherited_resources {
794            if !cloned.dict.contains_key("Resources") {
795                cloned.dict.insert(
796                    "Resources".to_string(),
797                    PdfObject::Dictionary(inherited.clone()),
798                );
799            }
800        }
801
802        cloned
803    }
804
805    /// Get the annotations array for this page.
806    ///
807    /// Returns a reference to the annotations array if present.
808    /// Each element in the array is typically a reference to an annotation dictionary.
809    ///
810    /// # Example
811    ///
812    /// ```rust,no_run
813    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
814    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
815    /// # let reader = PdfReader::open("document.pdf")?;
816    /// # let document = PdfDocument::new(reader);
817    /// # let page = document.get_page(0)?;
818    /// if let Some(annots) = page.get_annotations() {
819    ///     println!("Page has {} annotations", annots.len());
820    /// }
821    /// # Ok(())
822    /// # }
823    /// ```
824    pub fn get_annotations(&self) -> Option<&PdfArray> {
825        self.annotations.as_ref()
826    }
827
828    /// Check if the page has annotations.
829    ///
830    /// # Returns
831    ///
832    /// `true` if the page has an annotations array with at least one annotation,
833    /// `false` otherwise.
834    ///
835    /// # Example
836    ///
837    /// ```rust,no_run
838    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
839    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
840    /// # let reader = PdfReader::open("document.pdf")?;
841    /// # let document = PdfDocument::new(reader);
842    /// # let page = document.get_page(0)?;
843    /// if page.has_annotations() {
844    ///     println!("This page contains annotations");
845    /// }
846    /// # Ok(())
847    /// # }
848    /// ```
849    pub fn has_annotations(&self) -> bool {
850        self.annotations
851            .as_ref()
852            .map(|arr| !arr.is_empty())
853            .unwrap_or(false)
854    }
855
856    /// Get all objects referenced by this page (for extraction or analysis).
857    ///
858    /// This method recursively collects all objects referenced by the page, including:
859    /// - Content streams
860    /// - Resources (fonts, images, etc.)
861    /// - Nested objects within resources
862    ///
863    /// This is useful for extracting a complete page with all its dependencies or
864    /// for analyzing the object graph of a page.
865    ///
866    /// # Arguments
867    ///
868    /// * `reader` - Mutable reference to the PDF reader
869    ///
870    /// # Returns
871    ///
872    /// A HashMap mapping object references (obj_num, gen_num) to their resolved objects.
873    ///
874    /// # Example
875    ///
876    /// ```rust,no_run
877    /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
878    /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
879    /// let referenced_objects = page.get_referenced_objects(reader)?;
880    ///
881    /// println!("Page references {} objects", referenced_objects.len());
882    /// for ((obj_num, gen_num), obj) in &referenced_objects {
883    ///     println!("  {} {} R: {:?}", obj_num, gen_num, obj);
884    /// }
885    /// # Ok(())
886    /// # }
887    /// ```
888    pub fn get_referenced_objects<R: Read + Seek>(
889        &self,
890        reader: &mut PdfReader<R>,
891    ) -> ParseResult<HashMap<(u32, u16), PdfObject>> {
892        let mut objects = HashMap::new();
893        let mut to_process = Vec::new();
894
895        // Start with Contents
896        if let Some(contents) = self.dict.get("Contents") {
897            Self::collect_references(contents, &mut to_process);
898        }
899
900        // Add Resources
901        if let Some(resources) = self.get_resources() {
902            for value in resources.0.values() {
903                Self::collect_references(value, &mut to_process);
904            }
905        }
906
907        // Process all references
908        while let Some((obj_num, gen_num)) = to_process.pop() {
909            if let std::collections::hash_map::Entry::Vacant(e) = objects.entry((obj_num, gen_num))
910            {
911                let obj = reader.get_object(obj_num, gen_num)?;
912
913                // Collect nested references
914                Self::collect_references_from_object(obj, &mut to_process);
915
916                e.insert(obj.clone());
917            }
918        }
919
920        Ok(objects)
921    }
922
923    /// Collect object references from a PDF object
924    fn collect_references(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
925        match obj {
926            PdfObject::Reference(obj_num, gen_num) => {
927                refs.push((*obj_num, *gen_num));
928            }
929            PdfObject::Array(array) => {
930                for item in &array.0 {
931                    Self::collect_references(item, refs);
932                }
933            }
934            PdfObject::Dictionary(dict) => {
935                for value in dict.0.values() {
936                    Self::collect_references(value, refs);
937                }
938            }
939            _ => {}
940        }
941    }
942
943    /// Collect references from an object (after resolution)
944    fn collect_references_from_object(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
945        match obj {
946            PdfObject::Array(array) => {
947                for item in &array.0 {
948                    Self::collect_references(item, refs);
949                }
950            }
951            PdfObject::Dictionary(dict) | PdfObject::Stream(PdfStream { dict, .. }) => {
952                for value in dict.0.values() {
953                    Self::collect_references(value, refs);
954                }
955            }
956            _ => {}
957        }
958    }
959}
960
#[cfg(test)]
mod tests {
    use super::super::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
    use super::*;
    use std::collections::HashMap;

    /// Shorthand for building a `PdfName` from a string slice.
    fn pdf_name(text: &str) -> PdfName {
        PdfName(text.to_string())
    }

    /// Dictionary containing only the `/Type /Page` entry shared by all fixtures.
    fn page_type_dict() -> PdfDictionary {
        let mut entries = HashMap::new();
        entries.insert(pdf_name("Type"), PdfObject::Name(pdf_name("Page")));
        PdfDictionary(entries)
    }

    /// Fixture: a bare A4 page (object 3 0) with a parent reference and nothing else.
    fn create_test_page() -> ParsedPage {
        let mut dict = page_type_dict();
        dict.0
            .insert(pdf_name("Parent"), PdfObject::Reference(2, 0));

        ParsedPage {
            obj_ref: (3, 0),
            dict,
            inherited_resources: None,
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        }
    }

    /// Fixture: a rotated A4 page (object 4 0) with inherited resources holding an
    /// empty `Font` dictionary, a crop box and an empty annotations array.
    fn create_test_page_with_resources() -> ParsedPage {
        let mut inherited = HashMap::new();
        inherited.insert(
            pdf_name("Font"),
            PdfObject::Dictionary(PdfDictionary(HashMap::new())),
        );

        ParsedPage {
            obj_ref: (4, 0),
            dict: page_type_dict(),
            inherited_resources: Some(PdfDictionary(inherited)),
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: Some([10.0, 10.0, 585.0, 832.0]),
            rotation: 90,
            annotations: Some(PdfArray(Vec::new())),
        }
    }

    /// A new tree records the page count and starts with an empty cache.
    #[test]
    fn test_page_tree_new() {
        let tree = PageTree::new(10);

        assert_eq!(tree.page_count, 10);
        assert_eq!(tree.pages.len(), 0);
        assert!(tree.pages_dict.is_none());
    }

    /// Constructing with a pages dictionary keeps that dictionary.
    #[test]
    fn test_page_tree_new_with_pages_dict() {
        let tree = PageTree::new_with_pages_dict(5, PdfDictionary(HashMap::new()));

        assert_eq!(tree.page_count, 5);
        assert_eq!(tree.pages.len(), 0);
        assert!(tree.pages_dict.is_some());
    }

    /// Lookups on an empty cache find nothing.
    #[test]
    fn test_get_cached_page_empty() {
        let tree = PageTree::new(10);

        assert!(tree.get_cached_page(0).is_none());
        assert!(tree.get_cached_page(5).is_none());
    }

    /// A cached page can be read back unchanged.
    #[test]
    fn test_cache_and_get_page() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());

        let cached_page = tree
            .get_cached_page(0)
            .expect("page 0 was cached just above");
        assert_eq!(cached_page.obj_ref, (3, 0));
        assert_eq!(cached_page.media_box, [0.0, 0.0, 595.0, 842.0]);
    }

    /// Several pages can be cached under distinct indices.
    #[test]
    fn test_cache_multiple_pages() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());
        tree.cache_page(1, create_test_page_with_resources());

        assert!(tree.get_cached_page(0).is_some());
        assert!(tree.get_cached_page(1).is_some());
        assert!(tree.get_cached_page(2).is_none());

        let first = tree.get_cached_page(0).unwrap();
        assert_eq!(first.rotation, 0);

        let second = tree.get_cached_page(1).unwrap();
        assert_eq!(second.rotation, 90);
    }

    /// The page count passed to the constructor is exposed as-is.
    #[test]
    fn test_get_page_count() {
        assert_eq!(PageTree::new(25).page_count, 25);
    }

    /// Clearing the cache drops every cached page.
    #[test]
    fn test_clear_cache() {
        let mut tree = PageTree::new(10);
        let template = create_test_page();

        tree.cache_page(0, template.clone());
        tree.cache_page(1, template);
        assert_eq!(tree.pages.len(), 2);

        tree.clear_cache();
        assert_eq!(tree.pages.len(), 0);
        assert!(tree.get_cached_page(0).is_none());
        assert!(tree.get_cached_page(1).is_none());
    }

    /// The resources fixture exposes the values it was built with.
    #[test]
    fn test_parsed_page_properties() {
        let page = create_test_page_with_resources();

        assert_eq!(page.obj_ref, (4, 0));
        assert_eq!(page.rotation, 90);
        assert!(page.inherited_resources.is_some());
        assert!(page.crop_box.is_some());
        assert!(page.annotations.is_some());

        let crop_box = page.crop_box.expect("fixture sets a crop box");
        assert_eq!(crop_box, [10.0, 10.0, 585.0, 832.0]);
    }

    /// A page built field by field keeps every field value.
    #[test]
    fn test_parsed_page_creation() {
        let expected_dict = PdfDictionary::new();
        let page = ParsedPage {
            obj_ref: (1, 0),
            dict: expected_dict.clone(),
            inherited_resources: None,
            media_box: [0.0, 0.0, 612.0, 792.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        };

        assert_eq!(page.obj_ref, (1, 0));
        assert_eq!(page.dict, expected_dict);
        assert!(page.inherited_resources.is_none());
        assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]); // Default US Letter
        assert!(page.crop_box.is_none());
        assert_eq!(page.rotation, 0);
        assert!(page.annotations.is_none());
    }

    /// Width and height follow the A4 media box and swap for 90/270 degree rotations.
    #[test]
    fn test_parsed_page_width_height() {
        let mut page = create_test_page();

        // (rotation, expected width, expected height), checked in this order.
        let cases = [
            (0, 595.0, 842.0),
            (90, 842.0, 595.0),
            (270, 842.0, 595.0),
            (180, 595.0, 842.0),
        ];

        for &(rotation, expected_width, expected_height) in cases.iter() {
            page.rotation = rotation;
            assert_eq!(page.width(), expected_width);
            assert_eq!(page.height(), expected_height);
        }
    }

    /// Inherited resources are returned when the page has none of its own.
    #[test]
    fn test_parsed_page_get_resources() {
        let page = create_test_page_with_resources();

        let resources = page
            .get_resources()
            .expect("fixture provides inherited resources");
        assert!(resources.contains_key("Font"));
    }

    /// The `Contents` entry is returned exactly as stored.
    #[test]
    fn test_parsed_page_get_contents() {
        let mut page = create_test_page();
        page.dict
            .insert("Contents".to_string(), PdfObject::Reference(10, 0));

        let contents = page.get_contents();
        assert!(contents.is_some());
        assert_eq!(contents, Some(&PdfObject::Reference(10, 0)));
    }

    /// An empty annotations array is still reported as present.
    #[test]
    fn test_parsed_page_get_annotations() {
        let page = create_test_page_with_resources();

        let annotations = page
            .get_annotations()
            .expect("fixture sets an annotations array");
        assert_eq!(annotations.0.len(), 0);
    }

    /// Inherited resources assigned to a page are stored unchanged.
    #[test]
    fn test_parsed_page_inherited_resources() {
        let mut parent_resources = PdfDictionary::new();
        parent_resources.insert(
            "Font".to_string(),
            PdfObject::Dictionary(PdfDictionary::new()),
        );

        let mut page = create_test_page();
        page.inherited_resources = Some(parent_resources.clone());

        assert!(page.inherited_resources.is_some());
        assert_eq!(page.inherited_resources, Some(parent_resources));
    }

    /// A crop box assigned to a page is stored coordinate by coordinate.
    #[test]
    fn test_parsed_page_with_crop_box() {
        let mut page = create_test_page();
        page.crop_box = Some([50.0, 50.0, 545.0, 792.0]);

        let [left, bottom, right, top] = page.crop_box.expect("crop box was just set");
        assert_eq!(left, 50.0);
        assert_eq!(bottom, 50.0);
        assert_eq!(right, 545.0);
        assert_eq!(top, 792.0);
    }

    /// Caching many pages keeps all of them available.
    #[test]
    fn test_page_tree_cache_overflow() {
        let mut tree = PageTree::new(100);

        (0..50).for_each(|index| {
            tree.cache_page(index, create_test_page());
        });

        assert!((0..50).all(|index| tree.get_cached_page(index).is_some()));
    }

    /// Caching under an index that is already in use replaces the earlier page.
    #[test]
    fn test_page_tree_update_cached_page() {
        let mut tree = PageTree::new(10);

        tree.cache_page(0, create_test_page());
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 0);

        let mut replacement = create_test_page();
        replacement.rotation = 180;
        tree.cache_page(0, replacement);
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 180);
    }

    /// Cloning a page copies every field.
    #[test]
    fn test_parsed_page_clone() {
        let original = create_test_page_with_resources();
        let copy = original.clone();

        assert_eq!(original.obj_ref, copy.obj_ref);
        assert_eq!(original.dict, copy.dict);
        assert_eq!(original.inherited_resources, copy.inherited_resources);
        assert_eq!(original.media_box, copy.media_box);
        assert_eq!(original.crop_box, copy.crop_box);
        assert_eq!(original.rotation, copy.rotation);
        assert_eq!(original.annotations, copy.annotations);
    }

    /// Cache lookups return `None` for uncached indices, in or out of range.
    #[test]
    fn test_page_tree_get_page_bounds() {
        let tree = PageTree::new(100);

        // First, last valid, first out-of-range and far out-of-range indices.
        for &index in [0, 99, 100, u32::MAX].iter() {
            assert!(tree.get_cached_page(index).is_none());
        }
    }
}
1270
1271#[cfg(test)]
1272#[path = "page_tree_tests.rs"]
1273mod page_tree_tests;
oxidize_pdf/parser/page_tree.rs

oxidize_pdf/parser/
page_tree.rs