oxidize_pdf/parser/page_tree.rs
1//! PDF Page Tree Parser
2//!
3//! This module handles navigation and extraction of pages from the PDF page tree structure.
4//! The page tree is a hierarchical structure that organizes pages in a PDF document,
5//! allowing for efficient access and inheritance of properties from parent nodes.
6//!
7//! # Overview
8//!
9//! The PDF page tree consists of:
10//! - **Page Tree Nodes**: Internal nodes that can contain other nodes or pages
11//! - **Page Objects**: Leaf nodes representing individual pages
12//! - **Inherited Properties**: Resources, MediaBox, CropBox, and Rotate can be inherited from parent nodes
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
18//!
19//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
20//! // Open a PDF document
21//! let reader = PdfReader::open("document.pdf")?;
22//! let document = PdfDocument::new(reader);
23//!
24//! // Get a specific page
25//! let page = document.get_page(0)?;
26//!
27//! // Access page properties
28//! println!("Page size: {}x{} points", page.width(), page.height());
29//! println!("Rotation: {}°", page.rotation);
30//!
31//! // Get page resources
32//! if let Some(resources) = page.get_resources() {
33//! println!("Page has resources");
34//! }
35//! # Ok(())
36//! # }
37//! ```
38
39use super::document::PdfDocument;
40use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfStream};
41use super::reader::PdfReader;
42use super::{ParseError, ParseResult};
43use std::collections::HashMap;
44use std::io::{Read, Seek};
45
/// Represents a single page in the PDF with all its properties and resources.
///
/// A `ParsedPage` contains all the information needed to render or analyze a PDF page,
/// including its dimensions, content streams, resources, and inherited properties from
/// parent page tree nodes.
///
/// # Fields
///
/// * `obj_ref` - Object reference (object number, generation number) pointing to this page in the PDF
/// * `dict` - Complete page dictionary containing all page-specific entries
/// * `inherited_resources` - Resources inherited from parent page tree nodes
/// * `media_box` - Page dimensions in PDF units [llx, lly, urx, ury]
/// * `crop_box` - Optional visible area of the page
/// * `rotation` - Page rotation in degrees (0, 90, 180, or 270)
/// * `annotations` - Optional annotations array taken from the page's `/Annots` entry
///
/// # Example
///
/// ```rust,no_run
/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let reader = PdfReader::open("document.pdf")?;
/// let document = PdfDocument::new(reader);
/// let page = document.get_page(0)?;
///
/// // Access page properties
/// let (obj_num, gen_num) = page.obj_ref;
/// println!("Page object: {} {} R", obj_num, gen_num);
///
/// // Get page dimensions
/// let [llx, lly, urx, ury] = page.media_box;
/// println!("MediaBox: ({}, {}) to ({}, {})", llx, lly, urx, ury);
///
/// // Check for content
/// if let Some(contents) = page.dict.get("Contents") {
///     println!("Page has content streams");
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ParsedPage {
    /// Object reference to this page in the form (object_number, generation_number).
    /// This uniquely identifies the page object in the PDF file.
    pub obj_ref: (u32, u16),

    /// Page dictionary containing all page-specific entries like Contents, Resources, etc.
    /// This is the raw PDF dictionary for the page object.
    pub dict: PdfDictionary,

    /// Resources inherited from parent page tree nodes.
    /// `get_resources` only falls back to this when the page dictionary has no
    /// usable `Resources` entry of its own.
    pub inherited_resources: Option<PdfDictionary>,

    /// MediaBox defining the page dimensions in PDF units (typically points).
    /// Format: [lower_left_x, lower_left_y, upper_right_x, upper_right_y]
    pub media_box: [f64; 4],

    /// CropBox defining the visible area of the page.
    /// If None, the entire MediaBox is visible.
    pub crop_box: Option<[f64; 4]>,

    /// Page rotation in degrees. Valid values are 0, 90, 180, or 270.
    /// The rotation is applied clockwise; `width()` and `height()` swap the
    /// MediaBox axes for quarter-turn rotations.
    pub rotation: i32,

    /// Annotations array containing references to annotation objects.
    /// This is parsed from the page's /Annots entry.
    pub annotations: Option<PdfArray>,
}
116
/// Page tree navigator.
///
/// Holds the total page count supplied at construction together with a cache
/// of already-parsed pages keyed by zero-based page index.
pub struct PageTree {
    /// Total number of pages, as passed to the constructor.
    page_count: u32,
    /// Cached pages keyed by zero-based page index.
    pages: HashMap<u32, ParsedPage>,
    /// Root pages dictionary (for navigation); `None` when built with `new`.
    // No non-test code visible in this file reads the field, hence the allowance.
    #[allow(dead_code)]
    pages_dict: Option<PdfDictionary>,
}
127
128impl PageTree {
129 /// Create a new page tree navigator
130 pub fn new(page_count: u32) -> Self {
131 Self {
132 page_count,
133 pages: HashMap::new(),
134 pages_dict: None,
135 }
136 }
137
138 /// Create a new page tree navigator with pages dictionary
139 pub fn new_with_pages_dict(page_count: u32, pages_dict: PdfDictionary) -> Self {
140 Self {
141 page_count,
142 pages: HashMap::new(),
143 pages_dict: Some(pages_dict),
144 }
145 }
146
147 /// Get a cached page by index (0-based)
148 pub fn get_cached_page(&self, index: u32) -> Option<&ParsedPage> {
149 self.pages.get(&index)
150 }
151
152 /// Cache a page
153 pub fn cache_page(&mut self, index: u32, page: ParsedPage) {
154 self.pages.insert(index, page);
155 }
156
157 /// Clear all cached pages
158 pub fn clear_cache(&mut self) {
159 self.pages.clear();
160 }
161
162 /// Get the total page count
163 pub fn page_count(&self) -> u32 {
164 self.page_count
165 }
166
167 /// Load a specific page by traversing the page tree
168 ///
169 /// Note: This method is currently not fully implemented due to architectural constraints
170 /// with recursive page tree traversal and borrow checker issues.
171 #[allow(dead_code)]
172 fn load_page_at_index<R: Read + Seek>(
173 &self,
174 reader: &mut PdfReader<R>,
175 node: &PdfDictionary,
176 node_ref: (u32, u16),
177 target_index: u32,
178 inherited: Option<&PdfDictionary>,
179 ) -> ParseResult<ParsedPage> {
180 let node_type = node
181 .get_type()
182 .or_else(|| {
183 // If Type is missing, try to infer from content
184 if node.contains_key("Kids") && node.contains_key("Count") {
185 Some("Pages")
186 } else if node.contains_key("Contents") || node.contains_key("MediaBox") {
187 Some("Page")
188 } else {
189 None
190 }
191 })
192 .or_else(|| {
193 // If Type is missing and we have lenient parsing, try to infer
194 let lenient_syntax = reader.options().lenient_syntax;
195 let collect_warnings = reader.options().collect_warnings;
196
197 if lenient_syntax || collect_warnings {
198 // If it has Kids, it's likely a Pages node
199 if node.contains_key("Kids") {
200 if collect_warnings {
201 tracing::debug!(
202 "Warning: Inferred Type=Pages for object {} {} R (missing Type field, has Kids)",
203 node_ref.0, node_ref.1
204 );
205 }
206 Some("Pages")
207 }
208 // If it has Contents or MediaBox but no Kids, it's likely a Page
209 else if node.contains_key("Contents")
210 || (node.contains_key("MediaBox") && !node.contains_key("Kids"))
211 {
212 if collect_warnings {
213 tracing::debug!(
214 "Warning: Inferred Type=Page for object {} {} R (missing Type field, has Contents/MediaBox)",
215 node_ref.0, node_ref.1
216 );
217 }
218 Some("Page")
219 } else {
220 None
221 }
222 } else {
223 None
224 }
225 })
226 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
227
228 match node_type {
229 "Pages" => {
230 // This is a page tree node
231 let kids = node
232 .get("Kids")
233 .and_then(|obj| obj.as_array())
234 .or_else(|| {
235 // If Kids is missing and we have lenient parsing, use empty array
236 if reader.options().lenient_syntax {
237 if reader.options().collect_warnings {
238 tracing::debug!(
239 "Warning: Missing Kids array in Pages node, using empty array"
240 );
241 }
242 Some(&super::objects::EMPTY_PDF_ARRAY)
243 } else {
244 None
245 }
246 })
247 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
248
249 // Merge inherited attributes
250 let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
251
252 // Inheritable attributes: Resources, MediaBox, CropBox, Rotate
253 if let Some(resources) = node.get("Resources") {
254 if !merged_inherited.contains_key("Resources") {
255 merged_inherited.insert("Resources".to_string(), resources.clone());
256 }
257 }
258 if let Some(media_box) = node.get("MediaBox") {
259 if !merged_inherited.contains_key("MediaBox") {
260 merged_inherited.insert("MediaBox".to_string(), media_box.clone());
261 }
262 }
263 if let Some(crop_box) = node.get("CropBox") {
264 if !merged_inherited.contains_key("CropBox") {
265 merged_inherited.insert("CropBox".to_string(), crop_box.clone());
266 }
267 }
268 if let Some(rotate) = node.get("Rotate") {
269 if !merged_inherited.contains_key("Rotate") {
270 merged_inherited.insert("Rotate".to_string(), rotate.clone());
271 }
272 }
273
274 // Find which kid contains our target page
275 let mut current_index = 0;
276 for kid_ref in &kids.0 {
277 let kid_ref =
278 kid_ref
279 .as_reference()
280 .ok_or_else(|| ParseError::SyntaxError {
281 position: 0,
282 message: "Kids array must contain references".to_string(),
283 })?;
284
285 // Get the kid object info first
286 let (_kid_type, count, is_target) = {
287 // Cache parse options to avoid borrow checker issues
288 let lenient_syntax = reader.options().lenient_syntax;
289 let collect_warnings = reader.options().collect_warnings;
290
291 let kid_obj = reader.get_object(kid_ref.0, kid_ref.1)?;
292 let kid_dict =
293 kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
294 position: 0,
295 message: "Page tree node must be a dictionary".to_string(),
296 })?;
297
298 let kid_type = kid_dict
299 .get_type()
300 .or_else(|| {
301 // If Type is missing, try to infer from content
302 if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
303 Some("Pages")
304 } else if kid_dict.contains_key("Contents")
305 || kid_dict.contains_key("MediaBox")
306 {
307 Some("Page")
308 } else {
309 None
310 }
311 })
312 .or_else(|| {
313 // Additional inference for reconstructed/corrupted objects
314 if lenient_syntax || collect_warnings {
315 // If it has Kids, it's likely a Pages node
316 if kid_dict.contains_key("Kids") {
317 if collect_warnings {
318 tracing::debug!(
319 "Warning: Inferred Type=Pages for object {} 0 R (missing Type field, has Kids)",
320 kid_ref.0
321 );
322 }
323 Some("Pages")
324 }
325 // If it has Contents or MediaBox but no Kids, it's likely a Page
326 else if kid_dict.contains_key("Contents")
327 || (kid_dict.contains_key("MediaBox") && !kid_dict.contains_key("Kids"))
328 {
329 if collect_warnings {
330 tracing::debug!(
331 "Warning: Inferred Type=Page for object {} 0 R (missing Type field, has Contents/MediaBox)",
332 kid_ref.0
333 );
334 }
335 Some("Page")
336 } else {
337 None
338 }
339 } else {
340 None
341 }
342 })
343 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
344
345 let count = if kid_type == "Pages" {
346 // This is another page tree node
347 if let Some(count_obj) = kid_dict.get("Count") {
348 count_obj.as_integer().unwrap_or(0) as u32
349 } else {
350 // Missing Count - use size of Kids array as approximation
351 if let Some(nested_kids_obj) = kid_dict.get("Kids") {
352 if let Some(nested_kids_array) = nested_kids_obj.as_array() {
353 // Use array length as page count approximation
354 nested_kids_array.0.len() as u32
355 } else {
356 1 // Default if Kids is not an array
357 }
358 } else {
359 1 // Default if no Kids array
360 }
361 }
362 } else {
363 // This is a page
364 1
365 };
366
367 let is_target = target_index < current_index + count;
368 (kid_type.to_string(), count, is_target)
369 };
370
371 if is_target {
372 // Found the right subtree/page
373 // Due to borrow checker constraints with recursive calls,
374 // we return a placeholder page for now.
375 // A proper implementation would require refactoring the page tree
376 // traversal to use an iterative approach instead of recursion.
377
378 return Ok(ParsedPage {
379 obj_ref: kid_ref,
380 dict: PdfDictionary::new(),
381 inherited_resources: Some(merged_inherited.clone()),
382 media_box: [0.0, 0.0, 612.0, 792.0],
383 crop_box: None,
384 rotation: 0,
385 annotations: None,
386 });
387 }
388
389 current_index += count;
390 }
391
392 Err(ParseError::SyntaxError {
393 position: 0,
394 message: "Page not found in tree".to_string(),
395 })
396 }
397 "Page" => {
398 // This is a page object
399 if target_index != 0 {
400 return Err(ParseError::SyntaxError {
401 position: 0,
402 message: "Page index mismatch".to_string(),
403 });
404 }
405
406 // Use the object reference passed as parameter
407 let obj_ref = node_ref;
408
409 // Extract page attributes
410 let media_box =
411 Self::get_rectangle(node, inherited, "MediaBox")?.unwrap_or_else(|| {
412 // Use default Letter size if MediaBox is missing
413 #[cfg(debug_assertions)]
414 tracing::debug!(
415 "Warning: Page {} {} R missing MediaBox, using default Letter size",
416 obj_ref.0,
417 obj_ref.1
418 );
419 [0.0, 0.0, 612.0, 792.0]
420 });
421
422 let crop_box = Self::get_rectangle(node, inherited, "CropBox")?;
423
424 let rotation = Self::get_integer(node, inherited, "Rotate")?.unwrap_or(0) as i32;
425
426 // Get resources
427 let inherited_resources = if let Some(inherited) = inherited {
428 inherited
429 .get("Resources")
430 .and_then(|r| r.as_dict())
431 .cloned()
432 } else {
433 None
434 };
435
436 // Get annotations if present
437 let annotations = node.get("Annots").and_then(|obj| obj.as_array()).cloned();
438
439 Ok(ParsedPage {
440 obj_ref,
441 dict: node.clone(),
442 inherited_resources,
443 media_box,
444 crop_box,
445 rotation,
446 annotations,
447 })
448 }
449 _ => Err(ParseError::SyntaxError {
450 position: 0,
451 message: format!("Invalid page tree node type: {node_type}"),
452 }),
453 }
454 }
455
456 /// Get a rectangle value, checking both node and inherited dictionaries
457 #[allow(dead_code)]
458 fn get_rectangle(
459 node: &PdfDictionary,
460 inherited: Option<&PdfDictionary>,
461 key: &str,
462 ) -> ParseResult<Option<[f64; 4]>> {
463 let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
464
465 if let Some(array) = array.and_then(|obj| obj.as_array()) {
466 if array.len() != 4 {
467 return Err(ParseError::SyntaxError {
468 position: 0,
469 message: format!("{key} must have 4 elements"),
470 });
471 }
472
473 // Safe: array length is guaranteed to be 4 after validation above
474 let rect = [
475 array.0[0].as_real().unwrap_or(0.0),
476 array.0[1].as_real().unwrap_or(0.0),
477 array.0[2].as_real().unwrap_or(0.0),
478 array.0[3].as_real().unwrap_or(0.0),
479 ];
480
481 Ok(Some(rect))
482 } else {
483 Ok(None)
484 }
485 }
486
487 /// Get an integer value, checking both node and inherited dictionaries
488 #[allow(dead_code)]
489 fn get_integer(
490 node: &PdfDictionary,
491 inherited: Option<&PdfDictionary>,
492 key: &str,
493 ) -> ParseResult<Option<i64>> {
494 let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
495
496 Ok(value.and_then(|obj| obj.as_integer()))
497 }
498}
499
500impl ParsedPage {
501 /// Get the effective page width accounting for rotation.
502 ///
503 /// The width is calculated from the MediaBox and adjusted based on the page rotation.
504 /// For 90° or 270° rotations, the width and height are swapped.
505 ///
506 /// # Returns
507 ///
508 /// The page width in PDF units (typically points, where 1 point = 1/72 inch)
509 ///
510 /// # Example
511 ///
512 /// ```rust,no_run
513 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
514 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
515 /// # let reader = PdfReader::open("document.pdf")?;
516 /// # let document = PdfDocument::new(reader);
517 /// let page = document.get_page(0)?;
518 /// let width_pts = page.width();
519 /// let width_inches = width_pts / 72.0;
520 /// let width_mm = width_pts * 25.4 / 72.0;
521 /// println!("Page width: {} points ({:.2} inches, {:.2} mm)", width_pts, width_inches, width_mm);
522 /// # Ok(())
523 /// # }
524 /// ```
525 pub fn width(&self) -> f64 {
526 match self.rotation {
527 90 | 270 => self.media_box[3] - self.media_box[1],
528 _ => self.media_box[2] - self.media_box[0],
529 }
530 }
531
532 /// Get the effective page height accounting for rotation.
533 ///
534 /// The height is calculated from the MediaBox and adjusted based on the page rotation.
535 /// For 90° or 270° rotations, the width and height are swapped.
536 ///
537 /// # Returns
538 ///
539 /// The page height in PDF units (typically points, where 1 point = 1/72 inch)
540 ///
541 /// # Example
542 ///
543 /// ```rust,no_run
544 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
545 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
546 /// # let reader = PdfReader::open("document.pdf")?;
547 /// # let document = PdfDocument::new(reader);
548 /// let page = document.get_page(0)?;
549 /// println!("Page dimensions: {}x{} points", page.width(), page.height());
550 /// if page.rotation != 0 {
551 /// println!("Page is rotated {} degrees", page.rotation);
552 /// }
553 /// # Ok(())
554 /// # }
555 /// ```
556 pub fn height(&self) -> f64 {
557 match self.rotation {
558 90 | 270 => self.media_box[2] - self.media_box[0],
559 _ => self.media_box[3] - self.media_box[1],
560 }
561 }
562
563 /// Get the content streams for this page using a PdfReader.
564 ///
565 /// Content streams contain the actual drawing instructions (operators) that render
566 /// text, graphics, and images on the page. A page may have multiple content streams
567 /// which are concatenated during rendering.
568 ///
569 /// # Arguments
570 ///
571 /// * `reader` - Mutable reference to the PDF reader
572 ///
573 /// # Returns
574 ///
575 /// A vector of decompressed content stream data. Each vector contains the raw bytes
576 /// of a content stream ready for parsing.
577 ///
578 /// # Errors
579 ///
580 /// Returns an error if:
581 /// - The Contents entry is malformed
582 /// - Stream decompression fails
583 /// - Referenced objects cannot be resolved
584 ///
585 /// # Example
586 ///
587 /// ```rust,no_run
588 /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
589 /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
590 /// let streams = page.content_streams(reader)?;
591 /// for (i, stream) in streams.iter().enumerate() {
592 /// println!("Content stream {}: {} bytes", i, stream.len());
593 /// }
594 /// # Ok(())
595 /// # }
596 /// ```
597 pub fn content_streams<R: Read + Seek>(
598 &self,
599 reader: &mut PdfReader<R>,
600 ) -> ParseResult<Vec<Vec<u8>>> {
601 let mut streams = Vec::new();
602
603 if let Some(contents) = self.dict.get("Contents") {
604 // First resolve contents to check its type
605 let contents_type = match contents {
606 PdfObject::Reference(obj_num, gen_num) => {
607 let resolved = reader.get_object(*obj_num, *gen_num)?;
608 match resolved {
609 PdfObject::Stream(_) => "stream",
610 PdfObject::Array(_) => "array",
611 _ => "other",
612 }
613 }
614 PdfObject::Stream(_) => "stream",
615 PdfObject::Array(_) => "array",
616 _ => "other",
617 };
618
619 let options = reader.options().clone();
620 match contents_type {
621 "stream" => {
622 let resolved = reader.resolve(contents)?;
623 if let PdfObject::Stream(stream) = resolved {
624 streams.push(stream.decode(&options)?);
625 }
626 }
627 "array" => {
628 // Get array references first
629 let refs: Vec<(u32, u16)> = {
630 let resolved = reader.resolve(contents)?;
631 if let PdfObject::Array(array) = resolved {
632 array
633 .0
634 .iter()
635 .filter_map(|obj| {
636 if let PdfObject::Reference(num, gen) = obj {
637 Some((*num, *gen))
638 } else {
639 None
640 }
641 })
642 .collect()
643 } else {
644 Vec::new()
645 }
646 };
647
648 // Now resolve each reference
649 for (obj_num, gen_num) in refs {
650 let obj = reader.get_object(obj_num, gen_num)?;
651 if let PdfObject::Stream(stream) = obj {
652 streams.push(stream.decode(&options)?);
653 }
654 }
655 }
656 _ => {
657 return Err(ParseError::SyntaxError {
658 position: 0,
659 message: "Contents must be a stream or array of streams".to_string(),
660 })
661 }
662 }
663 }
664
665 Ok(streams)
666 }
667
668 /// Get content streams using PdfDocument (recommended method).
669 ///
670 /// This is the preferred method for accessing content streams as it uses the
671 /// document's caching and resource management capabilities.
672 ///
673 /// # Arguments
674 ///
675 /// * `document` - Reference to the PDF document
676 ///
677 /// # Returns
678 ///
679 /// A vector of decompressed content stream data ready for parsing with `ContentParser`.
680 ///
681 /// # Example
682 ///
683 /// ```rust,no_run
684 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
685 /// # use oxidize_pdf::parser::content::ContentParser;
686 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
687 /// let reader = PdfReader::open("document.pdf")?;
688 /// let document = PdfDocument::new(reader);
689 /// let page = document.get_page(0)?;
690 ///
691 /// // Get content streams
692 /// let streams = page.content_streams_with_document(&document)?;
693 ///
694 /// // Parse each stream
695 /// for stream_data in streams {
696 /// let operations = ContentParser::parse_content(&stream_data)?;
697 /// println!("Stream has {} operations", operations.len());
698 /// }
699 /// # Ok(())
700 /// # }
701 /// ```
702 pub fn content_streams_with_document<R: Read + Seek>(
703 &self,
704 document: &PdfDocument<R>,
705 ) -> ParseResult<Vec<Vec<u8>>> {
706 document.get_page_content_streams(self)
707 }
708
    /// Get the raw `Contents` entry of the page dictionary.
    ///
    /// The value is returned exactly as stored in the dictionary and is not
    /// resolved; `content_streams` handles it as a stream, an array, or a
    /// reference to either.
    ///
    /// # Returns
    ///
    /// The `Contents` object if the page dictionary has one, otherwise `None`.
    pub fn get_contents(&self) -> Option<&PdfObject> {
        self.dict.get("Contents")
    }

    /// Get the effective resources for this page (including inherited).
    ///
    /// Resources include fonts, images (XObjects), color spaces, patterns, and other
    /// assets needed to render the page. This method returns the page's own
    /// `Resources` entry when `as_dict` accepts it, otherwise it falls back to the
    /// resources inherited from parent nodes. The two are never merged.
    ///
    /// # Returns
    ///
    /// The Resources dictionary if available, or None if the page has no resources.
    ///
    /// # Resource Categories
    ///
    /// The Resources dictionary may contain:
    /// - `Font` - Font definitions used by text operators
    /// - `XObject` - External objects (images, form XObjects)
    /// - `ColorSpace` - Color space definitions
    /// - `Pattern` - Pattern definitions for fills
    /// - `Shading` - Shading dictionaries
    /// - `ExtGState` - Graphics state parameter dictionaries
    /// - `Properties` - Property list dictionaries
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let reader = PdfReader::open("document.pdf")?;
    /// # let document = PdfDocument::new(reader);
    /// # let page = document.get_page(0)?;
    /// if let Some(resources) = page.get_resources() {
    ///     // Check for fonts
    ///     if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
    ///         println!("Page uses {} fonts", fonts.0.len());
    ///     }
    ///
    ///     // Check for images
    ///     if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
    ///         println!("Page has {} XObjects", xobjects.0.len());
    ///     }
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn get_resources(&self) -> Option<&PdfDictionary> {
        self.dict
            .get("Resources")
            // NOTE(review): presumably `as_dict` is `None` for an indirect
            // reference, in which case the inherited resources win - confirm.
            .and_then(|r| r.as_dict())
            .or(self.inherited_resources.as_ref())
    }
762
763 /// Clone this page with all inherited resources merged into the page dictionary.
764 ///
765 /// This is useful when extracting a page for separate processing or when you need
766 /// a self-contained page object with all resources explicitly included.
767 ///
768 /// # Returns
769 ///
770 /// A cloned page with inherited resources merged into the Resources entry
771 /// of the page dictionary.
772 ///
773 /// # Example
774 ///
775 /// ```rust,no_run
776 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
777 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
778 /// # let reader = PdfReader::open("document.pdf")?;
779 /// # let document = PdfDocument::new(reader);
780 /// # let page = document.get_page(0)?;
781 /// // Get a self-contained page with all resources
782 /// let standalone_page = page.clone_with_resources();
783 ///
784 /// // The cloned page now has all resources in its dictionary
785 /// assert!(standalone_page.dict.contains_key("Resources"));
786 /// # Ok(())
787 /// # }
788 /// ```
789 pub fn clone_with_resources(&self) -> Self {
790 let mut cloned = self.clone();
791
792 // Merge inherited resources into the page dictionary if needed
793 if let Some(inherited) = &self.inherited_resources {
794 if !cloned.dict.contains_key("Resources") {
795 cloned.dict.insert(
796 "Resources".to_string(),
797 PdfObject::Dictionary(inherited.clone()),
798 );
799 }
800 }
801
802 cloned
803 }
804
805 /// Get the annotations array for this page.
806 ///
807 /// Returns a reference to the annotations array if present.
808 /// Each element in the array is typically a reference to an annotation dictionary.
809 ///
810 /// # Example
811 ///
812 /// ```rust,no_run
813 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
814 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
815 /// # let reader = PdfReader::open("document.pdf")?;
816 /// # let document = PdfDocument::new(reader);
817 /// # let page = document.get_page(0)?;
818 /// if let Some(annots) = page.get_annotations() {
819 /// println!("Page has {} annotations", annots.len());
820 /// }
821 /// # Ok(())
822 /// # }
823 /// ```
824 pub fn get_annotations(&self) -> Option<&PdfArray> {
825 self.annotations.as_ref()
826 }
827
828 /// Check if the page has annotations.
829 ///
830 /// # Returns
831 ///
832 /// `true` if the page has an annotations array with at least one annotation,
833 /// `false` otherwise.
834 ///
835 /// # Example
836 ///
837 /// ```rust,no_run
838 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
839 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
840 /// # let reader = PdfReader::open("document.pdf")?;
841 /// # let document = PdfDocument::new(reader);
842 /// # let page = document.get_page(0)?;
843 /// if page.has_annotations() {
844 /// println!("This page contains annotations");
845 /// }
846 /// # Ok(())
847 /// # }
848 /// ```
849 pub fn has_annotations(&self) -> bool {
850 self.annotations
851 .as_ref()
852 .map(|arr| !arr.is_empty())
853 .unwrap_or(false)
854 }
855
856 /// Get all objects referenced by this page (for extraction or analysis).
857 ///
858 /// This method recursively collects all objects referenced by the page, including:
859 /// - Content streams
860 /// - Resources (fonts, images, etc.)
861 /// - Nested objects within resources
862 ///
863 /// This is useful for extracting a complete page with all its dependencies or
864 /// for analyzing the object graph of a page.
865 ///
866 /// # Arguments
867 ///
868 /// * `reader` - Mutable reference to the PDF reader
869 ///
870 /// # Returns
871 ///
872 /// A HashMap mapping object references (obj_num, gen_num) to their resolved objects.
873 ///
874 /// # Example
875 ///
876 /// ```rust,no_run
877 /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
878 /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
879 /// let referenced_objects = page.get_referenced_objects(reader)?;
880 ///
881 /// println!("Page references {} objects", referenced_objects.len());
882 /// for ((obj_num, gen_num), obj) in &referenced_objects {
883 /// println!(" {} {} R: {:?}", obj_num, gen_num, obj);
884 /// }
885 /// # Ok(())
886 /// # }
887 /// ```
888 pub fn get_referenced_objects<R: Read + Seek>(
889 &self,
890 reader: &mut PdfReader<R>,
891 ) -> ParseResult<HashMap<(u32, u16), PdfObject>> {
892 let mut objects = HashMap::new();
893 let mut to_process = Vec::new();
894
895 // Start with Contents
896 if let Some(contents) = self.dict.get("Contents") {
897 Self::collect_references(contents, &mut to_process);
898 }
899
900 // Add Resources
901 if let Some(resources) = self.get_resources() {
902 for value in resources.0.values() {
903 Self::collect_references(value, &mut to_process);
904 }
905 }
906
907 // Process all references
908 while let Some((obj_num, gen_num)) = to_process.pop() {
909 if let std::collections::hash_map::Entry::Vacant(e) = objects.entry((obj_num, gen_num))
910 {
911 let obj = reader.get_object(obj_num, gen_num)?;
912
913 // Collect nested references
914 Self::collect_references_from_object(obj, &mut to_process);
915
916 e.insert(obj.clone());
917 }
918 }
919
920 Ok(objects)
921 }
922
923 /// Collect object references from a PDF object
924 fn collect_references(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
925 match obj {
926 PdfObject::Reference(obj_num, gen_num) => {
927 refs.push((*obj_num, *gen_num));
928 }
929 PdfObject::Array(array) => {
930 for item in &array.0 {
931 Self::collect_references(item, refs);
932 }
933 }
934 PdfObject::Dictionary(dict) => {
935 for value in dict.0.values() {
936 Self::collect_references(value, refs);
937 }
938 }
939 _ => {}
940 }
941 }
942
943 /// Collect references from an object (after resolution)
944 fn collect_references_from_object(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
945 match obj {
946 PdfObject::Array(array) => {
947 for item in &array.0 {
948 Self::collect_references(item, refs);
949 }
950 }
951 PdfObject::Dictionary(dict) | PdfObject::Stream(PdfStream { dict, .. }) => {
952 for value in dict.0.values() {
953 Self::collect_references(value, refs);
954 }
955 }
956 _ => {}
957 }
958 }
959}
960
// Unit tests covering the page cache of `PageTree` and the fields and
// accessors of `ParsedPage`.
#[cfg(test)]
mod tests {
    use super::super::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
    use super::*;
    use std::collections::HashMap;

    /// Returns a dictionary whose only entry is `Type = Page`.
    fn page_type_dict() -> PdfDictionary {
        let mut entries = HashMap::new();
        entries.insert(
            PdfName(String::from("Type")),
            PdfObject::Name(PdfName(String::from("Page"))),
        );
        PdfDictionary(entries)
    }

    /// Builds a minimal page fixture: object reference `(3, 0)`, a dictionary
    /// with `Type = Page` and a `Parent` reference to `(2, 0)`, a 595x842
    /// media box, zero rotation and every optional field left as `None`.
    fn create_test_page() -> ParsedPage {
        let mut dict = page_type_dict();
        dict.0
            .insert(PdfName(String::from("Parent")), PdfObject::Reference(2, 0));

        ParsedPage {
            obj_ref: (3, 0),
            dict,
            inherited_resources: None,
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        }
    }

    /// Builds a richer page fixture: object reference `(4, 0)`, inherited
    /// resources holding an empty `Font` dictionary, a crop box, a rotation
    /// of 90 and an empty annotations array.
    fn create_test_page_with_resources() -> ParsedPage {
        let font_entry = (
            PdfName(String::from("Font")),
            PdfObject::Dictionary(PdfDictionary(HashMap::new())),
        );
        let resources = PdfDictionary(std::iter::once(font_entry).collect());

        ParsedPage {
            obj_ref: (4, 0),
            dict: page_type_dict(),
            inherited_resources: Some(resources),
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: Some([10.0, 10.0, 585.0, 832.0]),
            rotation: 90,
            annotations: Some(PdfArray(vec![])),
        }
    }

    /// `PageTree::new` records the page count and starts with no cached
    /// pages and no pages dictionary.
    #[test]
    fn test_page_tree_new() {
        let fresh = PageTree::new(10);

        assert_eq!(fresh.page_count, 10);
        assert!(fresh.pages.is_empty());
        assert!(fresh.pages_dict.is_none());
    }

    /// `PageTree::new_with_pages_dict` keeps the supplied dictionary and
    /// starts with an empty cache.
    #[test]
    fn test_page_tree_new_with_pages_dict() {
        let fresh = PageTree::new_with_pages_dict(5, PdfDictionary(HashMap::new()));

        assert_eq!(fresh.page_count, 5);
        assert!(fresh.pages.is_empty());
        assert!(fresh.pages_dict.is_some());
    }

    /// Looking up any index in a tree that has cached nothing yields `None`.
    #[test]
    fn test_get_cached_page_empty() {
        let tree = PageTree::new(10);

        for &index in &[0, 5] {
            assert!(tree.get_cached_page(index).is_none());
        }
    }

    /// A page stored with `cache_page` comes back from `get_cached_page`
    /// with the same object reference and media box.
    #[test]
    fn test_cache_and_get_page() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());

        let lookup = tree.get_cached_page(0);
        assert!(lookup.is_some());

        let hit = lookup.unwrap();
        assert_eq!(hit.obj_ref, (3, 0));
        assert_eq!(hit.media_box, [0.0, 0.0, 595.0, 842.0]);
    }

    /// Pages cached under different indices are kept apart and an index
    /// that was never cached stays empty.
    #[test]
    fn test_cache_multiple_pages() {
        let mut tree = PageTree::new(10);
        tree.cache_page(0, create_test_page());
        tree.cache_page(1, create_test_page_with_resources());

        assert!(tree.get_cached_page(0).is_some());
        assert!(tree.get_cached_page(1).is_some());
        assert!(tree.get_cached_page(2).is_none());

        // Each slot must hold the fixture that was stored there.
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 0);
        assert_eq!(tree.get_cached_page(1).unwrap().rotation, 90);
    }

    /// The page count given to the constructor is readable afterwards.
    #[test]
    fn test_get_page_count() {
        assert_eq!(PageTree::new(25).page_count, 25);
    }

    /// `clear_cache` drops every cached page.
    #[test]
    fn test_clear_cache() {
        let mut tree = PageTree::new(10);
        let original = create_test_page();
        let duplicate = original.clone();

        tree.cache_page(0, duplicate);
        tree.cache_page(1, original);
        assert_eq!(tree.pages.len(), 2);

        tree.clear_cache();
        assert!(tree.pages.is_empty());
        for index in 0..2 {
            assert!(tree.get_cached_page(index).is_none());
        }
    }

    /// The resource-bearing fixture exposes the values it was built with.
    #[test]
    fn test_parsed_page_properties() {
        let ParsedPage {
            obj_ref,
            rotation,
            inherited_resources,
            crop_box,
            annotations,
            ..
        } = create_test_page_with_resources();

        assert_eq!(obj_ref, (4, 0));
        assert_eq!(rotation, 90);
        assert!(inherited_resources.is_some());
        assert!(crop_box.is_some());
        assert!(annotations.is_some());

        assert_eq!(crop_box.unwrap(), [10.0, 10.0, 585.0, 832.0]);
    }

    /// A `ParsedPage` assembled field by field reports exactly those fields.
    #[test]
    fn test_parsed_page_creation() {
        let empty_dict = PdfDictionary::new();
        let page = ParsedPage {
            obj_ref: (1, 0),
            dict: empty_dict.clone(),
            inherited_resources: None,
            media_box: [0.0, 0.0, 612.0, 792.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        };

        assert_eq!(page.obj_ref, (1, 0));
        assert_eq!(page.dict, empty_dict);
        assert_eq!(page.inherited_resources, None);
        // 612x792 is the US Letter size used as the media box here.
        assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
        assert_eq!(page.crop_box, None);
        assert_eq!(page.rotation, 0);
        assert_eq!(page.annotations, None);
    }

    /// `width` and `height` swap for rotations of 90 and 270 and keep the
    /// media-box orientation for 0 and 180.
    #[test]
    fn test_parsed_page_width_height() {
        let mut page = create_test_page();

        // (rotation, expected width, expected height) for the A4 fixture.
        let cases = [
            (0, 595.0, 842.0),
            (90, 842.0, 595.0),
            (270, 842.0, 595.0),
            (180, 595.0, 842.0),
        ];

        for &(rotation, expected_width, expected_height) in &cases {
            page.rotation = rotation;
            assert_eq!(page.width(), expected_width);
            assert_eq!(page.height(), expected_height);
        }
    }

    /// `get_resources` returns the resources of the fixture, including the
    /// `Font` entry.
    #[test]
    fn test_parsed_page_get_resources() {
        let page = create_test_page_with_resources();

        let lookup = page.get_resources();
        assert!(lookup.is_some());
        assert!(lookup.unwrap().contains_key("Font"));
    }

    /// `get_contents` returns the `Contents` entry placed in the page
    /// dictionary.
    #[test]
    fn test_parsed_page_get_contents() {
        let expected = PdfObject::Reference(10, 0);
        let mut page = create_test_page();

        // Add contents to the page dictionary before querying it.
        page.dict
            .insert(String::from("Contents"), expected.clone());

        let found = page.get_contents();
        assert!(found.is_some());
        assert_eq!(found, Some(&expected));
    }

    /// `get_annotations` returns the (empty) annotations array of the
    /// fixture.
    #[test]
    fn test_parsed_page_get_annotations() {
        let page = create_test_page_with_resources();

        let found = page.get_annotations();
        assert!(found.is_some());
        assert_eq!(found.map(|array| array.0.len()), Some(0));
    }

    /// Resources assigned to `inherited_resources` are stored unchanged.
    #[test]
    fn test_parsed_page_inherited_resources() {
        let mut fonts_only = PdfDictionary::new();
        fonts_only.insert(
            String::from("Font"),
            PdfObject::Dictionary(PdfDictionary::new()),
        );

        // Directly set inherited resources on the fixture.
        let mut page = create_test_page();
        page.inherited_resources = Some(fonts_only.clone());

        assert!(page.inherited_resources.is_some());
        assert_eq!(page.inherited_resources.as_ref(), Some(&fonts_only));
    }

    /// A crop box assigned to a page reads back coordinate by coordinate.
    #[test]
    fn test_parsed_page_with_crop_box() {
        let expected = [50.0, 50.0, 545.0, 792.0];
        let mut page = create_test_page();
        page.crop_box = Some([50.0, 50.0, 545.0, 792.0]);

        // Every stored coordinate must equal the one that was assigned.
        let stored = page.crop_box.unwrap();
        for (actual, wanted) in stored.iter().zip(expected.iter()) {
            assert_eq!(actual, wanted);
        }
    }

    /// Fifty cached pages all remain retrievable.
    #[test]
    fn test_page_tree_cache_overflow() {
        let mut tree = PageTree::new(100);

        // Fill the cache with fifty entries.
        (0..50).for_each(|index| {
            tree.cache_page(index, create_test_page());
        });

        // None of them may have been dropped.
        assert!((0..50).all(|index| tree.get_cached_page(index).is_some()));
    }

    /// Caching a second page under an existing index replaces the first.
    #[test]
    fn test_page_tree_update_cached_page() {
        let mut tree = PageTree::new(10);

        tree.cache_page(0, create_test_page());
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 0);

        // Store a differently rotated page under the same index.
        let replacement = ParsedPage {
            rotation: 180,
            ..create_test_page()
        };
        tree.cache_page(0, replacement);
        assert_eq!(tree.get_cached_page(0).unwrap().rotation, 180);
    }

    /// Cloning a `ParsedPage` copies every field.
    #[test]
    fn test_parsed_page_clone() {
        let original = create_test_page_with_resources();
        let copy = original.clone();

        assert_eq!(copy.obj_ref, original.obj_ref);
        assert_eq!(copy.dict, original.dict);
        assert_eq!(copy.inherited_resources, original.inherited_resources);
        assert_eq!(copy.media_box, original.media_box);
        assert_eq!(copy.crop_box, original.crop_box);
        assert_eq!(copy.rotation, original.rotation);
        assert_eq!(copy.annotations, original.annotations);
    }

    /// `get_cached_page` returns `None` for uncached indices, whether they
    /// lie inside or outside the page count.
    #[test]
    fn test_page_tree_get_page_bounds() {
        let tree = PageTree::new(100);

        let probes = [
            0,        // not cached yet
            99,       // within bounds but not cached
            100,      // out of bounds
            u32::MAX, // way out of bounds
        ];
        for &index in &probes {
            assert!(tree.get_cached_page(index).is_none());
        }
    }
}
1270
// Test-only module whose source is loaded from `page_tree_tests.rs` through
// the `#[path]` attribute; it is compiled only when `cfg(test)` is active.
#[cfg(test)]
#[path = "page_tree_tests.rs"]
mod page_tree_tests;