oxidize_pdf/parser/page_tree.rs
1//! PDF Page Tree Parser
2//!
3//! This module handles navigation and extraction of pages from the PDF page tree structure.
4//! The page tree is a hierarchical structure that organizes pages in a PDF document,
5//! allowing for efficient access and inheritance of properties from parent nodes.
6//!
7//! # Overview
8//!
9//! The PDF page tree consists of:
10//! - **Page Tree Nodes**: Internal nodes that can contain other nodes or pages
11//! - **Page Objects**: Leaf nodes representing individual pages
12//! - **Inherited Properties**: Resources, MediaBox, CropBox, and Rotate can be inherited from parent nodes
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
18//!
19//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
20//! // Open a PDF document
21//! let reader = PdfReader::open("document.pdf")?;
22//! let document = PdfDocument::new(reader);
23//!
24//! // Get a specific page
25//! let page = document.get_page(0)?;
26//!
27//! // Access page properties
28//! println!("Page size: {}x{} points", page.width(), page.height());
29//! println!("Rotation: {}°", page.rotation);
30//!
31//! // Get page resources
32//! if let Some(resources) = page.get_resources() {
33//! println!("Page has resources");
34//! }
35//! # Ok(())
36//! # }
37//! ```
38
39use super::document::PdfDocument;
40use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfStream};
41use super::reader::PdfReader;
42use super::{ParseError, ParseResult};
43use std::collections::{HashMap, HashSet};
44use std::io::{Read, Seek};
45
/// Represents a single page in the PDF with all its properties and resources.
///
/// A `ParsedPage` bundles the page's own dictionary together with the values that
/// were resolved for it: its boxes, its rotation, its annotations array and the
/// resources it inherits from parent page tree nodes.
///
/// # Fields
///
/// * `obj_ref` - Object reference (object number, generation number) pointing to this page in the PDF
/// * `dict` - Complete page dictionary containing all page-specific entries
/// * `inherited_resources` - Resources inherited from parent page tree nodes
/// * `media_box` - Page dimensions in PDF units [llx, lly, urx, ury]
/// * `crop_box` - Optional visible area of the page
/// * `rotation` - Page rotation in degrees (0, 90, 180, or 270)
/// * `annotations` - Optional annotations array taken from the page's `/Annots` entry
///
/// # Example
///
/// ```rust,no_run
/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let reader = PdfReader::open("document.pdf")?;
/// let document = PdfDocument::new(reader);
/// let page = document.get_page(0)?;
///
/// // Access page properties
/// let (obj_num, gen_num) = page.obj_ref;
/// println!("Page object: {} {} R", obj_num, gen_num);
///
/// // Get page dimensions
/// let [llx, lly, urx, ury] = page.media_box;
/// println!("MediaBox: ({}, {}) to ({}, {})", llx, lly, urx, ury);
///
/// // Check for content
/// if let Some(contents) = page.dict.get("Contents") {
///     println!("Page has content streams");
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct ParsedPage {
    /// Object reference to this page in the form (object_number, generation_number).
    /// This uniquely identifies the page object in the PDF file.
    pub obj_ref: (u32, u16),

    /// Page dictionary containing all page-specific entries like Contents, Resources, etc.
    /// This is the raw PDF dictionary for the page object.
    pub dict: PdfDictionary,

    /// Resources inherited from parent page tree nodes.
    /// Used by `get_resources` as a fallback when the page dictionary has no usable
    /// `Resources` dictionary of its own.
    pub inherited_resources: Option<PdfDictionary>,

    /// MediaBox defining the page dimensions in PDF units (typically points).
    /// Format: [lower_left_x, lower_left_y, upper_right_x, upper_right_y]
    pub media_box: [f64; 4],

    /// CropBox defining the visible area of the page.
    /// If None, the entire MediaBox is visible.
    pub crop_box: Option<[f64; 4]>,

    /// Page rotation in degrees. Valid values are 0, 90, 180, or 270.
    /// The rotation is applied clockwise.
    pub rotation: i32,

    /// Annotations array containing references to annotation objects.
    /// This is parsed from the page's /Annots entry; `None` when the page has no
    /// such array.
    pub annotations: Option<PdfArray>,
}
116
/// Maximum number of pages to allow in a flat index.
///
/// `PageTree::flatten_page_tree` stops walking (and logs a warning) once it has
/// collected this many leaf page references, so a hostile or corrupted page tree
/// cannot grow the index without bound and exhaust memory.
const MAX_PAGES: usize = 100_000;
120
/// Page tree navigator.
///
/// Holds the document's page count, a cache of already-parsed pages keyed by their
/// 0-based index, and (when built through `new_with_flat_index`) a flat list of the
/// object references of every leaf page.
pub struct PageTree {
    /// Total number of pages. Either supplied by the caller or, for
    /// `new_with_flat_index`, derived from the length of `page_refs`.
    page_count: u32,
    /// Cached pages by 0-based page index.
    pages: HashMap<u32, ParsedPage>,
    /// Root pages dictionary (for navigation).
    /// Stored by the constructors but not read anywhere in the visible code, hence
    /// the lint suppression.
    #[allow(dead_code)]
    pages_dict: Option<PdfDictionary>,
    /// Flat index of page object references, built once during initialization.
    /// Each entry is (obj_num, gen_num) for a leaf Page node; empty when the tree was
    /// created without a flat index.
    page_refs: Vec<(u32, u16)>,
}
134
135impl PageTree {
136 /// Create a new page tree navigator
137 pub fn new(page_count: u32) -> Self {
138 Self {
139 page_count,
140 pages: HashMap::new(),
141 pages_dict: None,
142 page_refs: Vec::new(),
143 }
144 }
145
146 /// Create a new page tree navigator with pages dictionary
147 pub fn new_with_pages_dict(page_count: u32, pages_dict: PdfDictionary) -> Self {
148 Self {
149 page_count,
150 pages: HashMap::new(),
151 pages_dict: Some(pages_dict),
152 page_refs: Vec::new(),
153 }
154 }
155
156 /// Create a new page tree navigator with a pre-built flat index.
157 /// The page_count is derived from the actual number of leaf pages found.
158 pub fn new_with_flat_index(pages_dict: PdfDictionary, page_refs: Vec<(u32, u16)>) -> Self {
159 let page_count = page_refs.len() as u32;
160 Self {
161 page_count,
162 pages: HashMap::new(),
163 pages_dict: Some(pages_dict),
164 page_refs,
165 }
166 }
167
    /// Get a cached page by index (0-based).
    ///
    /// Returns `None` when no page has been stored under `index` via `cache_page`
    /// (or the cache has been cleared since). This never parses anything; it is a
    /// pure lookup in the cache map.
    pub fn get_cached_page(&self, index: u32) -> Option<&ParsedPage> {
        self.pages.get(&index)
    }
172
    /// Cache a page under its 0-based index.
    ///
    /// A page previously cached at the same index is replaced; the old value is
    /// dropped. The index is not checked against `page_count`.
    pub fn cache_page(&mut self, index: u32, page: ParsedPage) {
        self.pages.insert(index, page);
    }
177
    /// Clear all cached pages.
    ///
    /// Only the page cache is emptied; the page count, the stored pages dictionary
    /// and the flat index of page references are left untouched.
    pub fn clear_cache(&mut self) {
        self.pages.clear();
    }
182
    /// Get the total page count.
    ///
    /// This is the value fixed at construction time: either the count passed to the
    /// constructor or, for `new_with_flat_index`, the number of flattened page
    /// references.
    pub fn page_count(&self) -> u32 {
        self.page_count
    }
187
    /// Get a page object reference from the flat index by page index (0-based).
    ///
    /// Returns the `(object_number, generation_number)` pair of the page, or `None`
    /// when `index` is past the end of the flat index (which is always the case when
    /// the tree was built without one).
    pub fn get_page_ref(&self, index: u32) -> Option<(u32, u16)> {
        self.page_refs.get(index as usize).copied()
    }
192
193 /// Flatten the page tree into a `Vec<(u32, u16)>` of leaf Page object references.
194 ///
195 /// This walks the tree iteratively using an explicit stack, with:
196 /// - **Cycle detection**: `HashSet<(u32, u16)>` prevents infinite loops from circular refs
197 /// - **Page cap**: Stops at `MAX_PAGES` to prevent OOM from absurd `/Count` values
198 /// - **Type inference**: Handles missing `/Type` keys by checking for `/Kids`, `/Contents`, `/MediaBox`
199 pub fn flatten_page_tree<R: Read + Seek>(
200 reader: &mut PdfReader<R>,
201 pages_dict: &PdfDictionary,
202 ) -> ParseResult<Vec<(u32, u16)>> {
203 let mut page_refs: Vec<(u32, u16)> = Vec::new();
204 let mut visited: HashSet<(u32, u16)> = HashSet::new();
205
206 // Work stack: each entry is an object reference to process
207 let mut stack: Vec<(u32, u16)> = Vec::new();
208
209 // Seed from root Kids array
210 if let Some(kids) = pages_dict.get("Kids").and_then(|k| k.as_array()) {
211 // Push in reverse so first kid is processed first (LIFO stack)
212 for kid_obj in kids.0.iter().rev() {
213 if let Some(kid_ref) = kid_obj.as_reference() {
214 stack.push(kid_ref);
215 }
216 }
217 }
218
219 while let Some(obj_ref) = stack.pop() {
220 if page_refs.len() >= MAX_PAGES {
221 tracing::warn!("Page tree exceeds {} leaves, truncating", MAX_PAGES);
222 break;
223 }
224
225 // Cycle detection
226 if !visited.insert(obj_ref) {
227 tracing::warn!(
228 "Cycle detected at {} {} R in page tree, skipping",
229 obj_ref.0,
230 obj_ref.1
231 );
232 continue;
233 }
234
235 // Resolve the object
236 let obj = match reader.get_object(obj_ref.0, obj_ref.1) {
237 Ok(o) => o,
238 Err(e) => {
239 tracing::warn!(
240 "Failed to resolve page tree node {} {} R: {}",
241 obj_ref.0,
242 obj_ref.1,
243 e
244 );
245 continue;
246 }
247 };
248
249 let dict = match obj.as_dict() {
250 Some(d) => d,
251 None => {
252 // Check if it's a stream with a dict (some PDFs embed page data in streams)
253 if let Some(stream) = obj.as_stream() {
254 &stream.dict
255 } else {
256 continue; // Skip non-dict/non-stream nodes
257 }
258 }
259 };
260
261 // Determine node type
262 let node_type = dict.get_type().or_else(|| {
263 if dict.contains_key("Kids") {
264 Some("Pages")
265 } else if dict.contains_key("Contents") || dict.contains_key("MediaBox") {
266 Some("Page")
267 } else {
268 None
269 }
270 });
271
272 match node_type {
273 Some("Page") => {
274 page_refs.push(obj_ref);
275 }
276 Some("Pages") => {
277 if let Some(kids) = dict.get("Kids").and_then(|k| k.as_array()) {
278 // Push in reverse for correct order
279 for kid_obj in kids.0.iter().rev() {
280 if let Some(kid_ref) = kid_obj.as_reference() {
281 stack.push(kid_ref);
282 }
283 }
284 }
285 }
286 _ => {
287 // Unknown type — treat as Page if it has page-like attributes
288 if dict.contains_key("MediaBox") || dict.contains_key("Contents") {
289 page_refs.push(obj_ref);
290 }
291 // Otherwise silently skip
292 }
293 }
294 }
295
296 Ok(page_refs)
297 }
298
299 /// Load a specific page by traversing the page tree
300 ///
301 /// Note: This method is currently not fully implemented due to architectural constraints
302 /// with recursive page tree traversal and borrow checker issues.
303 #[allow(dead_code)]
304 fn load_page_at_index<R: Read + Seek>(
305 &self,
306 reader: &mut PdfReader<R>,
307 node: &PdfDictionary,
308 node_ref: (u32, u16),
309 target_index: u32,
310 inherited: Option<&PdfDictionary>,
311 ) -> ParseResult<ParsedPage> {
312 let node_type = node
313 .get_type()
314 .or_else(|| {
315 // If Type is missing, try to infer from content
316 if node.contains_key("Kids") && node.contains_key("Count") {
317 Some("Pages")
318 } else if node.contains_key("Contents") || node.contains_key("MediaBox") {
319 Some("Page")
320 } else {
321 None
322 }
323 })
324 .or_else(|| {
325 // If Type is missing and we have lenient parsing, try to infer
326 let lenient_syntax = reader.options().lenient_syntax;
327 let collect_warnings = reader.options().collect_warnings;
328
329 if lenient_syntax || collect_warnings {
330 // If it has Kids, it's likely a Pages node
331 if node.contains_key("Kids") {
332 if collect_warnings {
333 tracing::debug!(
334 "Warning: Inferred Type=Pages for object {} {} R (missing Type field, has Kids)",
335 node_ref.0, node_ref.1
336 );
337 }
338 Some("Pages")
339 }
340 // If it has Contents or MediaBox but no Kids, it's likely a Page
341 else if node.contains_key("Contents")
342 || (node.contains_key("MediaBox") && !node.contains_key("Kids"))
343 {
344 if collect_warnings {
345 tracing::debug!(
346 "Warning: Inferred Type=Page for object {} {} R (missing Type field, has Contents/MediaBox)",
347 node_ref.0, node_ref.1
348 );
349 }
350 Some("Page")
351 } else {
352 None
353 }
354 } else {
355 None
356 }
357 })
358 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
359
360 match node_type {
361 "Pages" => {
362 // This is a page tree node
363 let kids = node
364 .get("Kids")
365 .and_then(|obj| obj.as_array())
366 .or_else(|| {
367 // If Kids is missing and we have lenient parsing, use empty array
368 if reader.options().lenient_syntax {
369 if reader.options().collect_warnings {
370 tracing::debug!(
371 "Warning: Missing Kids array in Pages node, using empty array"
372 );
373 }
374 Some(&super::objects::EMPTY_PDF_ARRAY)
375 } else {
376 None
377 }
378 })
379 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
380
381 // Merge inherited attributes
382 let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
383
384 // Inheritable attributes: Resources, MediaBox, CropBox, Rotate
385 if let Some(resources) = node.get("Resources") {
386 if !merged_inherited.contains_key("Resources") {
387 merged_inherited.insert("Resources".to_string(), resources.clone());
388 }
389 }
390 if let Some(media_box) = node.get("MediaBox") {
391 if !merged_inherited.contains_key("MediaBox") {
392 merged_inherited.insert("MediaBox".to_string(), media_box.clone());
393 }
394 }
395 if let Some(crop_box) = node.get("CropBox") {
396 if !merged_inherited.contains_key("CropBox") {
397 merged_inherited.insert("CropBox".to_string(), crop_box.clone());
398 }
399 }
400 if let Some(rotate) = node.get("Rotate") {
401 if !merged_inherited.contains_key("Rotate") {
402 merged_inherited.insert("Rotate".to_string(), rotate.clone());
403 }
404 }
405
406 // Find which kid contains our target page
407 let mut current_index = 0;
408 for kid_ref in &kids.0 {
409 let kid_ref =
410 kid_ref
411 .as_reference()
412 .ok_or_else(|| ParseError::SyntaxError {
413 position: 0,
414 message: "Kids array must contain references".to_string(),
415 })?;
416
417 // Get the kid object info first
418 let (_kid_type, count, is_target) = {
419 // Cache parse options to avoid borrow checker issues
420 let lenient_syntax = reader.options().lenient_syntax;
421 let collect_warnings = reader.options().collect_warnings;
422
423 let kid_obj = reader.get_object(kid_ref.0, kid_ref.1)?;
424 let kid_dict =
425 kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
426 position: 0,
427 message: "Page tree node must be a dictionary".to_string(),
428 })?;
429
430 let kid_type = kid_dict
431 .get_type()
432 .or_else(|| {
433 // If Type is missing, try to infer from content
434 if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
435 Some("Pages")
436 } else if kid_dict.contains_key("Contents")
437 || kid_dict.contains_key("MediaBox")
438 {
439 Some("Page")
440 } else {
441 None
442 }
443 })
444 .or_else(|| {
445 // Additional inference for reconstructed/corrupted objects
446 if lenient_syntax || collect_warnings {
447 // If it has Kids, it's likely a Pages node
448 if kid_dict.contains_key("Kids") {
449 if collect_warnings {
450 tracing::debug!(
451 "Warning: Inferred Type=Pages for object {} 0 R (missing Type field, has Kids)",
452 kid_ref.0
453 );
454 }
455 Some("Pages")
456 }
457 // If it has Contents or MediaBox but no Kids, it's likely a Page
458 else if kid_dict.contains_key("Contents")
459 || (kid_dict.contains_key("MediaBox") && !kid_dict.contains_key("Kids"))
460 {
461 if collect_warnings {
462 tracing::debug!(
463 "Warning: Inferred Type=Page for object {} 0 R (missing Type field, has Contents/MediaBox)",
464 kid_ref.0
465 );
466 }
467 Some("Page")
468 } else {
469 None
470 }
471 } else {
472 None
473 }
474 })
475 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
476
477 let count = if kid_type == "Pages" {
478 // This is another page tree node
479 if let Some(count_obj) = kid_dict.get("Count") {
480 count_obj.as_integer().unwrap_or(0) as u32
481 } else {
482 // Missing Count - use size of Kids array as approximation
483 if let Some(nested_kids_obj) = kid_dict.get("Kids") {
484 if let Some(nested_kids_array) = nested_kids_obj.as_array() {
485 // Use array length as page count approximation
486 nested_kids_array.0.len() as u32
487 } else {
488 1 // Default if Kids is not an array
489 }
490 } else {
491 1 // Default if no Kids array
492 }
493 }
494 } else {
495 // This is a page
496 1
497 };
498
499 let is_target = target_index < current_index + count;
500 (kid_type.to_string(), count, is_target)
501 };
502
503 if is_target {
504 // Found the right subtree/page
505 // Due to borrow checker constraints with recursive calls,
506 // we return a placeholder page for now.
507 // A proper implementation would require refactoring the page tree
508 // traversal to use an iterative approach instead of recursion.
509
510 return Ok(ParsedPage {
511 obj_ref: kid_ref,
512 dict: PdfDictionary::new(),
513 inherited_resources: Some(merged_inherited.clone()),
514 media_box: [0.0, 0.0, 612.0, 792.0],
515 crop_box: None,
516 rotation: 0,
517 annotations: None,
518 });
519 }
520
521 current_index += count;
522 }
523
524 Err(ParseError::SyntaxError {
525 position: 0,
526 message: "Page not found in tree".to_string(),
527 })
528 }
529 "Page" => {
530 // This is a page object
531 if target_index != 0 {
532 return Err(ParseError::SyntaxError {
533 position: 0,
534 message: "Page index mismatch".to_string(),
535 });
536 }
537
538 // Use the object reference passed as parameter
539 let obj_ref = node_ref;
540
541 // Extract page attributes
542 let media_box =
543 Self::get_rectangle(node, inherited, "MediaBox")?.unwrap_or_else(|| {
544 // Use default Letter size if MediaBox is missing
545 #[cfg(debug_assertions)]
546 tracing::debug!(
547 "Warning: Page {} {} R missing MediaBox, using default Letter size",
548 obj_ref.0,
549 obj_ref.1
550 );
551 [0.0, 0.0, 612.0, 792.0]
552 });
553
554 let crop_box = Self::get_rectangle(node, inherited, "CropBox")?;
555
556 let rotation = Self::get_integer(node, inherited, "Rotate")?.unwrap_or(0) as i32;
557
558 // Get resources
559 let inherited_resources = if let Some(inherited) = inherited {
560 inherited
561 .get("Resources")
562 .and_then(|r| r.as_dict())
563 .cloned()
564 } else {
565 None
566 };
567
568 // Get annotations if present
569 let annotations = node.get("Annots").and_then(|obj| obj.as_array()).cloned();
570
571 Ok(ParsedPage {
572 obj_ref,
573 dict: node.clone(),
574 inherited_resources,
575 media_box,
576 crop_box,
577 rotation,
578 annotations,
579 })
580 }
581 _ => Err(ParseError::SyntaxError {
582 position: 0,
583 message: format!("Invalid page tree node type: {node_type}"),
584 }),
585 }
586 }
587
588 /// Get a rectangle value, checking both node and inherited dictionaries
589 #[allow(dead_code)]
590 fn get_rectangle(
591 node: &PdfDictionary,
592 inherited: Option<&PdfDictionary>,
593 key: &str,
594 ) -> ParseResult<Option<[f64; 4]>> {
595 let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
596
597 if let Some(array) = array.and_then(|obj| obj.as_array()) {
598 if array.len() != 4 {
599 return Err(ParseError::SyntaxError {
600 position: 0,
601 message: format!("{key} must have 4 elements"),
602 });
603 }
604
605 // Safe: array length is guaranteed to be 4 after validation above
606 let rect = [
607 array.0[0].as_real().unwrap_or(0.0),
608 array.0[1].as_real().unwrap_or(0.0),
609 array.0[2].as_real().unwrap_or(0.0),
610 array.0[3].as_real().unwrap_or(0.0),
611 ];
612
613 Ok(Some(rect))
614 } else {
615 Ok(None)
616 }
617 }
618
619 /// Get an integer value, checking both node and inherited dictionaries
620 #[allow(dead_code)]
621 fn get_integer(
622 node: &PdfDictionary,
623 inherited: Option<&PdfDictionary>,
624 key: &str,
625 ) -> ParseResult<Option<i64>> {
626 let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
627
628 Ok(value.and_then(|obj| obj.as_integer()))
629 }
630}
631
632impl ParsedPage {
633 /// Get the effective page width accounting for rotation.
634 ///
635 /// The width is calculated from the MediaBox and adjusted based on the page rotation.
636 /// For 90° or 270° rotations, the width and height are swapped.
637 ///
638 /// # Returns
639 ///
640 /// The page width in PDF units (typically points, where 1 point = 1/72 inch)
641 ///
642 /// # Example
643 ///
644 /// ```rust,no_run
645 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
646 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
647 /// # let reader = PdfReader::open("document.pdf")?;
648 /// # let document = PdfDocument::new(reader);
649 /// let page = document.get_page(0)?;
650 /// let width_pts = page.width();
651 /// let width_inches = width_pts / 72.0;
652 /// let width_mm = width_pts * 25.4 / 72.0;
653 /// println!("Page width: {} points ({:.2} inches, {:.2} mm)", width_pts, width_inches, width_mm);
654 /// # Ok(())
655 /// # }
656 /// ```
657 pub fn width(&self) -> f64 {
658 match self.rotation {
659 90 | 270 => self.media_box[3] - self.media_box[1],
660 _ => self.media_box[2] - self.media_box[0],
661 }
662 }
663
664 /// Get the effective page height accounting for rotation.
665 ///
666 /// The height is calculated from the MediaBox and adjusted based on the page rotation.
667 /// For 90° or 270° rotations, the width and height are swapped.
668 ///
669 /// # Returns
670 ///
671 /// The page height in PDF units (typically points, where 1 point = 1/72 inch)
672 ///
673 /// # Example
674 ///
675 /// ```rust,no_run
676 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
677 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
678 /// # let reader = PdfReader::open("document.pdf")?;
679 /// # let document = PdfDocument::new(reader);
680 /// let page = document.get_page(0)?;
681 /// println!("Page dimensions: {}x{} points", page.width(), page.height());
682 /// if page.rotation != 0 {
683 /// println!("Page is rotated {} degrees", page.rotation);
684 /// }
685 /// # Ok(())
686 /// # }
687 /// ```
688 pub fn height(&self) -> f64 {
689 match self.rotation {
690 90 | 270 => self.media_box[2] - self.media_box[0],
691 _ => self.media_box[3] - self.media_box[1],
692 }
693 }
694
695 /// Get the content streams for this page using a PdfReader.
696 ///
697 /// Content streams contain the actual drawing instructions (operators) that render
698 /// text, graphics, and images on the page. A page may have multiple content streams
699 /// which are concatenated during rendering.
700 ///
701 /// # Arguments
702 ///
703 /// * `reader` - Mutable reference to the PDF reader
704 ///
705 /// # Returns
706 ///
707 /// A vector of decompressed content stream data. Each vector contains the raw bytes
708 /// of a content stream ready for parsing.
709 ///
710 /// # Errors
711 ///
712 /// Returns an error if:
713 /// - The Contents entry is malformed
714 /// - Stream decompression fails
715 /// - Referenced objects cannot be resolved
716 ///
717 /// # Example
718 ///
719 /// ```rust,no_run
720 /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
721 /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
722 /// let streams = page.content_streams(reader)?;
723 /// for (i, stream) in streams.iter().enumerate() {
724 /// println!("Content stream {}: {} bytes", i, stream.len());
725 /// }
726 /// # Ok(())
727 /// # }
728 /// ```
729 pub fn content_streams<R: Read + Seek>(
730 &self,
731 reader: &mut PdfReader<R>,
732 ) -> ParseResult<Vec<Vec<u8>>> {
733 let mut streams = Vec::new();
734
735 if let Some(contents) = self.dict.get("Contents") {
736 // First resolve contents to check its type
737 let contents_type = match contents {
738 PdfObject::Reference(obj_num, gen_num) => {
739 let resolved = reader.get_object(*obj_num, *gen_num)?;
740 match resolved {
741 PdfObject::Stream(_) => "stream",
742 PdfObject::Array(_) => "array",
743 _ => "other",
744 }
745 }
746 PdfObject::Stream(_) => "stream",
747 PdfObject::Array(_) => "array",
748 _ => "other",
749 };
750
751 let options = reader.options().clone();
752 match contents_type {
753 "stream" => {
754 let resolved = reader.resolve(contents)?;
755 if let PdfObject::Stream(stream) = resolved {
756 streams.push(stream.decode(&options)?);
757 }
758 }
759 "array" => {
760 // Get array references first
761 let refs: Vec<(u32, u16)> = {
762 let resolved = reader.resolve(contents)?;
763 if let PdfObject::Array(array) = resolved {
764 array
765 .0
766 .iter()
767 .filter_map(|obj| {
768 if let PdfObject::Reference(num, gen) = obj {
769 Some((*num, *gen))
770 } else {
771 None
772 }
773 })
774 .collect()
775 } else {
776 Vec::new()
777 }
778 };
779
780 // Now resolve each reference
781 for (obj_num, gen_num) in refs {
782 let obj = reader.get_object(obj_num, gen_num)?;
783 if let PdfObject::Stream(stream) = obj {
784 streams.push(stream.decode(&options)?);
785 }
786 }
787 }
788 _ => {
789 return Err(ParseError::SyntaxError {
790 position: 0,
791 message: "Contents must be a stream or array of streams".to_string(),
792 })
793 }
794 }
795 }
796
797 Ok(streams)
798 }
799
    /// Get content streams using PdfDocument.
    ///
    /// This is a thin convenience wrapper: it hands the page to
    /// `PdfDocument::get_page_content_streams` and returns whatever that call
    /// produces, including its errors.
    // NOTE(review): earlier docs called this the recommended path because the
    // document layer caches and manages resources; that is not visible from this
    // file, so confirm against `PdfDocument` before relying on it.
    ///
    /// # Arguments
    ///
    /// * `document` - Reference to the PDF document
    ///
    /// # Returns
    ///
    /// A vector of content stream data, one entry per stream, suitable for parsing
    /// with `ContentParser`.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
    /// # use oxidize_pdf::parser::content::ContentParser;
    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// let reader = PdfReader::open("document.pdf")?;
    /// let document = PdfDocument::new(reader);
    /// let page = document.get_page(0)?;
    ///
    /// // Get content streams
    /// let streams = page.content_streams_with_document(&document)?;
    ///
    /// // Parse each stream
    /// for stream_data in streams {
    ///     let operations = ContentParser::parse_content(&stream_data)?;
    ///     println!("Stream has {} operations", operations.len());
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn content_streams_with_document<R: Read + Seek>(
        &self,
        document: &PdfDocument<R>,
    ) -> ParseResult<Vec<Vec<u8>>> {
        document.get_page_content_streams(self)
    }
840
841 /// Get the effective resources for this page (including inherited).
842 ///
843 /// Resources include fonts, images (XObjects), color spaces, patterns, and other
844 /// assets needed to render the page. This method returns page-specific resources
845 /// if present, otherwise falls back to inherited resources from parent nodes.
846 ///
847 /// # Returns
848 ///
849 /// The Resources dictionary if available, or None if the page has no resources.
850 ///
851 /// # Resource Categories
852 ///
853 /// The Resources dictionary may contain:
854 /// - `Font` - Font definitions used by text operators
855 /// - `XObject` - External objects (images, form XObjects)
856 /// - `ColorSpace` - Color space definitions
857 /// - `Pattern` - Pattern definitions for fills
858 /// - `Shading` - Shading dictionaries
859 /// - `ExtGState` - Graphics state parameter dictionaries
860 /// - `Properties` - Property list dictionaries
861 ///
862 /// # Example
863 ///
864 /// ```rust,no_run
865 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
866 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
867 /// # let reader = PdfReader::open("document.pdf")?;
868 /// # let document = PdfDocument::new(reader);
869 /// # let page = document.get_page(0)?;
870 /// if let Some(resources) = page.get_resources() {
871 /// // Check for fonts
872 /// if let Some(fonts) = resources.get("Font").and_then(|f| f.as_dict()) {
873 /// println!("Page uses {} fonts", fonts.0.len());
874 /// }
875 ///
876 /// // Check for images
877 /// if let Some(xobjects) = resources.get("XObject").and_then(|x| x.as_dict()) {
878 /// println!("Page has {} XObjects", xobjects.0.len());
879 /// }
880 /// }
881 /// # Ok(())
882 /// # }
883 /// ```
884 pub fn get_contents(&self) -> Option<&PdfObject> {
885 self.dict.get("Contents")
886 }
887
888 pub fn get_resources(&self) -> Option<&PdfDictionary> {
889 self.dict
890 .get("Resources")
891 .and_then(|r| r.as_dict())
892 .or(self.inherited_resources.as_ref())
893 }
894
895 /// Clone this page with all inherited resources merged into the page dictionary.
896 ///
897 /// This is useful when extracting a page for separate processing or when you need
898 /// a self-contained page object with all resources explicitly included.
899 ///
900 /// # Returns
901 ///
902 /// A cloned page with inherited resources merged into the Resources entry
903 /// of the page dictionary.
904 ///
905 /// # Example
906 ///
907 /// ```rust,no_run
908 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
909 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
910 /// # let reader = PdfReader::open("document.pdf")?;
911 /// # let document = PdfDocument::new(reader);
912 /// # let page = document.get_page(0)?;
913 /// // Get a self-contained page with all resources
914 /// let standalone_page = page.clone_with_resources();
915 ///
916 /// // The cloned page now has all resources in its dictionary
917 /// assert!(standalone_page.dict.contains_key("Resources"));
918 /// # Ok(())
919 /// # }
920 /// ```
921 pub fn clone_with_resources(&self) -> Self {
922 let mut cloned = self.clone();
923
924 // Merge inherited resources into the page dictionary if needed
925 if let Some(inherited) = &self.inherited_resources {
926 if !cloned.dict.contains_key("Resources") {
927 cloned.dict.insert(
928 "Resources".to_string(),
929 PdfObject::Dictionary(inherited.clone()),
930 );
931 }
932 }
933
934 cloned
935 }
936
    /// Get the annotations array for this page.
    ///
    /// Returns a reference to the stored annotations array, or `None` when the page
    /// has none. The array is returned as stored; its elements are not resolved
    /// here. Each element is typically a reference to an annotation dictionary.
    ///
    /// # Example
    ///
    /// ```rust,no_run
    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// # let reader = PdfReader::open("document.pdf")?;
    /// # let document = PdfDocument::new(reader);
    /// # let page = document.get_page(0)?;
    /// if let Some(annots) = page.get_annotations() {
    ///     println!("Page has {} annotations", annots.len());
    /// }
    /// # Ok(())
    /// # }
    /// ```
    pub fn get_annotations(&self) -> Option<&PdfArray> {
        self.annotations.as_ref()
    }
959
960 /// Check if the page has annotations.
961 ///
962 /// # Returns
963 ///
964 /// `true` if the page has an annotations array with at least one annotation,
965 /// `false` otherwise.
966 ///
967 /// # Example
968 ///
969 /// ```rust,no_run
970 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
971 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
972 /// # let reader = PdfReader::open("document.pdf")?;
973 /// # let document = PdfDocument::new(reader);
974 /// # let page = document.get_page(0)?;
975 /// if page.has_annotations() {
976 /// println!("This page contains annotations");
977 /// }
978 /// # Ok(())
979 /// # }
980 /// ```
981 pub fn has_annotations(&self) -> bool {
982 self.annotations
983 .as_ref()
984 .map(|arr| !arr.is_empty())
985 .unwrap_or(false)
986 }
987
988 /// Get all objects referenced by this page (for extraction or analysis).
989 ///
990 /// This method recursively collects all objects referenced by the page, including:
991 /// - Content streams
992 /// - Resources (fonts, images, etc.)
993 /// - Nested objects within resources
994 ///
995 /// This is useful for extracting a complete page with all its dependencies or
996 /// for analyzing the object graph of a page.
997 ///
998 /// # Arguments
999 ///
1000 /// * `reader` - Mutable reference to the PDF reader
1001 ///
1002 /// # Returns
1003 ///
1004 /// A HashMap mapping object references (obj_num, gen_num) to their resolved objects.
1005 ///
1006 /// # Example
1007 ///
1008 /// ```rust,no_run
1009 /// # use oxidize_pdf::parser::{PdfReader, ParsedPage};
1010 /// # fn example(page: &ParsedPage, reader: &mut PdfReader<std::fs::File>) -> Result<(), Box<dyn std::error::Error>> {
1011 /// let referenced_objects = page.get_referenced_objects(reader)?;
1012 ///
1013 /// println!("Page references {} objects", referenced_objects.len());
1014 /// for ((obj_num, gen_num), obj) in &referenced_objects {
1015 /// println!(" {} {} R: {:?}", obj_num, gen_num, obj);
1016 /// }
1017 /// # Ok(())
1018 /// # }
1019 /// ```
1020 pub fn get_referenced_objects<R: Read + Seek>(
1021 &self,
1022 reader: &mut PdfReader<R>,
1023 ) -> ParseResult<HashMap<(u32, u16), PdfObject>> {
1024 let mut objects = HashMap::new();
1025 let mut to_process = Vec::new();
1026
1027 // Start with Contents
1028 if let Some(contents) = self.dict.get("Contents") {
1029 Self::collect_references(contents, &mut to_process);
1030 }
1031
1032 // Add Resources
1033 if let Some(resources) = self.get_resources() {
1034 for value in resources.0.values() {
1035 Self::collect_references(value, &mut to_process);
1036 }
1037 }
1038
1039 // Process all references
1040 while let Some((obj_num, gen_num)) = to_process.pop() {
1041 if let std::collections::hash_map::Entry::Vacant(e) = objects.entry((obj_num, gen_num))
1042 {
1043 let obj = reader.get_object(obj_num, gen_num)?;
1044
1045 // Collect nested references
1046 Self::collect_references_from_object(obj, &mut to_process);
1047
1048 e.insert(obj.clone());
1049 }
1050 }
1051
1052 Ok(objects)
1053 }
1054
1055 /// Collect object references from a PDF object
1056 fn collect_references(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
1057 match obj {
1058 PdfObject::Reference(obj_num, gen_num) => {
1059 refs.push((*obj_num, *gen_num));
1060 }
1061 PdfObject::Array(array) => {
1062 for item in &array.0 {
1063 Self::collect_references(item, refs);
1064 }
1065 }
1066 PdfObject::Dictionary(dict) => {
1067 for value in dict.0.values() {
1068 Self::collect_references(value, refs);
1069 }
1070 }
1071 _ => {}
1072 }
1073 }
1074
1075 /// Collect references from an object (after resolution)
1076 fn collect_references_from_object(obj: &PdfObject, refs: &mut Vec<(u32, u16)>) {
1077 match obj {
1078 PdfObject::Array(array) => {
1079 for item in &array.0 {
1080 Self::collect_references(item, refs);
1081 }
1082 }
1083 PdfObject::Dictionary(dict) | PdfObject::Stream(PdfStream { dict, .. }) => {
1084 for value in dict.0.values() {
1085 Self::collect_references(value, refs);
1086 }
1087 }
1088 _ => {}
1089 }
1090 }
1091}
1092
#[cfg(test)]
mod tests {
    use super::super::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
    use super::*;
    use std::collections::HashMap;

    /// Build a dictionary whose only entry is `Type` = `Page`.
    fn page_type_dict() -> PdfDictionary {
        let mut dict = PdfDictionary(HashMap::new());
        dict.0.insert(
            PdfName("Type".to_string()),
            PdfObject::Name(PdfName("Page".to_string())),
        );
        dict
    }

    /// Fixture: a bare page (object 3 0) with a parent reference and no
    /// resources, crop box, rotation or annotations.
    fn create_test_page() -> ParsedPage {
        let mut page_dict = page_type_dict();
        page_dict
            .0
            .insert(PdfName("Parent".to_string()), PdfObject::Reference(2, 0));

        ParsedPage {
            obj_ref: (3, 0),
            dict: page_dict,
            inherited_resources: None,
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        }
    }

    /// Fixture: a page (object 4 0) with inherited font resources, a crop box,
    /// a 90 degree rotation and an empty annotations array.
    fn create_test_page_with_resources() -> ParsedPage {
        let mut inherited = PdfDictionary(HashMap::new());
        inherited.0.insert(
            PdfName("Font".to_string()),
            PdfObject::Dictionary(PdfDictionary(HashMap::new())),
        );

        ParsedPage {
            obj_ref: (4, 0),
            dict: page_type_dict(),
            inherited_resources: Some(inherited),
            media_box: [0.0, 0.0, 595.0, 842.0],
            crop_box: Some([10.0, 10.0, 585.0, 832.0]),
            rotation: 90,
            annotations: Some(PdfArray(vec![])),
        }
    }

    /// A new tree records the page count and starts with nothing cached.
    #[test]
    fn test_page_tree_new() {
        let page_tree = PageTree::new(10);

        assert_eq!(page_tree.page_count, 10);
        assert_eq!(page_tree.pages.len(), 0);
        assert!(page_tree.pages_dict.is_none());
    }

    /// Supplying a pages dictionary stores it on the tree.
    #[test]
    fn test_page_tree_new_with_pages_dict() {
        let page_tree = PageTree::new_with_pages_dict(5, PdfDictionary(HashMap::new()));

        assert_eq!(page_tree.page_count, 5);
        assert_eq!(page_tree.pages.len(), 0);
        assert!(page_tree.pages_dict.is_some());
    }

    /// Lookups on an empty cache miss.
    #[test]
    fn test_get_cached_page_empty() {
        let page_tree = PageTree::new(10);

        assert!(page_tree.get_cached_page(0).is_none());
        assert!(page_tree.get_cached_page(5).is_none());
    }

    /// A cached page can be read back with its properties intact.
    #[test]
    fn test_cache_and_get_page() {
        let mut page_tree = PageTree::new(10);
        page_tree.cache_page(0, create_test_page());

        let lookup = page_tree.get_cached_page(0);
        assert!(lookup.is_some());

        let stored = lookup.unwrap();
        assert_eq!(stored.obj_ref, (3, 0));
        assert_eq!(stored.media_box, [0.0, 0.0, 595.0, 842.0]);
    }

    /// Pages cached under different indices stay distinct.
    #[test]
    fn test_cache_multiple_pages() {
        let mut page_tree = PageTree::new(10);
        page_tree.cache_page(0, create_test_page());
        page_tree.cache_page(1, create_test_page_with_resources());

        assert!(page_tree.get_cached_page(0).is_some());
        assert!(page_tree.get_cached_page(1).is_some());
        assert!(page_tree.get_cached_page(2).is_none());

        let first = page_tree.get_cached_page(0).unwrap();
        assert_eq!(first.rotation, 0);

        let second = page_tree.get_cached_page(1).unwrap();
        assert_eq!(second.rotation, 90);
    }

    /// The page count passed to the constructor is exposed unchanged.
    #[test]
    fn test_get_page_count() {
        let page_tree = PageTree::new(25);
        assert_eq!(page_tree.page_count, 25);
    }

    /// Clearing the cache drops every cached page.
    #[test]
    fn test_clear_cache() {
        let mut page_tree = PageTree::new(10);
        let fixture = create_test_page();

        page_tree.cache_page(0, fixture.clone());
        page_tree.cache_page(1, fixture);
        assert_eq!(page_tree.pages.len(), 2);

        page_tree.clear_cache();

        assert_eq!(page_tree.pages.len(), 0);
        assert!(page_tree.get_cached_page(0).is_none());
        assert!(page_tree.get_cached_page(1).is_none());
    }

    /// The richer fixture exposes every optional property.
    #[test]
    fn test_parsed_page_properties() {
        let fixture = create_test_page_with_resources();

        assert_eq!(fixture.obj_ref, (4, 0));
        assert_eq!(fixture.rotation, 90);
        assert!(fixture.inherited_resources.is_some());
        assert!(fixture.crop_box.is_some());
        assert!(fixture.annotations.is_some());

        assert_eq!(fixture.crop_box, Some([10.0, 10.0, 585.0, 832.0]));
    }

    /// A page built field by field keeps exactly the values it was given.
    #[test]
    fn test_parsed_page_creation() {
        let empty_dict = PdfDictionary::new();
        let fixture = ParsedPage {
            obj_ref: (1, 0),
            dict: empty_dict.clone(),
            inherited_resources: None,
            media_box: [0.0, 0.0, 612.0, 792.0],
            crop_box: None,
            rotation: 0,
            annotations: None,
        };

        assert_eq!(fixture.obj_ref, (1, 0));
        assert_eq!(fixture.dict, empty_dict);
        assert!(fixture.inherited_resources.is_none());
        // US Letter dimensions
        assert_eq!(fixture.media_box, [0.0, 0.0, 612.0, 792.0]);
        assert!(fixture.crop_box.is_none());
        assert_eq!(fixture.rotation, 0);
        assert!(fixture.annotations.is_none());
    }

    /// Width and height follow the media box and swap on quarter turns.
    #[test]
    fn test_parsed_page_width_height() {
        let mut fixture = create_test_page();

        // (rotation, expected width, expected height) for the A4 fixture.
        let expectations = [
            (0, 595.0, 842.0),
            (90, 842.0, 595.0),
            (270, 842.0, 595.0),
            (180, 595.0, 842.0),
        ];

        for &(rotation, expected_width, expected_height) in expectations.iter() {
            fixture.rotation = rotation;
            assert_eq!(fixture.width(), expected_width);
            assert_eq!(fixture.height(), expected_height);
        }
    }

    /// Inherited resources are returned when requesting page resources.
    #[test]
    fn test_parsed_page_get_resources() {
        let fixture = create_test_page_with_resources();

        let lookup = fixture.get_resources();
        assert!(lookup.is_some());

        let found = lookup.unwrap();
        assert!(found.contains_key("Font"));
    }

    /// The `Contents` entry of the page dictionary is returned untouched.
    #[test]
    fn test_parsed_page_get_contents() {
        let mut fixture = create_test_page();
        let expected = PdfObject::Reference(10, 0);

        fixture
            .dict
            .insert("Contents".to_string(), expected.clone());

        let lookup = fixture.get_contents();
        assert!(lookup.is_some());
        assert_eq!(lookup, Some(&expected));
    }

    /// An empty annotations array is still reported as present.
    #[test]
    fn test_parsed_page_get_annotations() {
        let fixture = create_test_page_with_resources();

        let lookup = fixture.get_annotations();
        assert!(lookup.is_some());
        assert_eq!(lookup.map(|arr| arr.0.len()), Some(0));
    }

    /// Inherited resources can be assigned directly and compare equal.
    #[test]
    fn test_parsed_page_inherited_resources() {
        let mut fixture = create_test_page();

        let mut from_parent = PdfDictionary::new();
        from_parent.insert(
            "Font".to_string(),
            PdfObject::Dictionary(PdfDictionary::new()),
        );

        fixture.inherited_resources = Some(from_parent.clone());

        assert!(fixture.inherited_resources.is_some());
        assert_eq!(fixture.inherited_resources, Some(from_parent));
    }

    /// A crop box assigned to a page is stored component by component.
    #[test]
    fn test_parsed_page_with_crop_box() {
        let mut fixture = create_test_page();
        fixture.crop_box = Some([50.0, 50.0, 545.0, 792.0]);

        let stored = fixture.crop_box.unwrap();
        assert_eq!(stored, [50.0, 50.0, 545.0, 792.0]);
    }

    /// Caching many pages keeps all of them retrievable.
    #[test]
    fn test_page_tree_cache_overflow() {
        let mut page_tree = PageTree::new(100);

        for index in 0..50 {
            page_tree.cache_page(index, create_test_page());
        }

        assert!((0..50).all(|index| page_tree.get_cached_page(index).is_some()));
    }

    /// Caching under an existing index replaces the earlier page.
    #[test]
    fn test_page_tree_update_cached_page() {
        let mut page_tree = PageTree::new(10);

        page_tree.cache_page(0, create_test_page());
        assert_eq!(page_tree.get_cached_page(0).unwrap().rotation, 0);

        let mut replacement = create_test_page();
        replacement.rotation = 180;
        page_tree.cache_page(0, replacement);

        assert_eq!(page_tree.get_cached_page(0).unwrap().rotation, 180);
    }

    /// Cloning a page copies every field.
    #[test]
    fn test_parsed_page_clone() {
        let source = create_test_page_with_resources();
        let copy = source.clone();

        assert_eq!(source.obj_ref, copy.obj_ref);
        assert_eq!(source.dict, copy.dict);
        assert_eq!(source.inherited_resources, copy.inherited_resources);
        assert_eq!(source.media_box, copy.media_box);
        assert_eq!(source.crop_box, copy.crop_box);
        assert_eq!(source.rotation, copy.rotation);
        assert_eq!(source.annotations, copy.annotations);
    }

    /// Cache lookups miss for uncached indices, inside or outside the page range.
    #[test]
    fn test_page_tree_get_page_bounds() {
        let page_tree = PageTree::new(100);

        // First page, last valid index, first out-of-range index, extreme index.
        for &index in &[0, 99, 100, u32::MAX] {
            assert!(page_tree.get_cached_page(index).is_none());
        }
    }
}
1402
// Test-only module whose source is loaded from `page_tree_tests.rs`.
#[cfg(test)]
#[path = "page_tree_tests.rs"]
mod page_tree_tests;