oxidize_pdf/parser/document.rs
1//! PDF Document wrapper - High-level interface for PDF parsing and manipulation
2//!
3//! This module provides a robust, high-level interface for working with PDF documents.
4//! It solves Rust's borrow checker challenges through careful use of interior mutability
5//! (RefCell) and separation of concerns between parsing, caching, and page access.
6//!
7//! # Architecture
8//!
9//! The module uses a layered architecture:
10//! - **PdfDocument**: Main entry point with RefCell-based state management
11//! - **ResourceManager**: Centralized object caching with interior mutability
12//! - **PdfReader**: Low-level file access (wrapped in RefCell)
13//! - **PageTree**: Lazy-loaded page navigation
14//!
15//! # Key Features
16//!
17//! - **Automatic caching**: Objects are cached after first access
18//! - **Resource management**: Shared resources are handled efficiently
19//! - **Page navigation**: Fast access to any page in the document
20//! - **Reference resolution**: Automatic resolution of indirect references
21//! - **Text extraction**: Built-in support for extracting text from pages
22//!
23//! # Example
24//!
25//! ```rust,no_run
26//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
27//!
28//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
29//! // Open a PDF document
30//! let reader = PdfReader::open("document.pdf")?;
31//! let document = PdfDocument::new(reader);
32//!
33//! // Get document information
34//! let page_count = document.page_count()?;
35//! let metadata = document.metadata()?;
36//! println!("Title: {:?}", metadata.title);
37//! println!("Pages: {}", page_count);
38//!
39//! // Access a specific page
40//! let page = document.get_page(0)?;
41//! println!("Page size: {}x{}", page.width(), page.height());
42//!
43//! // Extract text from all pages
44//! let extracted_text = document.extract_text()?;
45//! for (i, page_text) in extracted_text.iter().enumerate() {
46//! println!("Page {}: {}", i + 1, page_text.text);
47//! }
48//! # Ok(())
49//! # }
50//! ```
51
52use super::objects::{PdfDictionary, PdfObject};
53use super::page_tree::{PageTree, ParsedPage};
54use super::reader::PdfReader;
55use super::{ParseError, ParseOptions, ParseResult};
56use std::cell::RefCell;
57use std::collections::HashMap;
58use std::io::{Read, Seek};
59use std::rc::Rc;
60
61/// Resource manager for efficient PDF object caching.
62///
63/// The ResourceManager provides centralized caching of PDF objects to avoid
64/// repeated parsing and to share resources between different parts of the document.
65/// It uses RefCell for interior mutability, allowing multiple immutable references
66/// to the document while still being able to update the cache.
67///
68/// # Caching Strategy
69///
70/// - Objects are cached on first access
71/// - Cache persists for the lifetime of the document
72/// - Manual cache clearing is supported for memory management
73///
74/// # Example
75///
76/// ```rust,no_run
77/// use oxidize_pdf::parser::document::ResourceManager;
78///
79/// let resources = ResourceManager::new();
80///
81/// // Objects are cached automatically when accessed through PdfDocument
82/// // Manual cache management:
83/// resources.clear_cache(); // Free memory when needed
84/// ```
85pub struct ResourceManager {
86 /// Cached objects indexed by (object_number, generation_number)
87 object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
88}
89
90impl Default for ResourceManager {
91 fn default() -> Self {
92 Self::new()
93 }
94}
95
96impl ResourceManager {
97 /// Create a new resource manager
98 pub fn new() -> Self {
99 Self {
100 object_cache: RefCell::new(HashMap::new()),
101 }
102 }
103
104 /// Get an object from cache if available.
105 ///
106 /// # Arguments
107 ///
108 /// * `obj_ref` - Object reference (object_number, generation_number)
109 ///
110 /// # Returns
111 ///
112 /// Cloned object if cached, None otherwise.
113 ///
114 /// # Example
115 ///
116 /// ```rust,no_run
117 /// # use oxidize_pdf::parser::document::ResourceManager;
118 /// # let resources = ResourceManager::new();
119 /// if let Some(obj) = resources.get_cached((10, 0)) {
120 /// println!("Object 10 0 R found in cache");
121 /// }
122 /// ```
123 pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
124 self.object_cache.borrow().get(&obj_ref).cloned()
125 }
126
127 /// Cache an object for future access.
128 ///
129 /// # Arguments
130 ///
131 /// * `obj_ref` - Object reference (object_number, generation_number)
132 /// * `obj` - The PDF object to cache
133 ///
134 /// # Example
135 ///
136 /// ```rust,no_run
137 /// # use oxidize_pdf::parser::document::ResourceManager;
138 /// # use oxidize_pdf::parser::objects::PdfObject;
139 /// # let resources = ResourceManager::new();
140 /// resources.cache_object((10, 0), PdfObject::Integer(42));
141 /// ```
142 pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
143 self.object_cache.borrow_mut().insert(obj_ref, obj);
144 }
145
146 /// Clear all cached objects to free memory.
147 ///
148 /// Use this when processing large documents to manage memory usage.
149 ///
150 /// # Example
151 ///
152 /// ```rust,no_run
153 /// # use oxidize_pdf::parser::document::ResourceManager;
154 /// # let resources = ResourceManager::new();
155 /// // After processing many pages
156 /// resources.clear_cache();
157 /// println!("Cache cleared to free memory");
158 /// ```
159 pub fn clear_cache(&self) {
160 self.object_cache.borrow_mut().clear();
161 }
162}
163
164/// High-level PDF document interface for parsing and manipulation.
165///
166/// `PdfDocument` provides a clean, safe API for working with PDF files.
167/// It handles the complexity of PDF structure, object references, and resource
168/// management behind a simple interface.
169///
170/// # Type Parameter
171///
172/// * `R` - The reader type (must implement Read + Seek)
173///
174/// # Architecture Benefits
175///
176/// - **RefCell Usage**: Allows multiple parts of the API to access the document
177/// - **Lazy Loading**: Pages and resources are loaded on demand
178/// - **Automatic Caching**: Frequently accessed objects are cached
179/// - **Safe API**: Borrow checker issues are handled internally
180///
181/// # Example
182///
183/// ```rust,no_run
184/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
185/// use std::fs::File;
186///
187/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
188/// // From a file
189/// let reader = PdfReader::open("document.pdf")?;
190/// let document = PdfDocument::new(reader);
191///
192/// // From any Read + Seek source
193/// let file = File::open("document.pdf")?;
194/// let reader = PdfReader::new(file)?;
195/// let document = PdfDocument::new(reader);
196///
197/// // Use the document
198/// let page_count = document.page_count()?;
199/// for i in 0..page_count {
200/// let page = document.get_page(i)?;
201/// // Process page...
202/// }
203/// # Ok(())
204/// # }
205/// ```
206pub struct PdfDocument<R: Read + Seek> {
207 /// The underlying PDF reader wrapped for interior mutability
208 reader: RefCell<PdfReader<R>>,
209 /// Page tree navigator (lazily initialized)
210 page_tree: RefCell<Option<PageTree>>,
211 /// Shared resource manager for object caching
212 resources: Rc<ResourceManager>,
213 /// Cached document metadata to avoid repeated parsing
214 metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
215}
216
217impl<R: Read + Seek> PdfDocument<R> {
218 /// Create a new PDF document from a reader
219 pub fn new(reader: PdfReader<R>) -> Self {
220 Self {
221 reader: RefCell::new(reader),
222 page_tree: RefCell::new(None),
223 resources: Rc::new(ResourceManager::new()),
224 metadata_cache: RefCell::new(None),
225 }
226 }
227
228 /// Get the PDF version of the document.
229 ///
230 /// # Returns
231 ///
232 /// PDF version string (e.g., "1.4", "1.7", "2.0")
233 ///
234 /// # Example
235 ///
236 /// ```rust,no_run
237 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
238 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
239 /// # let reader = PdfReader::open("document.pdf")?;
240 /// # let document = PdfDocument::new(reader);
241 /// let version = document.version()?;
242 /// println!("PDF version: {}", version);
243 /// # Ok(())
244 /// # }
245 /// ```
246 pub fn version(&self) -> ParseResult<String> {
247 Ok(self.reader.borrow().version().to_string())
248 }
249
250 /// Get the parse options
251 pub fn options(&self) -> ParseOptions {
252 self.reader.borrow().options().clone()
253 }
254
255 /// Get the total number of pages in the document.
256 ///
257 /// # Returns
258 ///
259 /// The page count as an unsigned 32-bit integer.
260 ///
261 /// # Errors
262 ///
263 /// Returns an error if the page tree is malformed or missing.
264 ///
265 /// # Example
266 ///
267 /// ```rust,no_run
268 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
269 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
270 /// # let reader = PdfReader::open("document.pdf")?;
271 /// # let document = PdfDocument::new(reader);
272 /// let count = document.page_count()?;
273 /// println!("Document has {} pages", count);
274 ///
275 /// // Iterate through all pages
276 /// for i in 0..count {
277 /// let page = document.get_page(i)?;
278 /// // Process page...
279 /// }
280 /// # Ok(())
281 /// # }
282 /// ```
283 pub fn page_count(&self) -> ParseResult<u32> {
284 self.reader.borrow_mut().page_count()
285 }
286
287 /// Get document metadata including title, author, creation date, etc.
288 ///
289 /// Metadata is cached after first access for performance.
290 ///
291 /// # Returns
292 ///
293 /// A `DocumentMetadata` struct containing all available metadata fields.
294 ///
295 /// # Example
296 ///
297 /// ```rust,no_run
298 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
299 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
300 /// # let reader = PdfReader::open("document.pdf")?;
301 /// # let document = PdfDocument::new(reader);
302 /// let metadata = document.metadata()?;
303 ///
304 /// if let Some(title) = &metadata.title {
305 /// println!("Title: {}", title);
306 /// }
307 /// if let Some(author) = &metadata.author {
308 /// println!("Author: {}", author);
309 /// }
310 /// if let Some(creation_date) = &metadata.creation_date {
311 /// println!("Created: {}", creation_date);
312 /// }
313 /// println!("PDF Version: {}", metadata.version);
314 /// # Ok(())
315 /// # }
316 /// ```
317 pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
318 // Check cache first
319 if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
320 return Ok(metadata.clone());
321 }
322
323 // Load metadata
324 let metadata = self.reader.borrow_mut().metadata()?;
325 self.metadata_cache.borrow_mut().replace(metadata.clone());
326 Ok(metadata)
327 }
328
329 /// Initialize the page tree if not already done
330 fn ensure_page_tree(&self) -> ParseResult<()> {
331 if self.page_tree.borrow().is_none() {
332 let page_count = self.page_count()?;
333 let pages_dict = self.load_pages_dict()?;
334 let page_tree = PageTree::new_with_pages_dict(page_count, pages_dict);
335 self.page_tree.borrow_mut().replace(page_tree);
336 }
337 Ok(())
338 }
339
340 /// Load the pages dictionary
341 fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
342 let mut reader = self.reader.borrow_mut();
343 let pages = reader.pages()?;
344 Ok(pages.clone())
345 }
346
347 /// Get a page by index (0-based).
348 ///
349 /// Pages are cached after first access. This method handles page tree
350 /// traversal and property inheritance automatically.
351 ///
352 /// # Arguments
353 ///
354 /// * `index` - Zero-based page index (0 to page_count-1)
355 ///
356 /// # Returns
357 ///
358 /// A complete `ParsedPage` with all properties and inherited resources.
359 ///
360 /// # Errors
361 ///
362 /// Returns an error if:
363 /// - Index is out of bounds
364 /// - Page tree is malformed
365 /// - Required page properties are missing
366 ///
367 /// # Example
368 ///
369 /// ```rust,no_run
370 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
371 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
372 /// # let reader = PdfReader::open("document.pdf")?;
373 /// # let document = PdfDocument::new(reader);
374 /// // Get the first page
375 /// let page = document.get_page(0)?;
376 ///
377 /// // Access page properties
378 /// println!("Page size: {}x{} points", page.width(), page.height());
379 /// println!("Rotation: {}°", page.rotation);
380 ///
381 /// // Get content streams
382 /// let streams = page.content_streams_with_document(&document)?;
383 /// println!("Page has {} content streams", streams.len());
384 /// # Ok(())
385 /// # }
386 /// ```
387 pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
388 self.ensure_page_tree()?;
389
390 // First check if page is already loaded
391 if let Some(page_tree) = self.page_tree.borrow().as_ref() {
392 if let Some(page) = page_tree.get_cached_page(index) {
393 return Ok(page.clone());
394 }
395 }
396
397 // Load the page (reference stack will handle circular detection automatically)
398 let page = self.load_page_at_index(index)?;
399
400 // Cache it
401 if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
402 page_tree.cache_page(index, page.clone());
403 }
404
405 Ok(page)
406 }
407
408 /// Load a specific page by index
409 fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
410 // Get the pages root
411 let pages_dict = self.load_pages_dict()?;
412
413 // Navigate to the specific page
414 let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
415
416 Ok(page_info)
417 }
418
419 /// Find a page in the page tree (iterative implementation for stack safety)
420 fn find_page_in_tree(
421 &self,
422 root_node: &PdfDictionary,
423 target_index: u32,
424 initial_current_index: u32,
425 initial_inherited: Option<&PdfDictionary>,
426 ) -> ParseResult<ParsedPage> {
427 // Work item for the traversal queue
428 #[derive(Debug)]
429 struct WorkItem {
430 node_dict: PdfDictionary,
431 node_ref: Option<(u32, u16)>,
432 current_index: u32,
433 inherited: Option<PdfDictionary>,
434 }
435
436 // Initialize work queue with root node
437 let mut work_queue = Vec::new();
438 work_queue.push(WorkItem {
439 node_dict: root_node.clone(),
440 node_ref: None,
441 current_index: initial_current_index,
442 inherited: initial_inherited.cloned(),
443 });
444
445 // Iterative traversal
446 while let Some(work_item) = work_queue.pop() {
447 let WorkItem {
448 node_dict,
449 node_ref,
450 current_index,
451 inherited,
452 } = work_item;
453
454 let node_type = node_dict
455 .get_type()
456 .or_else(|| {
457 // If Type is missing, try to infer from content
458 if node_dict.contains_key("Kids") && node_dict.contains_key("Count") {
459 Some("Pages")
460 } else if node_dict.contains_key("Contents")
461 || node_dict.contains_key("MediaBox")
462 {
463 Some("Page")
464 } else {
465 None
466 }
467 })
468 .or_else(|| {
469 // If Type is missing, try to infer from structure
470 if node_dict.contains_key("Kids") {
471 Some("Pages")
472 } else if node_dict.contains_key("Contents")
473 || (node_dict.contains_key("MediaBox") && !node_dict.contains_key("Kids"))
474 {
475 Some("Page")
476 } else {
477 None
478 }
479 })
480 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
481
482 match node_type {
483 "Pages" => {
484 // This is a page tree node
485 let kids = node_dict
486 .get("Kids")
487 .and_then(|obj| obj.as_array())
488 .or_else(|| {
489 // If Kids is missing, use empty array
490 eprintln!(
491 "Warning: Missing Kids array in Pages node, using empty array"
492 );
493 Some(&super::objects::EMPTY_PDF_ARRAY)
494 })
495 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
496
497 // Merge inherited attributes
498 let mut merged_inherited = inherited.unwrap_or_else(PdfDictionary::new);
499
500 // Inheritable attributes
501 for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
502 if let Some(value) = node_dict.get(key) {
503 if !merged_inherited.contains_key(key) {
504 merged_inherited.insert(key.to_string(), value.clone());
505 }
506 }
507 }
508
509 // Process kids in reverse order (since we're using a stack/Vec::pop())
510 // This ensures we process them in the correct order
511 let mut current_idx = current_index;
512 let mut pending_kids = Vec::new();
513
514 for kid_ref in &kids.0 {
515 let kid_ref =
516 kid_ref
517 .as_reference()
518 .ok_or_else(|| ParseError::SyntaxError {
519 position: 0,
520 message: "Kids array must contain references".to_string(),
521 })?;
522
523 // Get the kid object
524 let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
525 let kid_dict =
526 kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
527 position: 0,
528 message: "Page tree node must be a dictionary".to_string(),
529 })?;
530
531 let kid_type = kid_dict
532 .get_type()
533 .or_else(|| {
534 // If Type is missing, try to infer from content
535 if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
536 Some("Pages")
537 } else if kid_dict.contains_key("Contents")
538 || kid_dict.contains_key("MediaBox")
539 {
540 Some("Page")
541 } else {
542 None
543 }
544 })
545 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
546
547 let count = if kid_type == "Pages" {
548 kid_dict
549 .get("Count")
550 .and_then(|obj| obj.as_integer())
551 .unwrap_or(1) // Fallback to 1 if Count is missing (defensive)
552 as u32
553 } else {
554 1
555 };
556
557 if target_index < current_idx + count {
558 // Found the right subtree/page
559 if kid_type == "Page" {
560 // This is the page we want
561 return self.create_parsed_page(
562 kid_ref,
563 kid_dict,
564 Some(&merged_inherited),
565 );
566 } else {
567 // Need to traverse this subtree - add to queue
568 pending_kids.push(WorkItem {
569 node_dict: kid_dict.clone(),
570 node_ref: Some(kid_ref),
571 current_index: current_idx,
572 inherited: Some(merged_inherited.clone()),
573 });
574 break; // Found our target subtree, no need to continue
575 }
576 }
577
578 current_idx += count;
579 }
580
581 // Add pending kids to work queue in reverse order for correct processing
582 work_queue.extend(pending_kids.into_iter().rev());
583 }
584 "Page" => {
585 // This is a page object
586 if target_index != current_index {
587 return Err(ParseError::SyntaxError {
588 position: 0,
589 message: "Page index mismatch".to_string(),
590 });
591 }
592
593 // We need the reference for creating the parsed page
594 if let Some(page_ref) = node_ref {
595 return self.create_parsed_page(page_ref, &node_dict, inherited.as_ref());
596 } else {
597 return Err(ParseError::SyntaxError {
598 position: 0,
599 message: "Direct page object without reference".to_string(),
600 });
601 }
602 }
603 _ => {
604 return Err(ParseError::SyntaxError {
605 position: 0,
606 message: format!("Invalid page tree node type: {node_type}"),
607 });
608 }
609 }
610 }
611
612 Err(ParseError::SyntaxError {
613 position: 0,
614 message: "Page not found in tree".to_string(),
615 })
616 }
617
618 /// Create a ParsedPage from a page dictionary
619 fn create_parsed_page(
620 &self,
621 obj_ref: (u32, u16),
622 page_dict: &PdfDictionary,
623 inherited: Option<&PdfDictionary>,
624 ) -> ParseResult<ParsedPage> {
625 // Extract page attributes with fallback for missing MediaBox
626 let media_box = match self.get_rectangle(page_dict, inherited, "MediaBox")? {
627 Some(mb) => mb,
628 None => {
629 // Use default Letter size if MediaBox is missing
630 #[cfg(debug_assertions)]
631 eprintln!(
632 "Warning: Page {} {} R missing MediaBox, using default Letter size",
633 obj_ref.0, obj_ref.1
634 );
635 [0.0, 0.0, 612.0, 792.0]
636 }
637 };
638
639 let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
640
641 let rotation = self
642 .get_integer(page_dict, inherited, "Rotate")?
643 .unwrap_or(0) as i32;
644
645 // Get inherited resources
646 let inherited_resources = if let Some(inherited) = inherited {
647 inherited
648 .get("Resources")
649 .and_then(|r| r.as_dict())
650 .cloned()
651 } else {
652 None
653 };
654
655 // Get annotations if present
656 let annotations = page_dict
657 .get("Annots")
658 .and_then(|obj| obj.as_array())
659 .cloned();
660
661 Ok(ParsedPage {
662 obj_ref,
663 dict: page_dict.clone(),
664 inherited_resources,
665 media_box,
666 crop_box,
667 rotation,
668 annotations,
669 })
670 }
671
672 /// Get a rectangle value
673 fn get_rectangle(
674 &self,
675 node: &PdfDictionary,
676 inherited: Option<&PdfDictionary>,
677 key: &str,
678 ) -> ParseResult<Option<[f64; 4]>> {
679 let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
680
681 if let Some(array) = array.and_then(|obj| obj.as_array()) {
682 if array.len() != 4 {
683 return Err(ParseError::SyntaxError {
684 position: 0,
685 message: format!("{key} must have 4 elements"),
686 });
687 }
688
689 let rect = [
690 array.0.first().unwrap().as_real().unwrap_or(0.0),
691 array.get(1).unwrap().as_real().unwrap_or(0.0),
692 array.get(2).unwrap().as_real().unwrap_or(0.0),
693 array.get(3).unwrap().as_real().unwrap_or(0.0),
694 ];
695
696 Ok(Some(rect))
697 } else {
698 Ok(None)
699 }
700 }
701
702 /// Get an integer value
703 fn get_integer(
704 &self,
705 node: &PdfDictionary,
706 inherited: Option<&PdfDictionary>,
707 key: &str,
708 ) -> ParseResult<Option<i64>> {
709 let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
710
711 Ok(value.and_then(|obj| obj.as_integer()))
712 }
713
714 /// Get an object by its reference numbers.
715 ///
716 /// This method first checks the cache, then loads from the file if needed.
717 /// Objects are automatically cached after loading.
718 ///
719 /// # Arguments
720 ///
721 /// * `obj_num` - Object number
722 /// * `gen_num` - Generation number
723 ///
724 /// # Returns
725 ///
726 /// The resolved PDF object.
727 ///
728 /// # Errors
729 ///
730 /// Returns an error if:
731 /// - Object doesn't exist
732 /// - Object is part of an encrypted object stream
733 /// - File is corrupted
734 ///
735 /// # Example
736 ///
737 /// ```rust,no_run
738 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
739 /// # use oxidize_pdf::parser::objects::PdfObject;
740 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
741 /// # let reader = PdfReader::open("document.pdf")?;
742 /// # let document = PdfDocument::new(reader);
743 /// // Get object 10 0 R
744 /// let obj = document.get_object(10, 0)?;
745 ///
746 /// // Check object type
747 /// match obj {
748 /// PdfObject::Dictionary(dict) => {
749 /// println!("Object is a dictionary with {} entries", dict.0.len());
750 /// }
751 /// PdfObject::Stream(stream) => {
752 /// println!("Object is a stream");
753 /// }
754 /// _ => {}
755 /// }
756 /// # Ok(())
757 /// # }
758 /// ```
759 pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
760 // Check resource cache first
761 if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
762 return Ok(obj);
763 }
764
765 // Load from reader
766 let obj = {
767 let mut reader = self.reader.borrow_mut();
768 reader.get_object(obj_num, gen_num)?.clone()
769 };
770
771 // Cache it
772 self.resources.cache_object((obj_num, gen_num), obj.clone());
773
774 Ok(obj)
775 }
776
777 /// Resolve a reference to get the actual object.
778 ///
779 /// If the input is a Reference, fetches the referenced object.
780 /// Otherwise returns a clone of the input object.
781 ///
782 /// # Arguments
783 ///
784 /// * `obj` - The object to resolve (may be a Reference or direct object)
785 ///
786 /// # Returns
787 ///
788 /// The resolved object (never a Reference).
789 ///
790 /// # Example
791 ///
792 /// ```rust,no_run
793 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
794 /// # use oxidize_pdf::parser::objects::PdfObject;
795 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
796 /// # let reader = PdfReader::open("document.pdf")?;
797 /// # let document = PdfDocument::new(reader);
798 /// # let page = document.get_page(0)?;
799 /// // Contents might be a reference or direct object
800 /// if let Some(contents) = page.dict.get("Contents") {
801 /// let resolved = document.resolve(contents)?;
802 /// match resolved {
803 /// PdfObject::Stream(_) => println!("Single content stream"),
804 /// PdfObject::Array(_) => println!("Multiple content streams"),
805 /// _ => println!("Unexpected content type"),
806 /// }
807 /// }
808 /// # Ok(())
809 /// # }
810 /// ```
811 pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
812 match obj {
813 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
814 _ => Ok(obj.clone()),
815 }
816 }
817
818 /// Get content streams for a specific page.
819 ///
820 /// This method handles both single streams and arrays of streams,
821 /// automatically decompressing them according to their filters.
822 ///
823 /// # Arguments
824 ///
825 /// * `page` - The page to get content streams from
826 ///
827 /// # Returns
828 ///
829 /// Vector of decompressed content stream data ready for parsing.
830 ///
831 /// # Example
832 ///
833 /// ```rust,no_run
834 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
835 /// # use oxidize_pdf::parser::content::ContentParser;
836 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
837 /// # let reader = PdfReader::open("document.pdf")?;
838 /// # let document = PdfDocument::new(reader);
839 /// let page = document.get_page(0)?;
840 /// let streams = document.get_page_content_streams(&page)?;
841 ///
842 /// // Parse content streams
843 /// for stream_data in streams {
844 /// let operations = ContentParser::parse(&stream_data)?;
845 /// println!("Stream has {} operations", operations.len());
846 /// }
847 /// # Ok(())
848 /// # }
849 /// ```
850 /// Get page resources dictionary.
851 ///
852 /// This method returns the resources dictionary for a page, which may include
853 /// fonts, images (XObjects), patterns, color spaces, and other resources.
854 ///
855 /// # Arguments
856 ///
857 /// * `page` - The page to get resources from
858 ///
859 /// # Returns
860 ///
861 /// Optional resources dictionary if the page has resources.
862 ///
863 /// # Example
864 ///
865 /// ```rust,no_run
866 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader, PdfObject, PdfName};
867 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
868 /// # let reader = PdfReader::open("document.pdf")?;
869 /// # let document = PdfDocument::new(reader);
870 /// let page = document.get_page(0)?;
871 /// if let Some(resources) = document.get_page_resources(&page)? {
872 /// // Check for images (XObjects)
873 /// if let Some(PdfObject::Dictionary(xobjects)) = resources.0.get(&PdfName("XObject".to_string())) {
874 /// for (name, _) in xobjects.0.iter() {
875 /// println!("Found XObject: {}", name.0);
876 /// }
877 /// }
878 /// }
879 /// # Ok(())
880 /// # }
881 /// ```
882 pub fn get_page_resources<'a>(
883 &self,
884 page: &'a ParsedPage,
885 ) -> ParseResult<Option<&'a PdfDictionary>> {
886 Ok(page.get_resources())
887 }
888
889 pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
890 let mut streams = Vec::new();
891 let options = self.options();
892
893 if let Some(contents) = page.dict.get("Contents") {
894 let resolved_contents = self.resolve(contents)?;
895
896 match &resolved_contents {
897 PdfObject::Stream(stream) => {
898 streams.push(stream.decode(&options)?);
899 }
900 PdfObject::Array(array) => {
901 for item in &array.0 {
902 let resolved = self.resolve(item)?;
903 if let PdfObject::Stream(stream) = resolved {
904 streams.push(stream.decode(&options)?);
905 }
906 }
907 }
908 _ => {
909 return Err(ParseError::SyntaxError {
910 position: 0,
911 message: "Contents must be a stream or array of streams".to_string(),
912 })
913 }
914 }
915 }
916
917 Ok(streams)
918 }
919
920 /// Extract text from all pages in the document.
921 ///
922 /// Uses the default text extraction settings. For custom settings,
923 /// use `extract_text_with_options`.
924 ///
925 /// # Returns
926 ///
927 /// A vector of `ExtractedText`, one for each page in the document.
928 ///
929 /// # Example
930 ///
931 /// ```rust,no_run
932 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
933 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
934 /// # let reader = PdfReader::open("document.pdf")?;
935 /// # let document = PdfDocument::new(reader);
936 /// let extracted_pages = document.extract_text()?;
937 ///
938 /// for (page_num, page_text) in extracted_pages.iter().enumerate() {
939 /// println!("=== Page {} ===", page_num + 1);
940 /// println!("{}", page_text.text);
941 /// println!();
942 /// }
943 /// # Ok(())
944 /// # }
945 /// ```
946 pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
947 let extractor = crate::text::TextExtractor::new();
948 extractor.extract_from_document(self)
949 }
950
951 /// Extract text from a specific page.
952 ///
953 /// # Arguments
954 ///
955 /// * `page_index` - Zero-based page index
956 ///
957 /// # Returns
958 ///
959 /// Extracted text with optional position information.
960 ///
961 /// # Example
962 ///
963 /// ```rust,no_run
964 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
965 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
966 /// # let reader = PdfReader::open("document.pdf")?;
967 /// # let document = PdfDocument::new(reader);
968 /// // Extract text from first page only
969 /// let page_text = document.extract_text_from_page(0)?;
970 /// println!("First page text: {}", page_text.text);
971 ///
972 /// // Access text fragments with positions (if preserved)
973 /// for fragment in &page_text.fragments {
974 /// println!("'{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
975 /// }
976 /// # Ok(())
977 /// # }
978 /// ```
979 pub fn extract_text_from_page(
980 &self,
981 page_index: u32,
982 ) -> ParseResult<crate::text::ExtractedText> {
983 let extractor = crate::text::TextExtractor::new();
984 extractor.extract_from_page(self, page_index)
985 }
986
987 /// Extract text with custom extraction options.
988 ///
989 /// Allows fine control over text extraction behavior including
990 /// layout preservation, spacing thresholds, and more.
991 ///
992 /// # Arguments
993 ///
994 /// * `options` - Text extraction configuration
995 ///
996 /// # Returns
997 ///
998 /// A vector of `ExtractedText`, one for each page.
999 ///
1000 /// # Example
1001 ///
1002 /// ```rust,no_run
1003 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1004 /// # use oxidize_pdf::text::ExtractionOptions;
1005 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1006 /// # let reader = PdfReader::open("document.pdf")?;
1007 /// # let document = PdfDocument::new(reader);
1008 /// // Configure extraction to preserve layout
1009 /// let options = ExtractionOptions {
1010 /// preserve_layout: true,
1011 /// space_threshold: 0.3,
1012 /// newline_threshold: 10.0,
1013 /// ..Default::default()
1014 /// };
1015 ///
1016 /// let extracted_pages = document.extract_text_with_options(options)?;
1017 ///
1018 /// // Text fragments will include position information
1019 /// for page_text in extracted_pages {
1020 /// for fragment in &page_text.fragments {
1021 /// println!("{:?}", fragment);
1022 /// }
1023 /// }
1024 /// # Ok(())
1025 /// # }
1026 /// ```
1027 pub fn extract_text_with_options(
1028 &self,
1029 options: crate::text::ExtractionOptions,
1030 ) -> ParseResult<Vec<crate::text::ExtractedText>> {
1031 let extractor = crate::text::TextExtractor::with_options(options);
1032 extractor.extract_from_document(self)
1033 }
1034
1035 /// Get annotations from a specific page.
1036 ///
1037 /// Returns a vector of annotation dictionaries for the specified page.
1038 /// Each annotation dictionary contains properties like Type, Rect, Contents, etc.
1039 ///
1040 /// # Arguments
1041 ///
1042 /// * `page_index` - Zero-based page index
1043 ///
1044 /// # Returns
1045 ///
1046 /// A vector of PdfDictionary objects representing annotations, or an empty vector
1047 /// if the page has no annotations.
1048 ///
1049 /// # Example
1050 ///
1051 /// ```rust,no_run
1052 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1053 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1054 /// # let reader = PdfReader::open("document.pdf")?;
1055 /// # let document = PdfDocument::new(reader);
1056 /// let annotations = document.get_page_annotations(0)?;
1057 /// for annot in &annotations {
1058 /// if let Some(contents) = annot.get("Contents").and_then(|c| c.as_string()) {
1059 /// println!("Annotation: {:?}", contents);
1060 /// }
1061 /// }
1062 /// # Ok(())
1063 /// # }
1064 /// ```
1065 pub fn get_page_annotations(&self, page_index: u32) -> ParseResult<Vec<PdfDictionary>> {
1066 let page = self.get_page(page_index)?;
1067
1068 if let Some(annots_array) = page.get_annotations() {
1069 let mut annotations = Vec::new();
1070 let mut reader = self.reader.borrow_mut();
1071
1072 for annot_ref in &annots_array.0 {
1073 if let Some(ref_nums) = annot_ref.as_reference() {
1074 match reader.get_object(ref_nums.0, ref_nums.1) {
1075 Ok(obj) => {
1076 if let Some(dict) = obj.as_dict() {
1077 annotations.push(dict.clone());
1078 }
1079 }
1080 Err(_) => {
1081 // Skip annotations that can't be loaded
1082 continue;
1083 }
1084 }
1085 }
1086 }
1087
1088 Ok(annotations)
1089 } else {
1090 Ok(Vec::new())
1091 }
1092 }
1093
1094 /// Get all annotations from all pages in the document.
1095 ///
1096 /// Returns a vector of tuples containing (page_index, annotations) for each page
1097 /// that has annotations.
1098 ///
1099 /// # Returns
1100 ///
1101 /// A vector of tuples where the first element is the page index and the second
1102 /// is a vector of annotation dictionaries for that page.
1103 ///
1104 /// # Example
1105 ///
1106 /// ```rust,no_run
1107 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1108 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1109 /// # let reader = PdfReader::open("document.pdf")?;
1110 /// # let document = PdfDocument::new(reader);
1111 /// let all_annotations = document.get_all_annotations()?;
1112 /// for (page_idx, annotations) in all_annotations {
1113 /// println!("Page {} has {} annotations", page_idx, annotations.len());
1114 /// }
1115 /// # Ok(())
1116 /// # }
1117 /// ```
1118 pub fn get_all_annotations(&self) -> ParseResult<Vec<(u32, Vec<PdfDictionary>)>> {
1119 let page_count = self.page_count()?;
1120 let mut all_annotations = Vec::new();
1121
1122 for i in 0..page_count {
1123 let annotations = self.get_page_annotations(i)?;
1124 if !annotations.is_empty() {
1125 all_annotations.push((i, annotations));
1126 }
1127 }
1128
1129 Ok(all_annotations)
1130 }
1131}
1132
1133#[cfg(test)]
1134mod tests {
1135 use super::*;
1136 use crate::parser::objects::{PdfObject, PdfString};
1137 use std::io::Cursor;
1138
1139 // Helper function to create a minimal PDF in memory
1140 fn create_minimal_pdf() -> Vec<u8> {
1141 let mut pdf = Vec::new();
1142
1143 // PDF header
1144 pdf.extend_from_slice(b"%PDF-1.4\n");
1145
1146 // Catalog object
1147 pdf.extend_from_slice(b"1 0 obj\n");
1148 pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
1149 pdf.extend_from_slice(b"endobj\n");
1150
1151 // Pages object
1152 pdf.extend_from_slice(b"2 0 obj\n");
1153 pdf.extend_from_slice(b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n");
1154 pdf.extend_from_slice(b"endobj\n");
1155
1156 // Page object
1157 pdf.extend_from_slice(b"3 0 obj\n");
1158 pdf.extend_from_slice(
1159 b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>\n",
1160 );
1161 pdf.extend_from_slice(b"endobj\n");
1162
1163 // Cross-reference table
1164 let xref_pos = pdf.len();
1165 pdf.extend_from_slice(b"xref\n");
1166 pdf.extend_from_slice(b"0 4\n");
1167 pdf.extend_from_slice(b"0000000000 65535 f \n");
1168 pdf.extend_from_slice(b"0000000009 00000 n \n");
1169 pdf.extend_from_slice(b"0000000058 00000 n \n");
1170 pdf.extend_from_slice(b"0000000115 00000 n \n");
1171
1172 // Trailer
1173 pdf.extend_from_slice(b"trailer\n");
1174 pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R >>\n");
1175 pdf.extend_from_slice(b"startxref\n");
1176 pdf.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
1177 pdf.extend_from_slice(b"%%EOF\n");
1178
1179 pdf
1180 }
1181
1182 // Helper to create a PDF with metadata
1183 fn create_pdf_with_metadata() -> Vec<u8> {
1184 let mut pdf = Vec::new();
1185
1186 // PDF header
1187 pdf.extend_from_slice(b"%PDF-1.5\n");
1188
1189 // Record positions for xref
1190 let obj1_pos = pdf.len();
1191
1192 // Catalog object
1193 pdf.extend_from_slice(b"1 0 obj\n");
1194 pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
1195 pdf.extend_from_slice(b"endobj\n");
1196
1197 let obj2_pos = pdf.len();
1198
1199 // Pages object
1200 pdf.extend_from_slice(b"2 0 obj\n");
1201 pdf.extend_from_slice(b"<< /Type /Pages /Kids [] /Count 0 >>\n");
1202 pdf.extend_from_slice(b"endobj\n");
1203
1204 let obj3_pos = pdf.len();
1205
1206 // Info object
1207 pdf.extend_from_slice(b"3 0 obj\n");
1208 pdf.extend_from_slice(
1209 b"<< /Title (Test Document) /Author (Test Author) /Subject (Test Subject) >>\n",
1210 );
1211 pdf.extend_from_slice(b"endobj\n");
1212
1213 // Cross-reference table
1214 let xref_pos = pdf.len();
1215 pdf.extend_from_slice(b"xref\n");
1216 pdf.extend_from_slice(b"0 4\n");
1217 pdf.extend_from_slice(b"0000000000 65535 f \n");
1218 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_pos).as_bytes());
1219 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_pos).as_bytes());
1220 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj3_pos).as_bytes());
1221
1222 // Trailer
1223 pdf.extend_from_slice(b"trailer\n");
1224 pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R /Info 3 0 R >>\n");
1225 pdf.extend_from_slice(b"startxref\n");
1226 pdf.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
1227 pdf.extend_from_slice(b"%%EOF\n");
1228
1229 pdf
1230 }
1231
1232 #[test]
1233 fn test_pdf_document_new() {
1234 let pdf_data = create_minimal_pdf();
1235 let cursor = Cursor::new(pdf_data);
1236 let reader = PdfReader::new(cursor).unwrap();
1237 let document = PdfDocument::new(reader);
1238
1239 // Verify document is created with empty caches
1240 assert!(document.page_tree.borrow().is_none());
1241 assert!(document.metadata_cache.borrow().is_none());
1242 }
1243
1244 #[test]
1245 fn test_version() {
1246 let pdf_data = create_minimal_pdf();
1247 let cursor = Cursor::new(pdf_data);
1248 let reader = PdfReader::new(cursor).unwrap();
1249 let document = PdfDocument::new(reader);
1250
1251 let version = document.version().unwrap();
1252 assert_eq!(version, "1.4");
1253 }
1254
1255 #[test]
1256 fn test_page_count() {
1257 let pdf_data = create_minimal_pdf();
1258 let cursor = Cursor::new(pdf_data);
1259 let reader = PdfReader::new(cursor).unwrap();
1260 let document = PdfDocument::new(reader);
1261
1262 let count = document.page_count().unwrap();
1263 assert_eq!(count, 1);
1264 }
1265
1266 #[test]
1267 fn test_metadata() {
1268 let pdf_data = create_pdf_with_metadata();
1269 let cursor = Cursor::new(pdf_data);
1270 let reader = PdfReader::new(cursor).unwrap();
1271 let document = PdfDocument::new(reader);
1272
1273 let metadata = document.metadata().unwrap();
1274 assert_eq!(metadata.title, Some("Test Document".to_string()));
1275 assert_eq!(metadata.author, Some("Test Author".to_string()));
1276 assert_eq!(metadata.subject, Some("Test Subject".to_string()));
1277
1278 // Verify caching works
1279 let metadata2 = document.metadata().unwrap();
1280 assert_eq!(metadata.title, metadata2.title);
1281 }
1282
1283 #[test]
1284 fn test_get_page() {
1285 let pdf_data = create_minimal_pdf();
1286 let cursor = Cursor::new(pdf_data);
1287 let reader = PdfReader::new(cursor).unwrap();
1288 let document = PdfDocument::new(reader);
1289
1290 // Get first page
1291 let page = document.get_page(0).unwrap();
1292 assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
1293
1294 // Verify caching works
1295 let page2 = document.get_page(0).unwrap();
1296 assert_eq!(page.media_box, page2.media_box);
1297 }
1298
1299 #[test]
1300 fn test_get_page_out_of_bounds() {
1301 let pdf_data = create_minimal_pdf();
1302 let cursor = Cursor::new(pdf_data);
1303 let reader = PdfReader::new(cursor).unwrap();
1304 let document = PdfDocument::new(reader);
1305
1306 // Try to get page that doesn't exist
1307 let result = document.get_page(10);
1308 assert!(result.is_err());
1309 }
1310
1311 #[test]
1312 fn test_resource_manager_caching() {
1313 let resources = ResourceManager::new();
1314
1315 // Test caching an object
1316 let obj_ref = (1, 0);
1317 let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
1318
1319 assert!(resources.get_cached(obj_ref).is_none());
1320
1321 resources.cache_object(obj_ref, obj.clone());
1322
1323 let cached = resources.get_cached(obj_ref).unwrap();
1324 assert_eq!(cached, obj);
1325
1326 // Test clearing cache
1327 resources.clear_cache();
1328 assert!(resources.get_cached(obj_ref).is_none());
1329 }
1330
1331 #[test]
1332 fn test_get_object() {
1333 let pdf_data = create_minimal_pdf();
1334 let cursor = Cursor::new(pdf_data);
1335 let reader = PdfReader::new(cursor).unwrap();
1336 let document = PdfDocument::new(reader);
1337
1338 // Get catalog object
1339 let catalog = document.get_object(1, 0).unwrap();
1340 if let PdfObject::Dictionary(dict) = catalog {
1341 if let Some(PdfObject::Name(name)) = dict.get("Type") {
1342 assert_eq!(name.0, "Catalog");
1343 } else {
1344 panic!("Expected /Type name");
1345 }
1346 } else {
1347 panic!("Expected dictionary object");
1348 }
1349 }
1350
1351 #[test]
1352 fn test_resolve_reference() {
1353 let pdf_data = create_minimal_pdf();
1354 let cursor = Cursor::new(pdf_data);
1355 let reader = PdfReader::new(cursor).unwrap();
1356 let document = PdfDocument::new(reader);
1357
1358 // Create a reference to the catalog
1359 let ref_obj = PdfObject::Reference(1, 0);
1360
1361 // Resolve it
1362 let resolved = document.resolve(&ref_obj).unwrap();
1363 if let PdfObject::Dictionary(dict) = resolved {
1364 if let Some(PdfObject::Name(name)) = dict.get("Type") {
1365 assert_eq!(name.0, "Catalog");
1366 } else {
1367 panic!("Expected /Type name");
1368 }
1369 } else {
1370 panic!("Expected dictionary object");
1371 }
1372 }
1373
1374 #[test]
1375 fn test_resolve_non_reference() {
1376 let pdf_data = create_minimal_pdf();
1377 let cursor = Cursor::new(pdf_data);
1378 let reader = PdfReader::new(cursor).unwrap();
1379 let document = PdfDocument::new(reader);
1380
1381 // Try to resolve a non-reference object
1382 let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
1383 let resolved = document.resolve(&obj).unwrap();
1384
1385 // Should return the same object
1386 assert_eq!(resolved, obj);
1387 }
1388
1389 #[test]
1390 fn test_invalid_pdf_data() {
1391 let invalid_data = b"This is not a PDF";
1392 let cursor = Cursor::new(invalid_data.to_vec());
1393 let result = PdfReader::new(cursor);
1394
1395 assert!(result.is_err());
1396 }
1397
1398 #[test]
1399 fn test_empty_page_tree() {
1400 // Create PDF with empty page tree
1401 let pdf_data = create_pdf_with_metadata(); // This has 0 pages
1402 let cursor = Cursor::new(pdf_data);
1403 let reader = PdfReader::new(cursor).unwrap();
1404 let document = PdfDocument::new(reader);
1405
1406 let count = document.page_count().unwrap();
1407 assert_eq!(count, 0);
1408
1409 // Try to get a page from empty document
1410 let result = document.get_page(0);
1411 assert!(result.is_err());
1412 }
1413
1414 #[test]
1415 fn test_extract_text_empty_document() {
1416 let pdf_data = create_pdf_with_metadata();
1417 let cursor = Cursor::new(pdf_data);
1418 let reader = PdfReader::new(cursor).unwrap();
1419 let document = PdfDocument::new(reader);
1420
1421 let text = document.extract_text().unwrap();
1422 assert!(text.is_empty());
1423 }
1424
1425 #[test]
1426 fn test_concurrent_access() {
1427 let pdf_data = create_minimal_pdf();
1428 let cursor = Cursor::new(pdf_data);
1429 let reader = PdfReader::new(cursor).unwrap();
1430 let document = PdfDocument::new(reader);
1431
1432 // Access multiple things concurrently
1433 let version = document.version().unwrap();
1434 let count = document.page_count().unwrap();
1435 let page = document.get_page(0).unwrap();
1436
1437 assert_eq!(version, "1.4");
1438 assert_eq!(count, 1);
1439 assert_eq!(page.media_box[2], 612.0);
1440 }
1441
1442 // Additional comprehensive tests
1443 mod comprehensive_tests {
1444 use super::*;
1445
1446 #[test]
1447 fn test_resource_manager_default() {
1448 let resources = ResourceManager::default();
1449 assert!(resources.get_cached((1, 0)).is_none());
1450 }
1451
1452 #[test]
1453 fn test_resource_manager_multiple_objects() {
1454 let resources = ResourceManager::new();
1455
1456 // Cache multiple objects
1457 resources.cache_object((1, 0), PdfObject::Integer(42));
1458 resources.cache_object((2, 0), PdfObject::Boolean(true));
1459 resources.cache_object(
1460 (3, 0),
1461 PdfObject::String(PdfString("test".as_bytes().to_vec())),
1462 );
1463
1464 // Verify all are cached
1465 assert!(resources.get_cached((1, 0)).is_some());
1466 assert!(resources.get_cached((2, 0)).is_some());
1467 assert!(resources.get_cached((3, 0)).is_some());
1468
1469 // Clear and verify empty
1470 resources.clear_cache();
1471 assert!(resources.get_cached((1, 0)).is_none());
1472 assert!(resources.get_cached((2, 0)).is_none());
1473 assert!(resources.get_cached((3, 0)).is_none());
1474 }
1475
1476 #[test]
1477 fn test_resource_manager_object_overwrite() {
1478 let resources = ResourceManager::new();
1479
1480 // Cache an object
1481 resources.cache_object((1, 0), PdfObject::Integer(42));
1482 assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Integer(42)));
1483
1484 // Overwrite with different object
1485 resources.cache_object((1, 0), PdfObject::Boolean(true));
1486 assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Boolean(true)));
1487 }
1488
1489 #[test]
1490 fn test_get_object_caching() {
1491 let pdf_data = create_minimal_pdf();
1492 let cursor = Cursor::new(pdf_data);
1493 let reader = PdfReader::new(cursor).unwrap();
1494 let document = PdfDocument::new(reader);
1495
1496 // Get object first time (should cache)
1497 let obj1 = document.get_object(1, 0).unwrap();
1498
1499 // Get same object again (should use cache)
1500 let obj2 = document.get_object(1, 0).unwrap();
1501
1502 // Objects should be identical
1503 assert_eq!(obj1, obj2);
1504
1505 // Verify it's cached
1506 assert!(document.resources.get_cached((1, 0)).is_some());
1507 }
1508
1509 #[test]
1510 fn test_get_object_different_generations() {
1511 let pdf_data = create_minimal_pdf();
1512 let cursor = Cursor::new(pdf_data);
1513 let reader = PdfReader::new(cursor).unwrap();
1514 let document = PdfDocument::new(reader);
1515
1516 // Get object with generation 0
1517 let _obj1 = document.get_object(1, 0).unwrap();
1518
1519 // Try to get same object with different generation (should fail)
1520 let result = document.get_object(1, 1);
1521 assert!(result.is_err());
1522
1523 // Original should still be cached
1524 assert!(document.resources.get_cached((1, 0)).is_some());
1525 }
1526
1527 #[test]
1528 fn test_get_object_nonexistent() {
1529 let pdf_data = create_minimal_pdf();
1530 let cursor = Cursor::new(pdf_data);
1531 let reader = PdfReader::new(cursor).unwrap();
1532 let document = PdfDocument::new(reader);
1533
1534 // Try to get non-existent object
1535 let result = document.get_object(999, 0);
1536 assert!(result.is_err());
1537 }
1538
1539 #[test]
1540 fn test_resolve_nested_references() {
1541 let pdf_data = create_minimal_pdf();
1542 let cursor = Cursor::new(pdf_data);
1543 let reader = PdfReader::new(cursor).unwrap();
1544 let document = PdfDocument::new(reader);
1545
1546 // Test resolving a reference
1547 let ref_obj = PdfObject::Reference(2, 0);
1548 let resolved = document.resolve(&ref_obj).unwrap();
1549
1550 // Should resolve to the pages object
1551 if let PdfObject::Dictionary(dict) = resolved {
1552 if let Some(PdfObject::Name(name)) = dict.get("Type") {
1553 assert_eq!(name.0, "Pages");
1554 }
1555 }
1556 }
1557
1558 #[test]
1559 fn test_resolve_various_object_types() {
1560 let pdf_data = create_minimal_pdf();
1561 let cursor = Cursor::new(pdf_data);
1562 let reader = PdfReader::new(cursor).unwrap();
1563 let document = PdfDocument::new(reader);
1564
1565 // Test resolving different object types
1566 let test_objects = vec![
1567 PdfObject::Integer(42),
1568 PdfObject::Boolean(true),
1569 PdfObject::String(PdfString("test".as_bytes().to_vec())),
1570 PdfObject::Real(3.14),
1571 PdfObject::Null,
1572 ];
1573
1574 for obj in test_objects {
1575 let resolved = document.resolve(&obj).unwrap();
1576 assert_eq!(resolved, obj);
1577 }
1578 }
1579
1580 #[test]
1581 fn test_get_page_cached() {
1582 let pdf_data = create_minimal_pdf();
1583 let cursor = Cursor::new(pdf_data);
1584 let reader = PdfReader::new(cursor).unwrap();
1585 let document = PdfDocument::new(reader);
1586
1587 // Get page first time
1588 let page1 = document.get_page(0).unwrap();
1589
1590 // Get same page again
1591 let page2 = document.get_page(0).unwrap();
1592
1593 // Should be identical
1594 assert_eq!(page1.media_box, page2.media_box);
1595 assert_eq!(page1.rotation, page2.rotation);
1596 assert_eq!(page1.obj_ref, page2.obj_ref);
1597 }
1598
1599 #[test]
1600 fn test_metadata_caching() {
1601 let pdf_data = create_pdf_with_metadata();
1602 let cursor = Cursor::new(pdf_data);
1603 let reader = PdfReader::new(cursor).unwrap();
1604 let document = PdfDocument::new(reader);
1605
1606 // Get metadata first time
1607 let meta1 = document.metadata().unwrap();
1608
1609 // Get metadata again
1610 let meta2 = document.metadata().unwrap();
1611
1612 // Should be identical
1613 assert_eq!(meta1.title, meta2.title);
1614 assert_eq!(meta1.author, meta2.author);
1615 assert_eq!(meta1.subject, meta2.subject);
1616 assert_eq!(meta1.version, meta2.version);
1617 }
1618
1619 #[test]
1620 fn test_page_tree_initialization() {
1621 let pdf_data = create_minimal_pdf();
1622 let cursor = Cursor::new(pdf_data);
1623 let reader = PdfReader::new(cursor).unwrap();
1624 let document = PdfDocument::new(reader);
1625
1626 // Initially page tree should be None
1627 assert!(document.page_tree.borrow().is_none());
1628
1629 // After getting page count, page tree should be initialized
1630 let _count = document.page_count().unwrap();
1631 // Note: page_tree is private, so we can't directly check it
1632 // But we can verify it works by getting a page
1633 let _page = document.get_page(0).unwrap();
1634 }
1635
1636 #[test]
1637 fn test_get_page_resources() {
1638 let pdf_data = create_minimal_pdf();
1639 let cursor = Cursor::new(pdf_data);
1640 let reader = PdfReader::new(cursor).unwrap();
1641 let document = PdfDocument::new(reader);
1642
1643 let page = document.get_page(0).unwrap();
1644 let resources = document.get_page_resources(&page).unwrap();
1645
1646 // The minimal PDF has empty resources
1647 assert!(resources.is_some());
1648 }
1649
1650 #[test]
1651 fn test_get_page_content_streams_empty() {
1652 let pdf_data = create_minimal_pdf();
1653 let cursor = Cursor::new(pdf_data);
1654 let reader = PdfReader::new(cursor).unwrap();
1655 let document = PdfDocument::new(reader);
1656
1657 let page = document.get_page(0).unwrap();
1658 let streams = document.get_page_content_streams(&page).unwrap();
1659
1660 // Minimal PDF has no content streams
1661 assert!(streams.is_empty());
1662 }
1663
1664 #[test]
1665 fn test_extract_text_from_page() {
1666 let pdf_data = create_minimal_pdf();
1667 let cursor = Cursor::new(pdf_data);
1668 let reader = PdfReader::new(cursor).unwrap();
1669 let document = PdfDocument::new(reader);
1670
1671 let result = document.extract_text_from_page(0);
1672 // Should succeed even with empty page
1673 assert!(result.is_ok());
1674 }
1675
1676 #[test]
1677 fn test_extract_text_from_page_out_of_bounds() {
1678 let pdf_data = create_minimal_pdf();
1679 let cursor = Cursor::new(pdf_data);
1680 let reader = PdfReader::new(cursor).unwrap();
1681 let document = PdfDocument::new(reader);
1682
1683 let result = document.extract_text_from_page(999);
1684 assert!(result.is_err());
1685 }
1686
1687 #[test]
1688 fn test_extract_text_with_options() {
1689 let pdf_data = create_minimal_pdf();
1690 let cursor = Cursor::new(pdf_data);
1691 let reader = PdfReader::new(cursor).unwrap();
1692 let document = PdfDocument::new(reader);
1693
1694 let options = crate::text::ExtractionOptions {
1695 preserve_layout: true,
1696 space_threshold: 0.5,
1697 newline_threshold: 15.0,
1698 ..Default::default()
1699 };
1700
1701 let result = document.extract_text_with_options(options);
1702 assert!(result.is_ok());
1703 }
1704
1705 #[test]
1706 fn test_version_different_pdf_versions() {
1707 // Test with different PDF versions
1708 let versions = vec!["1.3", "1.4", "1.5", "1.6", "1.7"];
1709
1710 for version in versions {
1711 let mut pdf_data = Vec::new();
1712
1713 // PDF header
1714 pdf_data.extend_from_slice(format!("%PDF-{}\n", version).as_bytes());
1715
1716 // Track positions for xref
1717 let obj1_pos = pdf_data.len();
1718
1719 // Catalog object
1720 pdf_data.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
1721
1722 let obj2_pos = pdf_data.len();
1723
1724 // Pages object
1725 pdf_data
1726 .extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
1727
1728 // Cross-reference table
1729 let xref_pos = pdf_data.len();
1730 pdf_data.extend_from_slice(b"xref\n");
1731 pdf_data.extend_from_slice(b"0 3\n");
1732 pdf_data.extend_from_slice(b"0000000000 65535 f \n");
1733 pdf_data.extend_from_slice(format!("{:010} 00000 n \n", obj1_pos).as_bytes());
1734 pdf_data.extend_from_slice(format!("{:010} 00000 n \n", obj2_pos).as_bytes());
1735
1736 // Trailer
1737 pdf_data.extend_from_slice(b"trailer\n");
1738 pdf_data.extend_from_slice(b"<< /Size 3 /Root 1 0 R >>\n");
1739 pdf_data.extend_from_slice(b"startxref\n");
1740 pdf_data.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
1741 pdf_data.extend_from_slice(b"%%EOF\n");
1742
1743 let cursor = Cursor::new(pdf_data);
1744 let reader = PdfReader::new(cursor).unwrap();
1745 let document = PdfDocument::new(reader);
1746
1747 let pdf_version = document.version().unwrap();
1748 assert_eq!(pdf_version, version);
1749 }
1750 }
1751
1752 #[test]
1753 fn test_page_count_zero() {
1754 let pdf_data = create_pdf_with_metadata(); // Has 0 pages
1755 let cursor = Cursor::new(pdf_data);
1756 let reader = PdfReader::new(cursor).unwrap();
1757 let document = PdfDocument::new(reader);
1758
1759 let count = document.page_count().unwrap();
1760 assert_eq!(count, 0);
1761 }
1762
1763 #[test]
1764 fn test_multiple_object_access() {
1765 let pdf_data = create_minimal_pdf();
1766 let cursor = Cursor::new(pdf_data);
1767 let reader = PdfReader::new(cursor).unwrap();
1768 let document = PdfDocument::new(reader);
1769
1770 // Access multiple objects
1771 let catalog = document.get_object(1, 0).unwrap();
1772 let pages = document.get_object(2, 0).unwrap();
1773 let page = document.get_object(3, 0).unwrap();
1774
1775 // Verify they're all different objects
1776 assert_ne!(catalog, pages);
1777 assert_ne!(pages, page);
1778 assert_ne!(catalog, page);
1779 }
1780
1781 #[test]
1782 fn test_error_handling_invalid_object_reference() {
1783 let pdf_data = create_minimal_pdf();
1784 let cursor = Cursor::new(pdf_data);
1785 let reader = PdfReader::new(cursor).unwrap();
1786 let document = PdfDocument::new(reader);
1787
1788 // Try to resolve an invalid reference
1789 let invalid_ref = PdfObject::Reference(999, 0);
1790 let result = document.resolve(&invalid_ref);
1791 assert!(result.is_err());
1792 }
1793
1794 #[test]
1795 fn test_concurrent_metadata_access() {
1796 let pdf_data = create_pdf_with_metadata();
1797 let cursor = Cursor::new(pdf_data);
1798 let reader = PdfReader::new(cursor).unwrap();
1799 let document = PdfDocument::new(reader);
1800
1801 // Access metadata and other properties concurrently
1802 let metadata = document.metadata().unwrap();
1803 let version = document.version().unwrap();
1804 let count = document.page_count().unwrap();
1805
1806 assert_eq!(metadata.title, Some("Test Document".to_string()));
1807 assert_eq!(version, "1.5");
1808 assert_eq!(count, 0);
1809 }
1810
1811 #[test]
1812 fn test_page_properties_comprehensive() {
1813 let pdf_data = create_minimal_pdf();
1814 let cursor = Cursor::new(pdf_data);
1815 let reader = PdfReader::new(cursor).unwrap();
1816 let document = PdfDocument::new(reader);
1817
1818 let page = document.get_page(0).unwrap();
1819
1820 // Test all page properties
1821 assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
1822 assert_eq!(page.crop_box, None);
1823 assert_eq!(page.rotation, 0);
1824 assert_eq!(page.obj_ref, (3, 0));
1825
1826 // Test width/height calculation
1827 assert_eq!(page.width(), 612.0);
1828 assert_eq!(page.height(), 792.0);
1829 }
1830
1831 #[test]
1832 fn test_memory_usage_efficiency() {
1833 let pdf_data = create_minimal_pdf();
1834 let cursor = Cursor::new(pdf_data);
1835 let reader = PdfReader::new(cursor).unwrap();
1836 let document = PdfDocument::new(reader);
1837
1838 // Access same page multiple times
1839 for _ in 0..10 {
1840 let _page = document.get_page(0).unwrap();
1841 }
1842
1843 // Should only have one copy in cache
1844 let page_count = document.page_count().unwrap();
1845 assert_eq!(page_count, 1);
1846 }
1847
1848 #[test]
1849 fn test_reader_borrow_safety() {
1850 let pdf_data = create_minimal_pdf();
1851 let cursor = Cursor::new(pdf_data);
1852 let reader = PdfReader::new(cursor).unwrap();
1853 let document = PdfDocument::new(reader);
1854
1855 // Multiple concurrent borrows should work
1856 let version = document.version().unwrap();
1857 let count = document.page_count().unwrap();
1858 let metadata = document.metadata().unwrap();
1859
1860 assert_eq!(version, "1.4");
1861 assert_eq!(count, 1);
1862 assert!(metadata.title.is_none());
1863 }
1864
1865 #[test]
1866 fn test_cache_consistency() {
1867 let pdf_data = create_minimal_pdf();
1868 let cursor = Cursor::new(pdf_data);
1869 let reader = PdfReader::new(cursor).unwrap();
1870 let document = PdfDocument::new(reader);
1871
1872 // Get object and verify caching
1873 let obj1 = document.get_object(1, 0).unwrap();
1874 let cached = document.resources.get_cached((1, 0)).unwrap();
1875
1876 assert_eq!(obj1, cached);
1877
1878 // Clear cache and get object again
1879 document.resources.clear_cache();
1880 let obj2 = document.get_object(1, 0).unwrap();
1881
1882 // Should be same content but loaded fresh
1883 assert_eq!(obj1, obj2);
1884 }
1885 }
1886}