oxidize_pdf/parser/document.rs
1//! PDF Document wrapper - High-level interface for PDF parsing and manipulation
2//!
3//! This module provides a robust, high-level interface for working with PDF documents.
4//! It solves Rust's borrow checker challenges through careful use of interior mutability
5//! (RefCell) and separation of concerns between parsing, caching, and page access.
6//!
7//! # Architecture
8//!
9//! The module uses a layered architecture:
10//! - **PdfDocument**: Main entry point with RefCell-based state management
11//! - **ResourceManager**: Centralized object caching with interior mutability
12//! - **PdfReader**: Low-level file access (wrapped in RefCell)
13//! - **PageTree**: Lazy-loaded page navigation
14//!
15//! # Key Features
16//!
17//! - **Automatic caching**: Objects are cached after first access
18//! - **Resource management**: Shared resources are handled efficiently
19//! - **Page navigation**: Fast access to any page in the document
20//! - **Reference resolution**: Automatic resolution of indirect references
21//! - **Text extraction**: Built-in support for extracting text from pages
22//!
23//! # Example
24//!
25//! ```rust,no_run
26//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
27//!
28//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
29//! // Open a PDF document
30//! let reader = PdfReader::open("document.pdf")?;
31//! let document = PdfDocument::new(reader);
32//!
33//! // Get document information
34//! let page_count = document.page_count()?;
35//! let metadata = document.metadata()?;
36//! println!("Title: {:?}", metadata.title);
37//! println!("Pages: {}", page_count);
38//!
39//! // Access a specific page
40//! let page = document.get_page(0)?;
41//! println!("Page size: {}x{}", page.width(), page.height());
42//!
43//! // Extract text from all pages
44//! let extracted_text = document.extract_text()?;
45//! for (i, page_text) in extracted_text.iter().enumerate() {
46//! println!("Page {}: {}", i + 1, page_text.text);
47//! }
48//! # Ok(())
49//! # }
50//! ```
51
52use super::objects::{PdfDictionary, PdfObject};
53use super::page_tree::{PageTree, ParsedPage};
54use super::reader::PdfReader;
55use super::{ParseError, ParseResult};
56use std::cell::RefCell;
57use std::collections::HashMap;
58use std::io::{Read, Seek};
59use std::rc::Rc;
60
61/// Resource manager for efficient PDF object caching.
62///
63/// The ResourceManager provides centralized caching of PDF objects to avoid
64/// repeated parsing and to share resources between different parts of the document.
65/// It uses RefCell for interior mutability, allowing multiple immutable references
66/// to the document while still being able to update the cache.
67///
68/// # Caching Strategy
69///
70/// - Objects are cached on first access
71/// - Cache persists for the lifetime of the document
72/// - Manual cache clearing is supported for memory management
73///
74/// # Example
75///
76/// ```rust,no_run
77/// use oxidize_pdf::parser::document::ResourceManager;
78///
79/// let resources = ResourceManager::new();
80///
81/// // Objects are cached automatically when accessed through PdfDocument
82/// // Manual cache management:
83/// resources.clear_cache(); // Free memory when needed
84/// ```
85pub struct ResourceManager {
86 /// Cached objects indexed by (object_number, generation_number)
87 object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
88}
89
90impl Default for ResourceManager {
91 fn default() -> Self {
92 Self::new()
93 }
94}
95
96impl ResourceManager {
97 /// Create a new resource manager
98 pub fn new() -> Self {
99 Self {
100 object_cache: RefCell::new(HashMap::new()),
101 }
102 }
103
104 /// Get an object from cache if available.
105 ///
106 /// # Arguments
107 ///
108 /// * `obj_ref` - Object reference (object_number, generation_number)
109 ///
110 /// # Returns
111 ///
112 /// Cloned object if cached, None otherwise.
113 ///
114 /// # Example
115 ///
116 /// ```rust,no_run
117 /// # use oxidize_pdf::parser::document::ResourceManager;
118 /// # let resources = ResourceManager::new();
119 /// if let Some(obj) = resources.get_cached((10, 0)) {
120 /// println!("Object 10 0 R found in cache");
121 /// }
122 /// ```
123 pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
124 self.object_cache.borrow().get(&obj_ref).cloned()
125 }
126
127 /// Cache an object for future access.
128 ///
129 /// # Arguments
130 ///
131 /// * `obj_ref` - Object reference (object_number, generation_number)
132 /// * `obj` - The PDF object to cache
133 ///
134 /// # Example
135 ///
136 /// ```rust,no_run
137 /// # use oxidize_pdf::parser::document::ResourceManager;
138 /// # use oxidize_pdf::parser::objects::PdfObject;
139 /// # let resources = ResourceManager::new();
140 /// resources.cache_object((10, 0), PdfObject::Integer(42));
141 /// ```
142 pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
143 self.object_cache.borrow_mut().insert(obj_ref, obj);
144 }
145
146 /// Clear all cached objects to free memory.
147 ///
148 /// Use this when processing large documents to manage memory usage.
149 ///
150 /// # Example
151 ///
152 /// ```rust,no_run
153 /// # use oxidize_pdf::parser::document::ResourceManager;
154 /// # let resources = ResourceManager::new();
155 /// // After processing many pages
156 /// resources.clear_cache();
157 /// println!("Cache cleared to free memory");
158 /// ```
159 pub fn clear_cache(&self) {
160 self.object_cache.borrow_mut().clear();
161 }
162}
163
164/// High-level PDF document interface for parsing and manipulation.
165///
166/// `PdfDocument` provides a clean, safe API for working with PDF files.
167/// It handles the complexity of PDF structure, object references, and resource
168/// management behind a simple interface.
169///
170/// # Type Parameter
171///
172/// * `R` - The reader type (must implement Read + Seek)
173///
174/// # Architecture Benefits
175///
176/// - **RefCell Usage**: Allows multiple parts of the API to access the document
177/// - **Lazy Loading**: Pages and resources are loaded on demand
178/// - **Automatic Caching**: Frequently accessed objects are cached
179/// - **Safe API**: Borrow checker issues are handled internally
180///
181/// # Example
182///
183/// ```rust,no_run
184/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
185/// use std::fs::File;
186///
187/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
188/// // From a file
189/// let reader = PdfReader::open("document.pdf")?;
190/// let document = PdfDocument::new(reader);
191///
192/// // From any Read + Seek source
193/// let file = File::open("document.pdf")?;
194/// let reader = PdfReader::new(file)?;
195/// let document = PdfDocument::new(reader);
196///
197/// // Use the document
198/// let page_count = document.page_count()?;
199/// for i in 0..page_count {
200/// let page = document.get_page(i)?;
201/// // Process page...
202/// }
203/// # Ok(())
204/// # }
205/// ```
206pub struct PdfDocument<R: Read + Seek> {
207 /// The underlying PDF reader wrapped for interior mutability
208 reader: RefCell<PdfReader<R>>,
209 /// Page tree navigator (lazily initialized)
210 page_tree: RefCell<Option<PageTree>>,
211 /// Shared resource manager for object caching
212 resources: Rc<ResourceManager>,
213 /// Cached document metadata to avoid repeated parsing
214 metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
215}
216
217impl<R: Read + Seek> PdfDocument<R> {
218 /// Create a new PDF document from a reader
219 pub fn new(reader: PdfReader<R>) -> Self {
220 Self {
221 reader: RefCell::new(reader),
222 page_tree: RefCell::new(None),
223 resources: Rc::new(ResourceManager::new()),
224 metadata_cache: RefCell::new(None),
225 }
226 }
227
228 /// Get the PDF version of the document.
229 ///
230 /// # Returns
231 ///
232 /// PDF version string (e.g., "1.4", "1.7", "2.0")
233 ///
234 /// # Example
235 ///
236 /// ```rust,no_run
237 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
238 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
239 /// # let reader = PdfReader::open("document.pdf")?;
240 /// # let document = PdfDocument::new(reader);
241 /// let version = document.version()?;
242 /// println!("PDF version: {}", version);
243 /// # Ok(())
244 /// # }
245 /// ```
246 pub fn version(&self) -> ParseResult<String> {
247 Ok(self.reader.borrow().version().to_string())
248 }
249
250 /// Get the total number of pages in the document.
251 ///
252 /// # Returns
253 ///
254 /// The page count as an unsigned 32-bit integer.
255 ///
256 /// # Errors
257 ///
258 /// Returns an error if the page tree is malformed or missing.
259 ///
260 /// # Example
261 ///
262 /// ```rust,no_run
263 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
264 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
265 /// # let reader = PdfReader::open("document.pdf")?;
266 /// # let document = PdfDocument::new(reader);
267 /// let count = document.page_count()?;
268 /// println!("Document has {} pages", count);
269 ///
270 /// // Iterate through all pages
271 /// for i in 0..count {
272 /// let page = document.get_page(i)?;
273 /// // Process page...
274 /// }
275 /// # Ok(())
276 /// # }
277 /// ```
278 pub fn page_count(&self) -> ParseResult<u32> {
279 self.reader.borrow_mut().page_count()
280 }
281
282 /// Get document metadata including title, author, creation date, etc.
283 ///
284 /// Metadata is cached after first access for performance.
285 ///
286 /// # Returns
287 ///
288 /// A `DocumentMetadata` struct containing all available metadata fields.
289 ///
290 /// # Example
291 ///
292 /// ```rust,no_run
293 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
294 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
295 /// # let reader = PdfReader::open("document.pdf")?;
296 /// # let document = PdfDocument::new(reader);
297 /// let metadata = document.metadata()?;
298 ///
299 /// if let Some(title) = &metadata.title {
300 /// println!("Title: {}", title);
301 /// }
302 /// if let Some(author) = &metadata.author {
303 /// println!("Author: {}", author);
304 /// }
305 /// if let Some(creation_date) = &metadata.creation_date {
306 /// println!("Created: {}", creation_date);
307 /// }
308 /// println!("PDF Version: {}", metadata.version);
309 /// # Ok(())
310 /// # }
311 /// ```
312 pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
313 // Check cache first
314 if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
315 return Ok(metadata.clone());
316 }
317
318 // Load metadata
319 let metadata = self.reader.borrow_mut().metadata()?;
320 self.metadata_cache.borrow_mut().replace(metadata.clone());
321 Ok(metadata)
322 }
323
324 /// Initialize the page tree if not already done
325 fn ensure_page_tree(&self) -> ParseResult<()> {
326 if self.page_tree.borrow().is_none() {
327 let page_count = self.page_count()?;
328 let pages_dict = self.load_pages_dict()?;
329 let page_tree = PageTree::new_with_pages_dict(page_count, pages_dict);
330 self.page_tree.borrow_mut().replace(page_tree);
331 }
332 Ok(())
333 }
334
335 /// Load the pages dictionary
336 fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
337 let mut reader = self.reader.borrow_mut();
338 let pages = reader.pages()?;
339 Ok(pages.clone())
340 }
341
342 /// Get a page by index (0-based).
343 ///
344 /// Pages are cached after first access. This method handles page tree
345 /// traversal and property inheritance automatically.
346 ///
347 /// # Arguments
348 ///
349 /// * `index` - Zero-based page index (0 to page_count-1)
350 ///
351 /// # Returns
352 ///
353 /// A complete `ParsedPage` with all properties and inherited resources.
354 ///
355 /// # Errors
356 ///
357 /// Returns an error if:
358 /// - Index is out of bounds
359 /// - Page tree is malformed
360 /// - Required page properties are missing
361 ///
362 /// # Example
363 ///
364 /// ```rust,no_run
365 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
366 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
367 /// # let reader = PdfReader::open("document.pdf")?;
368 /// # let document = PdfDocument::new(reader);
369 /// // Get the first page
370 /// let page = document.get_page(0)?;
371 ///
372 /// // Access page properties
373 /// println!("Page size: {}x{} points", page.width(), page.height());
374 /// println!("Rotation: {}°", page.rotation);
375 ///
376 /// // Get content streams
377 /// let streams = page.content_streams_with_document(&document)?;
378 /// println!("Page has {} content streams", streams.len());
379 /// # Ok(())
380 /// # }
381 /// ```
382 pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
383 self.ensure_page_tree()?;
384
385 // First check if page is already loaded
386 if let Some(page_tree) = self.page_tree.borrow().as_ref() {
387 if let Some(page) = page_tree.get_cached_page(index) {
388 return Ok(page.clone());
389 }
390 }
391
392 // Load the page
393 let page = self.load_page_at_index(index)?;
394
395 // Cache it
396 if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
397 page_tree.cache_page(index, page.clone());
398 }
399
400 Ok(page)
401 }
402
403 /// Load a specific page by index
404 fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
405 // Get the pages root
406 let pages_dict = self.load_pages_dict()?;
407
408 // Navigate to the specific page
409 let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
410
411 Ok(page_info)
412 }
413
414 /// Find a page in the page tree
415 fn find_page_in_tree(
416 &self,
417 node: &PdfDictionary,
418 target_index: u32,
419 current_index: u32,
420 inherited: Option<&PdfDictionary>,
421 ) -> ParseResult<ParsedPage> {
422 let node_type = node
423 .get_type()
424 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
425
426 match node_type {
427 "Pages" => {
428 // This is a page tree node
429 let kids = node
430 .get("Kids")
431 .and_then(|obj| obj.as_array())
432 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
433
434 // Merge inherited attributes
435 let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
436
437 // Inheritable attributes
438 for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
439 if let Some(value) = node.get(key) {
440 if !merged_inherited.contains_key(key) {
441 merged_inherited.insert(key.to_string(), value.clone());
442 }
443 }
444 }
445
446 // Find which kid contains our target page
447 let mut current_idx = current_index;
448 for kid_ref in &kids.0 {
449 let kid_ref =
450 kid_ref
451 .as_reference()
452 .ok_or_else(|| ParseError::SyntaxError {
453 position: 0,
454 message: "Kids array must contain references".to_string(),
455 })?;
456
457 // Get the kid object
458 let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
459 let kid_dict = kid_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
460 position: 0,
461 message: "Page tree node must be a dictionary".to_string(),
462 })?;
463
464 let kid_type = kid_dict
465 .get_type()
466 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
467
468 let count = if kid_type == "Pages" {
469 kid_dict
470 .get("Count")
471 .and_then(|obj| obj.as_integer())
472 .ok_or_else(|| ParseError::MissingKey("Count".to_string()))?
473 as u32
474 } else {
475 1
476 };
477
478 if target_index < current_idx + count {
479 // Found the right subtree/page
480 if kid_type == "Page" {
481 // This is the page we want
482 return self.create_parsed_page(
483 kid_ref,
484 kid_dict,
485 Some(&merged_inherited),
486 );
487 } else {
488 // Recurse into this subtree
489 return self.find_page_in_tree(
490 kid_dict,
491 target_index,
492 current_idx,
493 Some(&merged_inherited),
494 );
495 }
496 }
497
498 current_idx += count;
499 }
500
501 Err(ParseError::SyntaxError {
502 position: 0,
503 message: "Page not found in tree".to_string(),
504 })
505 }
506 "Page" => {
507 // This is a page object
508 if target_index != current_index {
509 return Err(ParseError::SyntaxError {
510 position: 0,
511 message: "Page index mismatch".to_string(),
512 });
513 }
514
515 // We need the reference, but we don't have it here
516 // This case shouldn't happen if we're navigating properly
517 Err(ParseError::SyntaxError {
518 position: 0,
519 message: "Direct page object without reference".to_string(),
520 })
521 }
522 _ => Err(ParseError::SyntaxError {
523 position: 0,
524 message: format!("Invalid page tree node type: {node_type}"),
525 }),
526 }
527 }
528
529 /// Create a ParsedPage from a page dictionary
530 fn create_parsed_page(
531 &self,
532 obj_ref: (u32, u16),
533 page_dict: &PdfDictionary,
534 inherited: Option<&PdfDictionary>,
535 ) -> ParseResult<ParsedPage> {
536 // Extract page attributes
537 let media_box = self
538 .get_rectangle(page_dict, inherited, "MediaBox")?
539 .ok_or_else(|| ParseError::MissingKey("MediaBox".to_string()))?;
540
541 let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
542
543 let rotation = self
544 .get_integer(page_dict, inherited, "Rotate")?
545 .unwrap_or(0) as i32;
546
547 // Get inherited resources
548 let inherited_resources = if let Some(inherited) = inherited {
549 inherited
550 .get("Resources")
551 .and_then(|r| r.as_dict())
552 .cloned()
553 } else {
554 None
555 };
556
557 Ok(ParsedPage {
558 obj_ref,
559 dict: page_dict.clone(),
560 inherited_resources,
561 media_box,
562 crop_box,
563 rotation,
564 })
565 }
566
567 /// Get a rectangle value
568 fn get_rectangle(
569 &self,
570 node: &PdfDictionary,
571 inherited: Option<&PdfDictionary>,
572 key: &str,
573 ) -> ParseResult<Option<[f64; 4]>> {
574 let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
575
576 if let Some(array) = array.and_then(|obj| obj.as_array()) {
577 if array.len() != 4 {
578 return Err(ParseError::SyntaxError {
579 position: 0,
580 message: format!("{key} must have 4 elements"),
581 });
582 }
583
584 let rect = [
585 array.get(0).unwrap().as_real().unwrap_or(0.0),
586 array.get(1).unwrap().as_real().unwrap_or(0.0),
587 array.get(2).unwrap().as_real().unwrap_or(0.0),
588 array.get(3).unwrap().as_real().unwrap_or(0.0),
589 ];
590
591 Ok(Some(rect))
592 } else {
593 Ok(None)
594 }
595 }
596
597 /// Get an integer value
598 fn get_integer(
599 &self,
600 node: &PdfDictionary,
601 inherited: Option<&PdfDictionary>,
602 key: &str,
603 ) -> ParseResult<Option<i64>> {
604 let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
605
606 Ok(value.and_then(|obj| obj.as_integer()))
607 }
608
609 /// Get an object by its reference numbers.
610 ///
611 /// This method first checks the cache, then loads from the file if needed.
612 /// Objects are automatically cached after loading.
613 ///
614 /// # Arguments
615 ///
616 /// * `obj_num` - Object number
617 /// * `gen_num` - Generation number
618 ///
619 /// # Returns
620 ///
621 /// The resolved PDF object.
622 ///
623 /// # Errors
624 ///
625 /// Returns an error if:
626 /// - Object doesn't exist
627 /// - Object is part of an encrypted object stream
628 /// - File is corrupted
629 ///
630 /// # Example
631 ///
632 /// ```rust,no_run
633 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
634 /// # use oxidize_pdf::parser::objects::PdfObject;
635 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
636 /// # let reader = PdfReader::open("document.pdf")?;
637 /// # let document = PdfDocument::new(reader);
638 /// // Get object 10 0 R
639 /// let obj = document.get_object(10, 0)?;
640 ///
641 /// // Check object type
642 /// match obj {
643 /// PdfObject::Dictionary(dict) => {
644 /// println!("Object is a dictionary with {} entries", dict.0.len());
645 /// }
646 /// PdfObject::Stream(stream) => {
647 /// println!("Object is a stream");
648 /// }
649 /// _ => {}
650 /// }
651 /// # Ok(())
652 /// # }
653 /// ```
654 pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
655 // Check resource cache first
656 if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
657 return Ok(obj);
658 }
659
660 // Load from reader
661 let obj = {
662 let mut reader = self.reader.borrow_mut();
663 reader.get_object(obj_num, gen_num)?.clone()
664 };
665
666 // Cache it
667 self.resources.cache_object((obj_num, gen_num), obj.clone());
668
669 Ok(obj)
670 }
671
672 /// Resolve a reference to get the actual object.
673 ///
674 /// If the input is a Reference, fetches the referenced object.
675 /// Otherwise returns a clone of the input object.
676 ///
677 /// # Arguments
678 ///
679 /// * `obj` - The object to resolve (may be a Reference or direct object)
680 ///
681 /// # Returns
682 ///
    /// The referenced object when `obj` is a reference (one level of
    /// indirection is followed), otherwise a clone of `obj`.
684 ///
685 /// # Example
686 ///
687 /// ```rust,no_run
688 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
689 /// # use oxidize_pdf::parser::objects::PdfObject;
690 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
691 /// # let reader = PdfReader::open("document.pdf")?;
692 /// # let document = PdfDocument::new(reader);
693 /// # let page = document.get_page(0)?;
694 /// // Contents might be a reference or direct object
695 /// if let Some(contents) = page.dict.get("Contents") {
696 /// let resolved = document.resolve(contents)?;
697 /// match resolved {
698 /// PdfObject::Stream(_) => println!("Single content stream"),
699 /// PdfObject::Array(_) => println!("Multiple content streams"),
700 /// _ => println!("Unexpected content type"),
701 /// }
702 /// }
703 /// # Ok(())
704 /// # }
705 /// ```
706 pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
707 match obj {
708 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
709 _ => Ok(obj.clone()),
710 }
711 }
712
713 /// Get content streams for a specific page.
714 ///
715 /// This method handles both single streams and arrays of streams,
716 /// automatically decompressing them according to their filters.
717 ///
718 /// # Arguments
719 ///
720 /// * `page` - The page to get content streams from
721 ///
722 /// # Returns
723 ///
724 /// Vector of decompressed content stream data ready for parsing.
725 ///
726 /// # Example
727 ///
728 /// ```rust,no_run
729 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
730 /// # use oxidize_pdf::parser::content::ContentParser;
731 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
732 /// # let reader = PdfReader::open("document.pdf")?;
733 /// # let document = PdfDocument::new(reader);
734 /// let page = document.get_page(0)?;
735 /// let streams = document.get_page_content_streams(&page)?;
736 ///
737 /// // Parse content streams
738 /// for stream_data in streams {
739 /// let operations = ContentParser::parse(&stream_data)?;
740 /// println!("Stream has {} operations", operations.len());
741 /// }
742 /// # Ok(())
743 /// # }
744 /// ```
745 /// Get page resources dictionary.
746 ///
747 /// This method returns the resources dictionary for a page, which may include
748 /// fonts, images (XObjects), patterns, color spaces, and other resources.
749 ///
750 /// # Arguments
751 ///
752 /// * `page` - The page to get resources from
753 ///
754 /// # Returns
755 ///
756 /// Optional resources dictionary if the page has resources.
757 ///
758 /// # Example
759 ///
760 /// ```rust,no_run
761 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader, PdfObject, PdfName};
762 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
763 /// # let reader = PdfReader::open("document.pdf")?;
764 /// # let document = PdfDocument::new(reader);
765 /// let page = document.get_page(0)?;
766 /// if let Some(resources) = document.get_page_resources(&page)? {
767 /// // Check for images (XObjects)
768 /// if let Some(PdfObject::Dictionary(xobjects)) = resources.0.get(&PdfName("XObject".to_string())) {
769 /// for (name, _) in xobjects.0.iter() {
770 /// println!("Found XObject: {}", name.0);
771 /// }
772 /// }
773 /// }
774 /// # Ok(())
775 /// # }
776 /// ```
777 pub fn get_page_resources<'a>(
778 &self,
779 page: &'a ParsedPage,
780 ) -> ParseResult<Option<&'a PdfDictionary>> {
781 Ok(page.get_resources())
782 }
783
784 pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
785 let mut streams = Vec::new();
786
787 if let Some(contents) = page.dict.get("Contents") {
788 let resolved_contents = self.resolve(contents)?;
789
790 match &resolved_contents {
791 PdfObject::Stream(stream) => {
792 streams.push(stream.decode()?);
793 }
794 PdfObject::Array(array) => {
795 for item in &array.0 {
796 let resolved = self.resolve(item)?;
797 if let PdfObject::Stream(stream) = resolved {
798 streams.push(stream.decode()?);
799 }
800 }
801 }
802 _ => {
803 return Err(ParseError::SyntaxError {
804 position: 0,
805 message: "Contents must be a stream or array of streams".to_string(),
806 })
807 }
808 }
809 }
810
811 Ok(streams)
812 }
813
814 /// Extract text from all pages in the document.
815 ///
816 /// Uses the default text extraction settings. For custom settings,
817 /// use `extract_text_with_options`.
818 ///
819 /// # Returns
820 ///
821 /// A vector of `ExtractedText`, one for each page in the document.
822 ///
823 /// # Example
824 ///
825 /// ```rust,no_run
826 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
827 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
828 /// # let reader = PdfReader::open("document.pdf")?;
829 /// # let document = PdfDocument::new(reader);
830 /// let extracted_pages = document.extract_text()?;
831 ///
832 /// for (page_num, page_text) in extracted_pages.iter().enumerate() {
833 /// println!("=== Page {} ===", page_num + 1);
834 /// println!("{}", page_text.text);
835 /// println!();
836 /// }
837 /// # Ok(())
838 /// # }
839 /// ```
840 pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
841 let extractor = crate::text::TextExtractor::new();
842 extractor.extract_from_document(self)
843 }
844
845 /// Extract text from a specific page.
846 ///
847 /// # Arguments
848 ///
849 /// * `page_index` - Zero-based page index
850 ///
851 /// # Returns
852 ///
853 /// Extracted text with optional position information.
854 ///
855 /// # Example
856 ///
857 /// ```rust,no_run
858 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
859 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
860 /// # let reader = PdfReader::open("document.pdf")?;
861 /// # let document = PdfDocument::new(reader);
862 /// // Extract text from first page only
863 /// let page_text = document.extract_text_from_page(0)?;
864 /// println!("First page text: {}", page_text.text);
865 ///
866 /// // Access text fragments with positions (if preserved)
867 /// for fragment in &page_text.fragments {
868 /// println!("'{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
869 /// }
870 /// # Ok(())
871 /// # }
872 /// ```
873 pub fn extract_text_from_page(
874 &self,
875 page_index: u32,
876 ) -> ParseResult<crate::text::ExtractedText> {
877 let extractor = crate::text::TextExtractor::new();
878 extractor.extract_from_page(self, page_index)
879 }
880
881 /// Extract text with custom extraction options.
882 ///
883 /// Allows fine control over text extraction behavior including
884 /// layout preservation, spacing thresholds, and more.
885 ///
886 /// # Arguments
887 ///
888 /// * `options` - Text extraction configuration
889 ///
890 /// # Returns
891 ///
892 /// A vector of `ExtractedText`, one for each page.
893 ///
894 /// # Example
895 ///
896 /// ```rust,no_run
897 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
898 /// # use oxidize_pdf::text::ExtractionOptions;
899 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
900 /// # let reader = PdfReader::open("document.pdf")?;
901 /// # let document = PdfDocument::new(reader);
902 /// // Configure extraction to preserve layout
903 /// let options = ExtractionOptions {
904 /// preserve_layout: true,
905 /// space_threshold: 0.3,
906 /// newline_threshold: 10.0,
907 /// ..Default::default()
908 /// };
909 ///
910 /// let extracted_pages = document.extract_text_with_options(options)?;
911 ///
912 /// // Text fragments will include position information
913 /// for page_text in extracted_pages {
914 /// for fragment in &page_text.fragments {
915 /// println!("{:?}", fragment);
916 /// }
917 /// }
918 /// # Ok(())
919 /// # }
920 /// ```
921 pub fn extract_text_with_options(
922 &self,
923 options: crate::text::ExtractionOptions,
924 ) -> ParseResult<Vec<crate::text::ExtractedText>> {
925 let extractor = crate::text::TextExtractor::with_options(options);
926 extractor.extract_from_document(self)
927 }
928}
929
930#[cfg(test)]
931mod tests {
932 use super::*;
933 use crate::parser::objects::{PdfObject, PdfString};
934 use std::io::Cursor;
935
936 // Helper function to create a minimal PDF in memory
937 fn create_minimal_pdf() -> Vec<u8> {
938 let mut pdf = Vec::new();
939
940 // PDF header
941 pdf.extend_from_slice(b"%PDF-1.4\n");
942
943 // Catalog object
944 pdf.extend_from_slice(b"1 0 obj\n");
945 pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
946 pdf.extend_from_slice(b"endobj\n");
947
948 // Pages object
949 pdf.extend_from_slice(b"2 0 obj\n");
950 pdf.extend_from_slice(b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n");
951 pdf.extend_from_slice(b"endobj\n");
952
953 // Page object
954 pdf.extend_from_slice(b"3 0 obj\n");
955 pdf.extend_from_slice(
956 b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>\n",
957 );
958 pdf.extend_from_slice(b"endobj\n");
959
960 // Cross-reference table
961 let xref_pos = pdf.len();
962 pdf.extend_from_slice(b"xref\n");
963 pdf.extend_from_slice(b"0 4\n");
964 pdf.extend_from_slice(b"0000000000 65535 f \n");
965 pdf.extend_from_slice(b"0000000009 00000 n \n");
966 pdf.extend_from_slice(b"0000000058 00000 n \n");
967 pdf.extend_from_slice(b"0000000115 00000 n \n");
968
969 // Trailer
970 pdf.extend_from_slice(b"trailer\n");
971 pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R >>\n");
972 pdf.extend_from_slice(b"startxref\n");
973 pdf.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
974 pdf.extend_from_slice(b"%%EOF\n");
975
976 pdf
977 }
978
979 // Helper to create a PDF with metadata
980 fn create_pdf_with_metadata() -> Vec<u8> {
981 let mut pdf = Vec::new();
982
983 // PDF header
984 pdf.extend_from_slice(b"%PDF-1.5\n");
985
986 // Record positions for xref
987 let obj1_pos = pdf.len();
988
989 // Catalog object
990 pdf.extend_from_slice(b"1 0 obj\n");
991 pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
992 pdf.extend_from_slice(b"endobj\n");
993
994 let obj2_pos = pdf.len();
995
996 // Pages object
997 pdf.extend_from_slice(b"2 0 obj\n");
998 pdf.extend_from_slice(b"<< /Type /Pages /Kids [] /Count 0 >>\n");
999 pdf.extend_from_slice(b"endobj\n");
1000
1001 let obj3_pos = pdf.len();
1002
1003 // Info object
1004 pdf.extend_from_slice(b"3 0 obj\n");
1005 pdf.extend_from_slice(
1006 b"<< /Title (Test Document) /Author (Test Author) /Subject (Test Subject) >>\n",
1007 );
1008 pdf.extend_from_slice(b"endobj\n");
1009
1010 // Cross-reference table
1011 let xref_pos = pdf.len();
1012 pdf.extend_from_slice(b"xref\n");
1013 pdf.extend_from_slice(b"0 4\n");
1014 pdf.extend_from_slice(b"0000000000 65535 f \n");
1015 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj1_pos).as_bytes());
1016 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj2_pos).as_bytes());
1017 pdf.extend_from_slice(format!("{:010} 00000 n \n", obj3_pos).as_bytes());
1018
1019 // Trailer
1020 pdf.extend_from_slice(b"trailer\n");
1021 pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R /Info 3 0 R >>\n");
1022 pdf.extend_from_slice(b"startxref\n");
1023 pdf.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
1024 pdf.extend_from_slice(b"%%EOF\n");
1025
1026 pdf
1027 }
1028
/// A freshly constructed document has not populated its page tree or
/// metadata cache yet.
#[test]
fn test_pdf_document_new() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Each borrow is confined to its own statement so no guard outlives the check.
    let page_tree_unset = doc.page_tree.borrow().is_none();
    let metadata_unset = doc.metadata_cache.borrow().is_none();

    assert!(page_tree_unset);
    assert!(metadata_unset);
}
1040
/// The minimal fixture is expected to report PDF version 1.4.
#[test]
fn test_version() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    assert_eq!(doc.version().unwrap(), "1.4");
}
1051
/// The minimal fixture is expected to contain exactly one page.
#[test]
fn test_page_count() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    assert_eq!(doc.page_count().unwrap(), 1);
}
1062
/// Title, author and subject written by `create_pdf_with_metadata` must be
/// readable through `metadata()`, and a second call must agree with the first.
#[test]
fn test_metadata() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let doc = PdfDocument::new(reader);

    let first = doc.metadata().unwrap();
    assert_eq!(first.title, Some(String::from("Test Document")));
    assert_eq!(first.author, Some(String::from("Test Author")));
    assert_eq!(first.subject, Some(String::from("Test Subject")));

    // A repeated lookup has to yield the same title.
    let second = doc.metadata().unwrap();
    assert_eq!(first.title, second.title);
}
1079
/// Page 0 of the minimal fixture has a 612x792 media box, and fetching the
/// page twice yields the same media box.
#[test]
fn test_get_page() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let first = doc.get_page(0).unwrap();
    assert_eq!(first.media_box, [0.0, 0.0, 612.0, 792.0]);

    // Second lookup of the same index must agree with the first.
    let again = doc.get_page(0).unwrap();
    assert_eq!(first.media_box, again.media_box);
}
1095
/// Requesting a page index that does not exist must produce an error.
#[test]
fn test_get_page_out_of_bounds() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // Index 10 is not a valid page of the minimal fixture.
    assert!(doc.get_page(10).is_err());
}
1107
/// Walks one entry through the cache life cycle: absent, stored, retrieved,
/// then gone again after `clear_cache`.
#[test]
fn test_resource_manager_caching() {
    let manager = ResourceManager::new();
    let key = (1, 0);
    let value = PdfObject::String(PdfString(b"Test".to_vec()));

    // Nothing is cached before the first insert.
    assert!(manager.get_cached(key).is_none());

    manager.cache_object(key, value.clone());
    assert_eq!(manager.get_cached(key).unwrap(), value);

    // Clearing removes the entry again.
    manager.clear_cache();
    assert!(manager.get_cached(key).is_none());
}
1127
/// Object `1 0` of the minimal fixture must be a dictionary whose `/Type`
/// entry is the name `Catalog`.
#[test]
fn test_get_object() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let catalog = doc.get_object(1, 0).unwrap();

    // Anything other than a dictionary is a test failure.
    let entries = match catalog {
        PdfObject::Dictionary(entries) => entries,
        _ => panic!("Expected dictionary object"),
    };

    match entries.get("Type") {
        Some(PdfObject::Name(type_name)) => assert_eq!(type_name.0, "Catalog"),
        _ => panic!("Expected /Type name"),
    }
}
1147
/// Resolving the indirect reference `1 0 R` must yield the catalog dictionary.
#[test]
fn test_resolve_reference() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Indirect reference pointing at object 1, generation 0.
    let catalog_ref = PdfObject::Reference(1, 0);
    let target = doc.resolve(&catalog_ref).unwrap();

    let entries = match target {
        PdfObject::Dictionary(entries) => entries,
        _ => panic!("Expected dictionary object"),
    };

    match entries.get("Type") {
        Some(PdfObject::Name(type_name)) => assert_eq!(type_name.0, "Catalog"),
        _ => panic!("Expected /Type name"),
    }
}
1170
/// `resolve` applied to a direct (non-reference) object must hand back an
/// equal object.
#[test]
fn test_resolve_non_reference() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // A plain string object; there is nothing to dereference.
    let literal = PdfObject::String(PdfString(b"Test".to_vec()));
    let outcome = doc.resolve(&literal).unwrap();

    assert_eq!(outcome, literal);
}
1185
/// Bytes that are not a PDF must be rejected when the reader is constructed.
#[test]
fn test_invalid_pdf_data() {
    let garbage = b"This is not a PDF".to_vec();
    let outcome = PdfReader::new(Cursor::new(garbage));

    assert!(outcome.is_err());
}
1194
/// A document whose page tree has no kids reports zero pages and refuses to
/// hand out page 0.
#[test]
fn test_empty_page_tree() {
    // The metadata fixture declares `/Kids []` and `/Count 0`.
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let doc = PdfDocument::new(reader);

    assert_eq!(doc.page_count().unwrap(), 0);

    // There is no page to fetch, so the lookup must fail.
    assert!(doc.get_page(0).is_err());
}
1210
/// Text extraction on a document without pages succeeds and returns nothing.
#[test]
fn test_extract_text_empty_document() {
    let source = Cursor::new(create_pdf_with_metadata());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    let extracted = doc.extract_text().unwrap();
    assert!(extracted.is_empty());
}
1221
/// Queries version, page count and page 0 back to back on one document and
/// checks that all three results are usable together.
///
/// The calls are sequential; the test only shows that the results of the
/// earlier calls remain valid while later calls are made.
#[test]
fn test_concurrent_access() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Collect every value first (evaluated left to right), then assert.
    let (pdf_version, total_pages, first_page) = (
        doc.version().unwrap(),
        doc.page_count().unwrap(),
        doc.get_page(0).unwrap(),
    );

    assert_eq!(pdf_version, "1.4");
    assert_eq!(total_pages, 1);
    assert_eq!(first_page.media_box[2], 612.0);
}
1238
1239 // Additional comprehensive tests
1240 mod comprehensive_tests {
1241 use super::*;
1242
/// A `ResourceManager` obtained through `Default` has nothing cached for `(1, 0)`.
#[test]
fn test_resource_manager_default() {
    let manager = ResourceManager::default();
    let lookup = manager.get_cached((1, 0));

    assert!(lookup.is_none());
}
1248
/// Three different objects can live in the cache at once, and a single
/// `clear_cache` call removes all of them.
#[test]
fn test_resource_manager_multiple_objects() {
    let manager = ResourceManager::new();

    // Populate ids 1, 2 and 3 (generation 0) with objects of different kinds.
    manager.cache_object((1, 0), PdfObject::Integer(42));
    manager.cache_object((2, 0), PdfObject::Boolean(true));
    manager.cache_object((3, 0), PdfObject::String(PdfString(b"test".to_vec())));

    // Every id must be present, in ascending order as before.
    for id in 1..=3 {
        assert!(manager.get_cached((id, 0)).is_some());
    }

    // After clearing, none of them may remain.
    manager.clear_cache();
    for id in 1..=3 {
        assert!(manager.get_cached((id, 0)).is_none());
    }
}
1272
/// Caching a second object under an existing key replaces the first one.
#[test]
fn test_resource_manager_object_overwrite() {
    let manager = ResourceManager::new();
    let key = (1, 0);

    // Initial value.
    manager.cache_object(key, PdfObject::Integer(42));
    assert_eq!(manager.get_cached(key), Some(PdfObject::Integer(42)));

    // Same key, different value: the lookup must now see the replacement.
    manager.cache_object(key, PdfObject::Boolean(true));
    assert_eq!(manager.get_cached(key), Some(PdfObject::Boolean(true)));
}
1285
/// Fetching object `1 0` twice returns equal values and leaves an entry for
/// `(1, 0)` in the resource cache.
#[test]
fn test_get_object_caching() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let first_fetch = doc.get_object(1, 0).unwrap();
    let second_fetch = doc.get_object(1, 0).unwrap();

    // Both lookups must describe the same object.
    assert_eq!(first_fetch, second_fetch);

    // The object must now be present in the cache.
    let cached_entry = doc.resources.get_cached((1, 0));
    assert!(cached_entry.is_some());
}
1305
/// Asking for object 1 with generation 1 must fail, and that failure must not
/// evict the cached generation-0 entry.
#[test]
fn test_get_object_different_generations() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // Load the real object (generation 0) so it ends up in the cache.
    let _generation_zero = doc.get_object(1, 0).unwrap();

    // Generation 1 is expected not to resolve.
    let wrong_generation = doc.get_object(1, 1);
    assert!(wrong_generation.is_err());

    // The earlier entry is still cached.
    assert!(doc.resources.get_cached((1, 0)).is_some());
}
1323
/// Looking up an object number that the fixture does not define is an error.
#[test]
fn test_get_object_nonexistent() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // Object 999 is not defined by the minimal fixture.
    assert!(doc.get_object(999, 0).is_err());
}
1335
/// Resolving the reference `2 0 R` must yield the page-tree dictionary, i.e. a
/// dictionary whose `/Type` entry is the name `Pages`.
///
/// The test fails loudly when the resolved object is not a dictionary or has
/// no `/Type` name; without those branches the assertion could be skipped and
/// the test would pass without checking anything.
#[test]
fn test_resolve_nested_references() {
    let pdf_data = create_minimal_pdf();
    let cursor = Cursor::new(pdf_data);
    let reader = PdfReader::new(cursor).unwrap();
    let document = PdfDocument::new(reader);

    // Reference to object 2, generation 0.
    let ref_obj = PdfObject::Reference(2, 0);
    let resolved = document.resolve(&ref_obj).unwrap();

    // Should resolve to the pages object
    if let PdfObject::Dictionary(dict) = resolved {
        if let Some(PdfObject::Name(name)) = dict.get("Type") {
            assert_eq!(name.0, "Pages");
        } else {
            panic!("Expected /Type name");
        }
    } else {
        panic!("Expected dictionary object");
    }
}
1354
/// Direct objects of several kinds (integer, boolean, string, real, null)
/// must come back from `resolve` equal to what was passed in.
#[test]
fn test_resolve_various_object_types() {
    let pdf_data = create_minimal_pdf();
    let cursor = Cursor::new(pdf_data);
    let reader = PdfReader::new(cursor).unwrap();
    let document = PdfDocument::new(reader);

    // A fixed-size array is enough here; no heap-allocated `Vec` is needed.
    // The sample real is 2.5 rather than 3.14: the value itself is irrelevant
    // to the test, and 3.14 trips Clippy's `approx_constant` lint.
    let samples = [
        PdfObject::Integer(42),
        PdfObject::Boolean(true),
        PdfObject::String(PdfString("test".as_bytes().to_vec())),
        PdfObject::Real(2.5),
        PdfObject::Null,
    ];

    for sample in samples.iter() {
        let resolved = document.resolve(sample).unwrap();
        assert_eq!(resolved, *sample);
    }
}
1376
/// Two lookups of page 0 must agree on media box, rotation and object reference.
#[test]
fn test_get_page_cached() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let initial = doc.get_page(0).unwrap();
    let repeated = doc.get_page(0).unwrap();

    // Field-by-field comparison of the two results.
    assert_eq!(initial.media_box, repeated.media_box);
    assert_eq!(initial.rotation, repeated.rotation);
    assert_eq!(initial.obj_ref, repeated.obj_ref);
}
1395
/// Two `metadata()` calls on the same document must agree on title, author,
/// subject and version.
#[test]
fn test_metadata_caching() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let doc = PdfDocument::new(reader);

    let initial = doc.metadata().unwrap();
    let repeated = doc.metadata().unwrap();

    // Field-by-field comparison of the two results.
    assert_eq!(initial.title, repeated.title);
    assert_eq!(initial.author, repeated.author);
    assert_eq!(initial.subject, repeated.subject);
    assert_eq!(initial.version, repeated.version);
}
1415
/// The page tree slot starts out empty, and the document can still serve the
/// page count and page 0 afterwards.
#[test]
fn test_page_tree_initialization() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // Before any page-related call the slot is `None`. The borrow ends with
    // this statement, so it cannot clash with the calls below.
    let unset_at_start = doc.page_tree.borrow().is_none();
    assert!(unset_at_start);

    // The state of the slot after these calls is not asserted here; the test
    // only requires that both calls succeed.
    let _total = doc.page_count().unwrap();
    let _first_page = doc.get_page(0).unwrap();
}
1432
/// Page 0 of the minimal fixture must expose a resources entry (`Some`).
#[test]
fn test_get_page_resources() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let first_page = doc.get_page(0).unwrap();
    let page_resources = doc.get_page_resources(&first_page).unwrap();

    // Only presence is checked; the contents are not inspected.
    assert!(page_resources.is_some());
}
1446
/// Page 0 of the minimal fixture has no content streams.
#[test]
fn test_get_page_content_streams_empty() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    let first_page = doc.get_page(0).unwrap();
    let content_streams = doc.get_page_content_streams(&first_page).unwrap();

    assert!(content_streams.is_empty());
}
1460
/// Extracting text from page 0 succeeds even though the page has no content.
#[test]
fn test_extract_text_from_page() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    let outcome = doc.extract_text_from_page(0);
    assert!(outcome.is_ok());
}
1472
/// Extracting text from a page index that does not exist is an error.
#[test]
fn test_extract_text_from_page_out_of_bounds() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    let outcome = doc.extract_text_from_page(999);
    assert!(outcome.is_err());
}
1483
/// Extraction with custom layout options succeeds on the minimal fixture.
#[test]
fn test_extract_text_with_options() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Override three fields and take the defaults for everything else.
    let outcome = doc.extract_text_with_options(crate::text::ExtractionOptions {
        preserve_layout: true,
        space_threshold: 0.5,
        newline_threshold: 15.0,
        ..Default::default()
    });

    assert!(outcome.is_ok());
}
1501
/// For each header version from 1.3 to 1.7, builds a two-object PDF (catalog
/// plus empty page tree) and checks that `version()` echoes the header.
#[test]
fn test_version_different_pdf_versions() {
    for &version in &["1.3", "1.4", "1.5", "1.6", "1.7"] {
        // Header line carrying the version under test.
        let mut bytes = format!("%PDF-{}\n", version).into_bytes();

        // Object 1: catalog. Its offset is needed for the xref table.
        let catalog_offset = bytes.len();
        bytes.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // Object 2: page tree without kids.
        let pages_offset = bytes.len();
        bytes.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        // Cross-reference table: free entry 0 plus the two objects above.
        let xref_offset = bytes.len();
        bytes.extend_from_slice(b"xref\n0 3\n0000000000 65535 f \n");
        for offset in &[catalog_offset, pages_offset] {
            bytes.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
        }

        // Trailer, pointer to the xref table and end-of-file marker.
        bytes.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        bytes.extend_from_slice(format!("startxref\n{}\n", xref_offset).as_bytes());
        bytes.extend_from_slice(b"%%EOF\n");

        let reader = PdfReader::new(Cursor::new(bytes)).unwrap();
        let doc = PdfDocument::new(reader);

        let reported = doc.version().unwrap();
        assert_eq!(reported, version);
    }
}
1548
/// The metadata fixture declares `/Count 0`, so the page count must be zero.
#[test]
fn test_page_count_zero() {
    let source = Cursor::new(create_pdf_with_metadata());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    assert_eq!(doc.page_count().unwrap(), 0);
}
1559
/// Objects 1, 2 and 3 of the minimal fixture can all be loaded and are
/// pairwise different.
#[test]
fn test_multiple_object_access() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Load all three before comparing anything.
    let first = doc.get_object(1, 0).unwrap();
    let second = doc.get_object(2, 0).unwrap();
    let third = doc.get_object(3, 0).unwrap();

    // No two of them may compare equal.
    assert_ne!(first, second);
    assert_ne!(second, third);
    assert_ne!(first, third);
}
1577
/// Resolving a reference to an object the fixture does not define is an error.
#[test]
fn test_error_handling_invalid_object_reference() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    // Reference to object 999, which the minimal fixture does not define.
    let dangling = PdfObject::Reference(999, 0);
    let outcome = doc.resolve(&dangling);

    assert!(outcome.is_err());
}
1590
/// Reads metadata, version and page count back to back from the metadata
/// fixture and checks all three.
///
/// The calls are sequential; the test shows that the metadata obtained first
/// is still usable after the other two calls.
#[test]
fn test_concurrent_metadata_access() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let doc = PdfDocument::new(reader);

    // Collect every value first (evaluated left to right), then assert.
    let (info, pdf_version, total_pages) = (
        doc.metadata().unwrap(),
        doc.version().unwrap(),
        doc.page_count().unwrap(),
    );

    assert_eq!(info.title, Some(String::from("Test Document")));
    assert_eq!(pdf_version, "1.5");
    assert_eq!(total_pages, 0);
}
1607
/// Checks every property of page 0 of the minimal fixture that this test
/// module relies on: boxes, rotation, object reference and derived size.
#[test]
fn test_page_properties_comprehensive() {
    let source = Cursor::new(create_minimal_pdf());
    let doc = PdfDocument::new(PdfReader::new(source).unwrap());

    let first_page = doc.get_page(0).unwrap();

    // Stored fields.
    assert_eq!(first_page.media_box, [0.0, 0.0, 612.0, 792.0]);
    assert_eq!(first_page.crop_box, None);
    assert_eq!(first_page.rotation, 0);
    assert_eq!(first_page.obj_ref, (3, 0));

    // Width and height as reported by the accessor methods.
    assert_eq!(first_page.width(), 612.0);
    assert_eq!(first_page.height(), 792.0);
}
1627
/// Fetches page 0 ten times and then checks that the document still reports
/// a single page.
///
/// Note: only the page count is asserted; the size of any internal cache is
/// not inspected by this test.
#[test]
fn test_memory_usage_efficiency() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Repeated lookups of the same page must all succeed.
    (0..10).for_each(|_| {
        let _fetched = doc.get_page(0).unwrap();
    });

    let total_pages = doc.page_count().unwrap();
    assert_eq!(total_pages, 1);
}
1644
/// Calls `version`, `page_count` and `metadata` one after another on the same
/// document; none of them may fail, and the minimal fixture has no title.
#[test]
fn test_reader_borrow_safety() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // Each call must succeed while earlier results are still held.
    let pdf_version = doc.version().unwrap();
    let total_pages = doc.page_count().unwrap();
    let info = doc.metadata().unwrap();

    assert_eq!(pdf_version, "1.4");
    assert_eq!(total_pages, 1);
    assert!(info.title.is_none());
}
1661
/// The cached copy of object `1 0` equals the value returned by `get_object`,
/// and loading the object again after `clear_cache` yields equal content.
#[test]
fn test_cache_consistency() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let doc = PdfDocument::new(reader);

    // First load, then compare against what the cache holds for the same key.
    let before_clear = doc.get_object(1, 0).unwrap();
    let cached_copy = doc.resources.get_cached((1, 0)).unwrap();
    assert_eq!(before_clear, cached_copy);

    // Empty the cache and load the object once more.
    doc.resources.clear_cache();
    let after_clear = doc.get_object(1, 0).unwrap();

    // Content must be unchanged by the reload.
    assert_eq!(before_clear, after_clear);
}
1682 }
1683}