oxidize_pdf/parser/document.rs
1//! PDF Document wrapper - High-level interface for PDF parsing and manipulation
2//!
3//! This module provides a robust, high-level interface for working with PDF documents.
4//! It solves Rust's borrow checker challenges through careful use of interior mutability
5//! (RefCell) and separation of concerns between parsing, caching, and page access.
6//!
7//! # Architecture
8//!
9//! The module uses a layered architecture:
10//! - **PdfDocument**: Main entry point with RefCell-based state management
11//! - **ResourceManager**: Centralized object caching with interior mutability
12//! - **PdfReader**: Low-level file access (wrapped in RefCell)
13//! - **PageTree**: Lazy-loaded page navigation
14//!
15//! # Key Features
16//!
17//! - **Automatic caching**: Objects are cached after first access
18//! - **Resource management**: Shared resources are handled efficiently
19//! - **Page navigation**: Fast access to any page in the document
20//! - **Reference resolution**: Automatic resolution of indirect references
21//! - **Text extraction**: Built-in support for extracting text from pages
22//!
23//! # Example
24//!
25//! ```rust,no_run
26//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
27//!
28//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
29//! // Open a PDF document
30//! let reader = PdfReader::open("document.pdf")?;
31//! let document = PdfDocument::new(reader);
32//!
33//! // Get document information
34//! let page_count = document.page_count()?;
35//! let metadata = document.metadata()?;
36//! println!("Title: {:?}", metadata.title);
37//! println!("Pages: {}", page_count);
38//!
39//! // Access a specific page
40//! let page = document.get_page(0)?;
41//! println!("Page size: {}x{}", page.width(), page.height());
42//!
43//! // Extract text from all pages
44//! let extracted_text = document.extract_text()?;
45//! for (i, page_text) in extracted_text.iter().enumerate() {
46//! println!("Page {}: {}", i + 1, page_text.text);
47//! }
48//! # Ok(())
49//! # }
50//! ```
51
52#[cfg(test)]
53use super::objects::{PdfArray, PdfName};
54use super::objects::{PdfDictionary, PdfObject};
55use super::page_tree::{PageTree, ParsedPage};
56use super::reader::PdfReader;
57use super::{ParseError, ParseOptions, ParseResult};
58use std::cell::RefCell;
59use std::collections::HashMap;
60use std::fs::File;
61use std::io::{Read, Seek};
62use std::path::Path;
63use std::rc::Rc;
64
/// Resource manager for efficient PDF object caching.
///
/// The ResourceManager provides centralized caching of PDF objects to avoid
/// repeated parsing and to share resources between different parts of the document.
/// It uses RefCell for interior mutability, allowing multiple immutable references
/// to the document while still being able to update the cache.
///
/// Because the cache lives in a `RefCell`, every method takes `&self`; the
/// manager is therefore not `Sync` and is meant for single-threaded use.
///
/// # Caching Strategy
///
/// - Objects are cached on first access
/// - Cached entries stay in place until `clear_cache` is called or the manager is dropped
/// - Manual cache clearing is supported for memory management
///
/// # Example
///
/// ```rust,no_run
/// use oxidize_pdf::parser::document::ResourceManager;
///
/// let resources = ResourceManager::new();
///
/// // Objects are cached automatically when accessed through PdfDocument
/// // Manual cache management:
/// resources.clear_cache(); // Free memory when needed
/// ```
pub struct ResourceManager {
    /// Cached objects indexed by (object_number, generation_number).
    ///
    /// Values are stored by value; lookups hand out clones.
    object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
}
93
94impl Default for ResourceManager {
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100impl ResourceManager {
101 /// Create a new resource manager
102 pub fn new() -> Self {
103 Self {
104 object_cache: RefCell::new(HashMap::new()),
105 }
106 }
107
108 /// Get an object from cache if available.
109 ///
110 /// # Arguments
111 ///
112 /// * `obj_ref` - Object reference (object_number, generation_number)
113 ///
114 /// # Returns
115 ///
116 /// Cloned object if cached, None otherwise.
117 ///
118 /// # Example
119 ///
120 /// ```rust,no_run
121 /// # use oxidize_pdf::parser::document::ResourceManager;
122 /// # let resources = ResourceManager::new();
123 /// if let Some(obj) = resources.get_cached((10, 0)) {
124 /// println!("Object 10 0 R found in cache");
125 /// }
126 /// ```
127 pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
128 self.object_cache.borrow().get(&obj_ref).cloned()
129 }
130
131 /// Cache an object for future access.
132 ///
133 /// # Arguments
134 ///
135 /// * `obj_ref` - Object reference (object_number, generation_number)
136 /// * `obj` - The PDF object to cache
137 ///
138 /// # Example
139 ///
140 /// ```rust,no_run
141 /// # use oxidize_pdf::parser::document::ResourceManager;
142 /// # use oxidize_pdf::parser::objects::PdfObject;
143 /// # let resources = ResourceManager::new();
144 /// resources.cache_object((10, 0), PdfObject::Integer(42));
145 /// ```
146 pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
147 self.object_cache.borrow_mut().insert(obj_ref, obj);
148 }
149
150 /// Clear all cached objects to free memory.
151 ///
152 /// Use this when processing large documents to manage memory usage.
153 ///
154 /// # Example
155 ///
156 /// ```rust,no_run
157 /// # use oxidize_pdf::parser::document::ResourceManager;
158 /// # let resources = ResourceManager::new();
159 /// // After processing many pages
160 /// resources.clear_cache();
161 /// println!("Cache cleared to free memory");
162 /// ```
163 pub fn clear_cache(&self) {
164 self.object_cache.borrow_mut().clear();
165 }
166}
167
/// High-level PDF document interface for parsing and manipulation.
///
/// `PdfDocument` provides a clean, safe API for working with PDF files.
/// It handles the complexity of PDF structure, object references, and resource
/// management behind a simple interface.
///
/// # Type Parameter
///
/// * `R` - The reader type (must implement Read + Seek)
///
/// # Architecture Benefits
///
/// - **RefCell Usage**: Allows multiple parts of the API to access the document
///   through `&self` while the reader and caches are still updated internally
/// - **Lazy Loading**: Pages and resources are loaded on demand
/// - **Automatic Caching**: Objects, pages and metadata are cached after first access
/// - **Safe API**: Borrow checker issues are handled internally
///
/// # Example
///
/// ```rust,no_run
/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
/// use std::fs::File;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// // From a file
/// let reader = PdfReader::open("document.pdf")?;
/// let document = PdfDocument::new(reader);
///
/// // From any Read + Seek source
/// let file = File::open("document.pdf")?;
/// let reader = PdfReader::new(file)?;
/// let document = PdfDocument::new(reader);
///
/// // Use the document
/// let page_count = document.page_count()?;
/// for i in 0..page_count {
///     let page = document.get_page(i)?;
///     // Process page...
/// }
/// # Ok(())
/// # }
/// ```
pub struct PdfDocument<R: Read + Seek> {
    /// The underlying PDF reader wrapped for interior mutability
    /// (reading objects needs `&mut PdfReader` while the API exposes `&self`)
    reader: RefCell<PdfReader<R>>,
    /// Page tree navigator (lazily initialized; `None` until first page access)
    page_tree: RefCell<Option<PageTree>>,
    /// Shared resource manager for object caching
    resources: Rc<ResourceManager>,
    /// Cached document metadata to avoid repeated parsing (`None` until first requested)
    metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
}
220
221impl<R: Read + Seek> PdfDocument<R> {
222 /// Create a new PDF document from a reader
223 pub fn new(reader: PdfReader<R>) -> Self {
224 Self {
225 reader: RefCell::new(reader),
226 page_tree: RefCell::new(None),
227 resources: Rc::new(ResourceManager::new()),
228 metadata_cache: RefCell::new(None),
229 }
230 }
231
232 /// Get the PDF version of the document.
233 ///
234 /// # Returns
235 ///
236 /// PDF version string (e.g., "1.4", "1.7", "2.0")
237 ///
238 /// # Example
239 ///
240 /// ```rust,no_run
241 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
242 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
243 /// # let reader = PdfReader::open("document.pdf")?;
244 /// # let document = PdfDocument::new(reader);
245 /// let version = document.version()?;
246 /// println!("PDF version: {}", version);
247 /// # Ok(())
248 /// # }
249 /// ```
250 pub fn version(&self) -> ParseResult<String> {
251 Ok(self.reader.borrow().version().to_string())
252 }
253
254 /// Get the parse options
255 pub fn options(&self) -> ParseOptions {
256 self.reader.borrow().options().clone()
257 }
258
259 /// Get the total number of pages in the document.
260 ///
261 /// # Returns
262 ///
263 /// The page count as an unsigned 32-bit integer.
264 ///
265 /// # Errors
266 ///
267 /// Returns an error if the page tree is malformed or missing.
268 ///
269 /// # Example
270 ///
271 /// ```rust,no_run
272 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
273 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
274 /// # let reader = PdfReader::open("document.pdf")?;
275 /// # let document = PdfDocument::new(reader);
276 /// let count = document.page_count()?;
277 /// println!("Document has {} pages", count);
278 ///
279 /// // Iterate through all pages
280 /// for i in 0..count {
281 /// let page = document.get_page(i)?;
282 /// // Process page...
283 /// }
284 /// # Ok(())
285 /// # }
286 /// ```
287 pub fn page_count(&self) -> ParseResult<u32> {
288 self.ensure_page_tree()?;
289 if let Some(pt) = self.page_tree.borrow().as_ref() {
290 Ok(pt.page_count())
291 } else {
292 // Fallback: should never reach here since ensure_page_tree() just ran
293 self.reader.borrow_mut().page_count()
294 }
295 }
296
297 /// Get document metadata including title, author, creation date, etc.
298 ///
299 /// Metadata is cached after first access for performance.
300 ///
301 /// # Returns
302 ///
303 /// A `DocumentMetadata` struct containing all available metadata fields.
304 ///
305 /// # Example
306 ///
307 /// ```rust,no_run
308 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
309 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
310 /// # let reader = PdfReader::open("document.pdf")?;
311 /// # let document = PdfDocument::new(reader);
312 /// let metadata = document.metadata()?;
313 ///
314 /// if let Some(title) = &metadata.title {
315 /// println!("Title: {}", title);
316 /// }
317 /// if let Some(author) = &metadata.author {
318 /// println!("Author: {}", author);
319 /// }
320 /// if let Some(creation_date) = &metadata.creation_date {
321 /// println!("Created: {}", creation_date);
322 /// }
323 /// println!("PDF Version: {}", metadata.version);
324 /// # Ok(())
325 /// # }
326 /// ```
327 pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
328 // Check cache first
329 if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
330 return Ok(metadata.clone());
331 }
332
333 // Load metadata
334 let metadata = self.reader.borrow_mut().metadata()?;
335 self.metadata_cache.borrow_mut().replace(metadata.clone());
336 Ok(metadata)
337 }
338
339 /// Initialize the page tree if not already done.
340 ///
341 /// Builds a flat index of all leaf Page references by walking the tree once.
342 /// This provides O(1) page access and detects cycles and absurd /Count values.
343 fn ensure_page_tree(&self) -> ParseResult<()> {
344 if self.page_tree.borrow().is_none() {
345 let pages_dict = self.load_pages_dict()?;
346 let page_refs = {
347 let mut reader = self.reader.borrow_mut();
348 PageTree::flatten_page_tree(&mut *reader, &pages_dict)?
349 };
350 let page_tree = PageTree::new_with_flat_index(pages_dict, page_refs);
351 self.page_tree.borrow_mut().replace(page_tree);
352 }
353 Ok(())
354 }
355
356 /// Load the pages dictionary
357 fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
358 let mut reader = self.reader.borrow_mut();
359 let pages = reader.pages()?;
360 Ok(pages.clone())
361 }
362
363 /// Get a page by index (0-based).
364 ///
365 /// Pages are cached after first access. This method handles page tree
366 /// traversal and property inheritance automatically.
367 ///
368 /// # Arguments
369 ///
370 /// * `index` - Zero-based page index (0 to page_count-1)
371 ///
372 /// # Returns
373 ///
374 /// A complete `ParsedPage` with all properties and inherited resources.
375 ///
376 /// # Errors
377 ///
378 /// Returns an error if:
379 /// - Index is out of bounds
380 /// - Page tree is malformed
381 /// - Required page properties are missing
382 ///
383 /// # Example
384 ///
385 /// ```rust,no_run
386 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
387 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
388 /// # let reader = PdfReader::open("document.pdf")?;
389 /// # let document = PdfDocument::new(reader);
390 /// // Get the first page
391 /// let page = document.get_page(0)?;
392 ///
393 /// // Access page properties
394 /// println!("Page size: {}x{} points", page.width(), page.height());
395 /// println!("Rotation: {}°", page.rotation);
396 ///
397 /// // Get content streams
398 /// let streams = page.content_streams_with_document(&document)?;
399 /// println!("Page has {} content streams", streams.len());
400 /// # Ok(())
401 /// # }
402 /// ```
403 pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
404 self.ensure_page_tree()?;
405
406 // First check if page is already cached
407 if let Some(page_tree) = self.page_tree.borrow().as_ref() {
408 if let Some(page) = page_tree.get_cached_page(index) {
409 return Ok(page.clone());
410 }
411 }
412
413 // Try flat index O(1) lookup first
414 let (page_ref, has_flat_index) = {
415 let pt_borrow = self.page_tree.borrow();
416 let pt = pt_borrow.as_ref();
417 let ref_val = pt.and_then(|pt| pt.get_page_ref(index));
418 let has_index = pt.map_or(false, |pt| pt.page_count() > 0 || ref_val.is_some());
419 (ref_val, has_index)
420 };
421
422 let page = if let Some(page_ref) = page_ref {
423 self.load_page_by_ref(page_ref)?
424 } else if has_flat_index {
425 // Flat index exists but page not found — index is out of range
426 return Err(ParseError::SyntaxError {
427 position: 0,
428 message: format!(
429 "Page index {} out of range (document has {} pages)",
430 index,
431 self.page_tree
432 .borrow()
433 .as_ref()
434 .map_or(0, |pt| pt.page_count())
435 ),
436 });
437 } else {
438 // No flat index available — fallback to tree traversal
439 self.load_page_at_index(index)?
440 };
441
442 // Cache it
443 if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
444 page_tree.cache_page(index, page.clone());
445 }
446
447 Ok(page)
448 }
449
450 /// Load a specific page by index (legacy tree traversal fallback)
451 fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
452 // Get the pages root
453 let pages_dict = self.load_pages_dict()?;
454
455 // Navigate to the specific page
456 let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
457
458 Ok(page_info)
459 }
460
461 /// Load a page directly by its object reference (O(1) via flat index).
462 fn load_page_by_ref(&self, page_ref: (u32, u16)) -> ParseResult<ParsedPage> {
463 let obj = self.get_object(page_ref.0, page_ref.1)?;
464 let dict = obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
465 position: 0,
466 message: format!(
467 "Page object {} {} R is not a dictionary",
468 page_ref.0, page_ref.1
469 ),
470 })?;
471
472 let inherited = self.collect_inherited_attributes(dict);
473 self.create_parsed_page(page_ref, dict, Some(&inherited))
474 }
475
476 /// Walk up the /Parent chain to collect inheritable attributes (Resources, MediaBox, CropBox, Rotate).
477 /// Uses cycle detection to prevent infinite loops in malformed PDFs.
478 fn collect_inherited_attributes(&self, page_dict: &PdfDictionary) -> PdfDictionary {
479 let mut inherited = PdfDictionary::new();
480 let inheritable_keys = ["Resources", "MediaBox", "CropBox", "Rotate"];
481
482 // Collect from the page's own parent chain
483 let mut current_parent_ref = page_dict.get("Parent").and_then(|p| p.as_reference());
484 let mut visited: std::collections::HashSet<(u32, u16)> = std::collections::HashSet::new();
485
486 while let Some(parent_ref) = current_parent_ref {
487 if !visited.insert(parent_ref) {
488 break; // Cycle detected
489 }
490
491 match self.get_object(parent_ref.0, parent_ref.1) {
492 Ok(obj) => {
493 if let Some(parent_dict) = obj.as_dict() {
494 for key in &inheritable_keys {
495 // Only inherit if the page itself doesn't have it
496 // and we haven't already found it in a closer ancestor
497 if !page_dict.contains_key(key) && !inherited.contains_key(key) {
498 if let Some(val) = parent_dict.get(key) {
499 inherited.insert((*key).to_string(), val.clone());
500 }
501 }
502 }
503 current_parent_ref =
504 parent_dict.get("Parent").and_then(|p| p.as_reference());
505 } else {
506 break;
507 }
508 }
509 Err(_) => break,
510 }
511 }
512
513 inherited
514 }
515
516 /// Find a page in the page tree (iterative implementation for stack safety)
517 fn find_page_in_tree(
518 &self,
519 root_node: &PdfDictionary,
520 target_index: u32,
521 initial_current_index: u32,
522 initial_inherited: Option<&PdfDictionary>,
523 ) -> ParseResult<ParsedPage> {
524 // Work item for the traversal queue
525 #[derive(Debug)]
526 struct WorkItem {
527 node_dict: PdfDictionary,
528 node_ref: Option<(u32, u16)>,
529 current_index: u32,
530 inherited: Option<PdfDictionary>,
531 }
532
533 // Initialize work queue with root node
534 let mut work_queue = Vec::new();
535 work_queue.push(WorkItem {
536 node_dict: root_node.clone(),
537 node_ref: None,
538 current_index: initial_current_index,
539 inherited: initial_inherited.cloned(),
540 });
541
542 // Iterative traversal
543 while let Some(work_item) = work_queue.pop() {
544 let WorkItem {
545 node_dict,
546 node_ref,
547 current_index,
548 inherited,
549 } = work_item;
550
551 let node_type = node_dict
552 .get_type()
553 .or_else(|| {
554 // If Type is missing, try to infer from content
555 if node_dict.contains_key("Kids") && node_dict.contains_key("Count") {
556 Some("Pages")
557 } else if node_dict.contains_key("Contents")
558 || node_dict.contains_key("MediaBox")
559 {
560 Some("Page")
561 } else {
562 None
563 }
564 })
565 .or_else(|| {
566 // If Type is missing, try to infer from structure
567 if node_dict.contains_key("Kids") {
568 Some("Pages")
569 } else if node_dict.contains_key("Contents")
570 || (node_dict.contains_key("MediaBox") && !node_dict.contains_key("Kids"))
571 {
572 Some("Page")
573 } else {
574 None
575 }
576 })
577 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
578
579 match node_type {
580 "Pages" => {
581 // This is a page tree node
582 let kids = node_dict
583 .get("Kids")
584 .and_then(|obj| obj.as_array())
585 .or_else(|| {
586 // If Kids is missing, use empty array
587 tracing::debug!(
588 "Warning: Missing Kids array in Pages node, using empty array"
589 );
590 Some(&super::objects::EMPTY_PDF_ARRAY)
591 })
592 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
593
594 // Merge inherited attributes
595 let mut merged_inherited = inherited.unwrap_or_else(PdfDictionary::new);
596
597 // Inheritable attributes
598 for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
599 if let Some(value) = node_dict.get(key) {
600 if !merged_inherited.contains_key(key) {
601 merged_inherited.insert(key.to_string(), value.clone());
602 }
603 }
604 }
605
606 // Process kids in reverse order (since we're using a stack/Vec::pop())
607 // This ensures we process them in the correct order
608 let mut current_idx = current_index;
609 let mut pending_kids = Vec::new();
610
611 for kid_ref in &kids.0 {
612 let kid_ref =
613 kid_ref
614 .as_reference()
615 .ok_or_else(|| ParseError::SyntaxError {
616 position: 0,
617 message: "Kids array must contain references".to_string(),
618 })?;
619
620 // Get the kid object
621 let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
622 let kid_dict = match kid_obj.as_dict() {
623 Some(dict) => dict,
624 None => {
625 // Skip invalid page tree nodes in lenient mode
626 tracing::debug!(
627 "Warning: Page tree node {} {} R is not a dictionary, skipping",
628 kid_ref.0,
629 kid_ref.1
630 );
631 current_idx += 1; // Count as processed but skip
632 continue;
633 }
634 };
635
636 let kid_type = kid_dict
637 .get_type()
638 .or_else(|| {
639 // If Type is missing, try to infer from content
640 if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
641 Some("Pages")
642 } else if kid_dict.contains_key("Contents")
643 || kid_dict.contains_key("MediaBox")
644 {
645 Some("Page")
646 } else {
647 None
648 }
649 })
650 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
651
652 let count = if kid_type == "Pages" {
653 kid_dict
654 .get("Count")
655 .and_then(|obj| obj.as_integer())
656 .unwrap_or(1) // Fallback to 1 if Count is missing (defensive)
657 as u32
658 } else {
659 1
660 };
661
662 if target_index < current_idx + count {
663 // Found the right subtree/page
664 if kid_type == "Page" {
665 // This is the page we want
666 return self.create_parsed_page(
667 kid_ref,
668 kid_dict,
669 Some(&merged_inherited),
670 );
671 } else {
672 // Need to traverse this subtree - add to queue
673 pending_kids.push(WorkItem {
674 node_dict: kid_dict.clone(),
675 node_ref: Some(kid_ref),
676 current_index: current_idx,
677 inherited: Some(merged_inherited.clone()),
678 });
679 break; // Found our target subtree, no need to continue
680 }
681 }
682
683 current_idx += count;
684 }
685
686 // Add pending kids to work queue in reverse order for correct processing
687 work_queue.extend(pending_kids.into_iter().rev());
688 }
689 "Page" => {
690 // This is a page object
691 if target_index != current_index {
692 return Err(ParseError::SyntaxError {
693 position: 0,
694 message: "Page index mismatch".to_string(),
695 });
696 }
697
698 // We need the reference for creating the parsed page
699 if let Some(page_ref) = node_ref {
700 return self.create_parsed_page(page_ref, &node_dict, inherited.as_ref());
701 } else {
702 return Err(ParseError::SyntaxError {
703 position: 0,
704 message: "Direct page object without reference".to_string(),
705 });
706 }
707 }
708 _ => {
709 return Err(ParseError::SyntaxError {
710 position: 0,
711 message: format!("Invalid page tree node type: {node_type}"),
712 });
713 }
714 }
715 }
716
717 // Try fallback: search for the page by direct object scanning
718 tracing::debug!(
719 "Warning: Page {} not found in tree, attempting direct lookup",
720 target_index
721 );
722
723 // Scan for Page objects directly (try first few hundred objects)
724 for obj_num in 1..500 {
725 if let Ok(obj) = self.reader.borrow_mut().get_object(obj_num, 0) {
726 if let Some(dict) = obj.as_dict() {
727 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
728 if obj_type.0 == "Page" {
729 // Found a page, check if it's the right index (approximate)
730 return self.create_parsed_page((obj_num, 0), dict, None);
731 }
732 }
733 }
734 }
735 }
736
737 Err(ParseError::SyntaxError {
738 position: 0,
739 message: format!("Page {} not found in tree or document", target_index),
740 })
741 }
742
743 /// Create a ParsedPage from a page dictionary
744 fn create_parsed_page(
745 &self,
746 obj_ref: (u32, u16),
747 page_dict: &PdfDictionary,
748 inherited: Option<&PdfDictionary>,
749 ) -> ParseResult<ParsedPage> {
750 // Extract page attributes with fallback for missing MediaBox
751 let media_box = match self.get_rectangle(page_dict, inherited, "MediaBox")? {
752 Some(mb) => mb,
753 None => {
754 // Use default Letter size if MediaBox is missing
755 #[cfg(debug_assertions)]
756 tracing::debug!(
757 "Warning: Page {} {} R missing MediaBox, using default Letter size",
758 obj_ref.0,
759 obj_ref.1
760 );
761 [0.0, 0.0, 612.0, 792.0]
762 }
763 };
764
765 let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
766
767 let rotation = self
768 .get_integer(page_dict, inherited, "Rotate")?
769 .unwrap_or(0) as i32;
770
771 // Resolve the effective /Resources into an owned dictionary so that
772 // `ParsedPage::get_resources()` always yields a dictionary, even when
773 // /Resources is given as an indirect reference (issue #286). The page's
774 // own /Resources takes precedence over inherited ones; when it is an
775 // inline dictionary `get_resources()` returns it directly from the page
776 // dict, so we only need a resolved fallback for the reference / inherited
777 // cases.
778 let inherited_resources = {
779 let own_is_inline_dict = page_dict
780 .get("Resources")
781 .map(|o| o.as_dict().is_some())
782 .unwrap_or(false);
783 if own_is_inline_dict {
784 None
785 } else {
786 page_dict
787 .get("Resources")
788 .or_else(|| inherited.and_then(|i| i.get("Resources")))
789 .and_then(|r| self.resolve(r).ok())
790 .and_then(|r| r.as_dict().cloned())
791 }
792 };
793
794 // Get annotations if present
795 let annotations = page_dict
796 .get("Annots")
797 .and_then(|obj| obj.as_array())
798 .cloned();
799
800 Ok(ParsedPage {
801 obj_ref,
802 dict: page_dict.clone(),
803 inherited_resources,
804 media_box,
805 crop_box,
806 rotation,
807 annotations,
808 })
809 }
810
811 /// Get a rectangle value
812 fn get_rectangle(
813 &self,
814 node: &PdfDictionary,
815 inherited: Option<&PdfDictionary>,
816 key: &str,
817 ) -> ParseResult<Option<[f64; 4]>> {
818 let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
819
820 if let Some(array) = array.and_then(|obj| obj.as_array()) {
821 if array.len() != 4 {
822 return Err(ParseError::SyntaxError {
823 position: 0,
824 message: format!("{key} must have 4 elements"),
825 });
826 }
827
828 // After length check, we know array has exactly 4 elements
829 // Safe to index directly without unwrap
830 let rect = [
831 array.0[0].as_real().unwrap_or(0.0),
832 array.0[1].as_real().unwrap_or(0.0),
833 array.0[2].as_real().unwrap_or(0.0),
834 array.0[3].as_real().unwrap_or(0.0),
835 ];
836
837 Ok(Some(rect))
838 } else {
839 Ok(None)
840 }
841 }
842
843 /// Get an integer value
844 fn get_integer(
845 &self,
846 node: &PdfDictionary,
847 inherited: Option<&PdfDictionary>,
848 key: &str,
849 ) -> ParseResult<Option<i64>> {
850 let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
851
852 Ok(value.and_then(|obj| obj.as_integer()))
853 }
854
855 /// Get an object by its reference numbers.
856 ///
857 /// This method first checks the cache, then loads from the file if needed.
858 /// Objects are automatically cached after loading.
859 ///
860 /// # Arguments
861 ///
862 /// * `obj_num` - Object number
863 /// * `gen_num` - Generation number
864 ///
865 /// # Returns
866 ///
867 /// The resolved PDF object.
868 ///
869 /// # Errors
870 ///
871 /// Returns an error if:
872 /// - Object doesn't exist
873 /// - Object is part of an encrypted object stream
874 /// - File is corrupted
875 ///
876 /// # Example
877 ///
878 /// ```rust,no_run
879 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
880 /// # use oxidize_pdf::parser::objects::PdfObject;
881 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
882 /// # let reader = PdfReader::open("document.pdf")?;
883 /// # let document = PdfDocument::new(reader);
884 /// // Get object 10 0 R
885 /// let obj = document.get_object(10, 0)?;
886 ///
887 /// // Check object type
888 /// match obj {
889 /// PdfObject::Dictionary(dict) => {
890 /// println!("Object is a dictionary with {} entries", dict.0.len());
891 /// }
892 /// PdfObject::Stream(stream) => {
893 /// println!("Object is a stream");
894 /// }
895 /// _ => {}
896 /// }
897 /// # Ok(())
898 /// # }
899 /// ```
900 pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
901 // Check resource cache first
902 if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
903 return Ok(obj);
904 }
905
906 // Load from reader
907 let obj = {
908 let mut reader = self.reader.borrow_mut();
909 reader.get_object(obj_num, gen_num)?.clone()
910 };
911
912 // Cache it
913 self.resources.cache_object((obj_num, gen_num), obj.clone());
914
915 Ok(obj)
916 }
917
918 /// Resolve a reference to get the actual object.
919 ///
920 /// If the input is a Reference, fetches the referenced object.
921 /// Otherwise returns a clone of the input object.
922 ///
923 /// # Arguments
924 ///
925 /// * `obj` - The object to resolve (may be a Reference or direct object)
926 ///
927 /// # Returns
928 ///
929 /// The resolved object (never a Reference).
930 ///
931 /// # Example
932 ///
933 /// ```rust,no_run
934 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
935 /// # use oxidize_pdf::parser::objects::PdfObject;
936 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
937 /// # let reader = PdfReader::open("document.pdf")?;
938 /// # let document = PdfDocument::new(reader);
939 /// # let page = document.get_page(0)?;
940 /// // Contents might be a reference or direct object
941 /// if let Some(contents) = page.dict.get("Contents") {
942 /// let resolved = document.resolve(contents)?;
943 /// match resolved {
944 /// PdfObject::Stream(_) => println!("Single content stream"),
945 /// PdfObject::Array(_) => println!("Multiple content streams"),
946 /// _ => println!("Unexpected content type"),
947 /// }
948 /// }
949 /// # Ok(())
950 /// # }
951 /// ```
952 pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
953 match obj {
954 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
955 _ => Ok(obj.clone()),
956 }
957 }
958
959 /// Get content streams for a specific page.
960 ///
961 /// This method handles both single streams and arrays of streams,
962 /// automatically decompressing them according to their filters.
963 ///
964 /// # Arguments
965 ///
966 /// * `page` - The page to get content streams from
967 ///
968 /// # Returns
969 ///
970 /// Vector of decompressed content stream data ready for parsing.
971 ///
972 /// # Example
973 ///
974 /// ```rust,no_run
975 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
976 /// # use oxidize_pdf::parser::content::ContentParser;
977 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
978 /// # let reader = PdfReader::open("document.pdf")?;
979 /// # let document = PdfDocument::new(reader);
980 /// let page = document.get_page(0)?;
981 /// let streams = document.get_page_content_streams(&page)?;
982 ///
983 /// // Parse content streams
984 /// for stream_data in streams {
985 /// let operations = ContentParser::parse(&stream_data)?;
986 /// println!("Stream has {} operations", operations.len());
987 /// }
988 /// # Ok(())
989 /// # }
990 /// ```
991 /// Get page resources dictionary.
992 ///
993 /// This method returns the resources dictionary for a page, which may include
994 /// fonts, images (XObjects), patterns, color spaces, and other resources.
995 ///
996 /// # Arguments
997 ///
998 /// * `page` - The page to get resources from
999 ///
1000 /// # Returns
1001 ///
1002 /// Optional resources dictionary if the page has resources.
1003 ///
1004 /// # Example
1005 ///
1006 /// ```rust,no_run
1007 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader, PdfObject, PdfName};
1008 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1009 /// # let reader = PdfReader::open("document.pdf")?;
1010 /// # let document = PdfDocument::new(reader);
1011 /// let page = document.get_page(0)?;
1012 /// if let Some(resources) = document.get_page_resources(&page)? {
1013 /// // Check for images (XObjects)
1014 /// if let Some(PdfObject::Dictionary(xobjects)) = resources.0.get(&PdfName("XObject".to_string())) {
1015 /// for (name, _) in xobjects.0.iter() {
1016 /// println!("Found XObject: {}", name.0);
1017 /// }
1018 /// }
1019 /// }
1020 /// # Ok(())
1021 /// # }
1022 /// ```
1023 pub fn get_page_resources<'a>(
1024 &self,
1025 page: &'a ParsedPage,
1026 ) -> ParseResult<Option<&'a PdfDictionary>> {
1027 Ok(page.get_resources())
1028 }
1029
1030 pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
1031 let mut streams = Vec::new();
1032 let options = self.options();
1033
1034 if let Some(contents) = page.dict.get("Contents") {
1035 let resolved_contents = self.resolve(contents)?;
1036
1037 match &resolved_contents {
1038 PdfObject::Stream(stream) => {
1039 streams.push(stream.decode(&options)?);
1040 }
1041 PdfObject::Array(array) => {
1042 for item in &array.0 {
1043 let resolved = self.resolve(item)?;
1044 if let PdfObject::Stream(stream) = resolved {
1045 streams.push(stream.decode(&options)?);
1046 }
1047 }
1048 }
1049 _ => {
1050 return Err(ParseError::SyntaxError {
1051 position: 0,
1052 message: "Contents must be a stream or array of streams".to_string(),
1053 })
1054 }
1055 }
1056 }
1057
1058 Ok(streams)
1059 }
1060
1061 /// Extract text from all pages in the document.
1062 ///
1063 /// Uses the default text extraction settings. For custom settings,
1064 /// use `extract_text_with_options`.
1065 ///
1066 /// # Returns
1067 ///
1068 /// A vector of `ExtractedText`, one for each page in the document.
1069 ///
1070 /// # Example
1071 ///
1072 /// ```rust,no_run
1073 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1074 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1075 /// # let reader = PdfReader::open("document.pdf")?;
1076 /// # let document = PdfDocument::new(reader);
1077 /// let extracted_pages = document.extract_text()?;
1078 ///
1079 /// for (page_num, page_text) in extracted_pages.iter().enumerate() {
1080 /// println!("=== Page {} ===", page_num + 1);
1081 /// println!("{}", page_text.text);
1082 /// println!();
1083 /// }
1084 /// # Ok(())
1085 /// # }
1086 /// ```
1087 pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
1088 let mut extractor = crate::text::TextExtractor::new();
1089 extractor.extract_from_document(self)
1090 }
1091
1092 /// Extract text from a specific page.
1093 ///
1094 /// # Arguments
1095 ///
1096 /// * `page_index` - Zero-based page index
1097 ///
1098 /// # Returns
1099 ///
1100 /// Extracted text with optional position information.
1101 ///
1102 /// # Example
1103 ///
1104 /// ```rust,no_run
1105 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1106 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1107 /// # let reader = PdfReader::open("document.pdf")?;
1108 /// # let document = PdfDocument::new(reader);
1109 /// // Extract text from first page only
1110 /// let page_text = document.extract_text_from_page(0)?;
1111 /// println!("First page text: {}", page_text.text);
1112 ///
1113 /// // Access text fragments with positions (if preserved)
1114 /// for fragment in &page_text.fragments {
1115 /// println!("'{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
1116 /// }
1117 /// # Ok(())
1118 /// # }
1119 /// ```
1120 pub fn extract_text_from_page(
1121 &self,
1122 page_index: u32,
1123 ) -> ParseResult<crate::text::ExtractedText> {
1124 let mut extractor = crate::text::TextExtractor::new();
1125 extractor.extract_from_page(self, page_index)
1126 }
1127
1128 /// Extract text from a specific page with custom options.
1129 ///
1130 /// This method combines the functionality of [`extract_text_from_page`] and
1131 /// [`extract_text_with_options`], allowing fine control over extraction
1132 /// behavior for a single page.
1133 ///
1134 /// # Arguments
1135 ///
1136 /// * `page_index` - Zero-based page index
1137 /// * `options` - Text extraction configuration
1138 ///
1139 /// # Returns
1140 ///
1141 /// Extracted text with optional position information.
1142 ///
1143 /// # Example
1144 ///
1145 /// ```rust,no_run
1146 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1147 /// # use oxidize_pdf::text::ExtractionOptions;
1148 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1149 /// # let reader = PdfReader::open("document.pdf")?;
1150 /// # let document = PdfDocument::new(reader);
1151 /// // Use higher space threshold for PDFs with micro-adjustments
1152 /// let options = ExtractionOptions {
1153 /// space_threshold: 0.4,
1154 /// ..Default::default()
1155 /// };
1156 ///
1157 /// let page_text = document.extract_text_from_page_with_options(0, options)?;
1158 /// println!("Text: {}", page_text.text);
1159 /// # Ok(())
1160 /// # }
1161 /// ```
1162 pub fn extract_text_from_page_with_options(
1163 &self,
1164 page_index: u32,
1165 options: crate::text::ExtractionOptions,
1166 ) -> ParseResult<crate::text::ExtractedText> {
1167 let mut extractor = crate::text::TextExtractor::with_options(options);
1168 extractor.extract_from_page(self, page_index)
1169 }
1170
1171 /// Extract text with custom extraction options.
1172 ///
1173 /// Allows fine control over text extraction behavior including
1174 /// layout preservation, spacing thresholds, and more.
1175 ///
1176 /// # Arguments
1177 ///
1178 /// * `options` - Text extraction configuration
1179 ///
1180 /// # Returns
1181 ///
1182 /// A vector of `ExtractedText`, one for each page.
1183 ///
1184 /// # Example
1185 ///
1186 /// ```rust,no_run
1187 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1188 /// # use oxidize_pdf::text::ExtractionOptions;
1189 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1190 /// # let reader = PdfReader::open("document.pdf")?;
1191 /// # let document = PdfDocument::new(reader);
1192 /// // Configure extraction to preserve layout
1193 /// let options = ExtractionOptions {
1194 /// preserve_layout: true,
1195 /// space_threshold: 0.3,
1196 /// newline_threshold: 10.0,
1197 /// ..Default::default()
1198 /// };
1199 ///
1200 /// let extracted_pages = document.extract_text_with_options(options)?;
1201 ///
1202 /// // Text fragments will include position information
1203 /// for page_text in extracted_pages {
1204 /// for fragment in &page_text.fragments {
1205 /// println!("{:?}", fragment);
1206 /// }
1207 /// }
1208 /// # Ok(())
1209 /// # }
1210 /// ```
1211 pub fn extract_text_with_options(
1212 &self,
1213 options: crate::text::ExtractionOptions,
1214 ) -> ParseResult<Vec<crate::text::ExtractedText>> {
1215 let mut extractor = crate::text::TextExtractor::with_options(options);
1216 extractor.extract_from_document(self)
1217 }
1218
1219 /// Get annotations from a specific page.
1220 ///
1221 /// Returns a vector of annotation dictionaries for the specified page.
1222 /// Each annotation dictionary contains properties like Type, Rect, Contents, etc.
1223 ///
1224 /// # Arguments
1225 ///
1226 /// * `page_index` - Zero-based page index
1227 ///
1228 /// # Returns
1229 ///
1230 /// A vector of PdfDictionary objects representing annotations, or an empty vector
1231 /// if the page has no annotations.
1232 ///
1233 /// # Example
1234 ///
1235 /// ```rust,no_run
1236 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1237 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1238 /// # let reader = PdfReader::open("document.pdf")?;
1239 /// # let document = PdfDocument::new(reader);
1240 /// let annotations = document.get_page_annotations(0)?;
1241 /// for annot in &annotations {
1242 /// if let Some(contents) = annot.get("Contents").and_then(|c| c.as_string()) {
1243 /// println!("Annotation: {:?}", contents);
1244 /// }
1245 /// }
1246 /// # Ok(())
1247 /// # }
1248 /// ```
1249 pub fn get_page_annotations(&self, page_index: u32) -> ParseResult<Vec<PdfDictionary>> {
1250 let page = self.get_page(page_index)?;
1251
1252 if let Some(annots_array) = page.get_annotations() {
1253 let mut annotations = Vec::new();
1254 let mut reader = self.reader.borrow_mut();
1255
1256 for annot_ref in &annots_array.0 {
1257 if let Some(ref_nums) = annot_ref.as_reference() {
1258 match reader.get_object(ref_nums.0, ref_nums.1) {
1259 Ok(obj) => {
1260 if let Some(dict) = obj.as_dict() {
1261 annotations.push(dict.clone());
1262 }
1263 }
1264 Err(_) => {
1265 // Skip annotations that can't be loaded
1266 continue;
1267 }
1268 }
1269 }
1270 }
1271
1272 Ok(annotations)
1273 } else {
1274 Ok(Vec::new())
1275 }
1276 }
1277
1278 /// Get all annotations from all pages in the document.
1279 ///
1280 /// Returns a vector of tuples containing (page_index, annotations) for each page
1281 /// that has annotations.
1282 ///
1283 /// # Returns
1284 ///
1285 /// A vector of tuples where the first element is the page index and the second
1286 /// is a vector of annotation dictionaries for that page.
1287 ///
1288 /// # Example
1289 ///
1290 /// ```rust,no_run
1291 /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1292 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1293 /// # let reader = PdfReader::open("document.pdf")?;
1294 /// # let document = PdfDocument::new(reader);
1295 /// let all_annotations = document.get_all_annotations()?;
1296 /// for (page_idx, annotations) in all_annotations {
1297 /// println!("Page {} has {} annotations", page_idx, annotations.len());
1298 /// }
1299 /// # Ok(())
1300 /// # }
1301 /// ```
1302 pub fn get_all_annotations(&self) -> ParseResult<Vec<(u32, Vec<PdfDictionary>)>> {
1303 let page_count = self.page_count()?;
1304 let mut all_annotations = Vec::new();
1305
1306 for i in 0..page_count {
1307 let annotations = self.get_page_annotations(i)?;
1308 if !annotations.is_empty() {
1309 all_annotations.push((i, annotations));
1310 }
1311 }
1312
1313 Ok(all_annotations)
1314 }
1315
1316 // --- VibeCoding Facade Methods ---
1317
1318 /// Export the document to LLM-optimized Markdown format.
1319 ///
1320 /// Delegates to [`crate::ai::export_to_markdown`]. Includes YAML frontmatter
1321 /// with document metadata followed by extracted text content.
1322 #[allow(deprecated)]
1323 pub fn to_markdown(&self) -> crate::error::Result<String> {
1324 crate::ai::export_to_markdown(self)
1325 }
1326
1327 /// Export the document to element-aware Markdown format.
1328 ///
1329 /// Unlike [`to_markdown`](Self::to_markdown), this method classifies elements
1330 /// by type and maps each to its canonical Markdown representation.
1331 pub fn to_element_markdown(&self) -> ParseResult<String> {
1332 let elements = self.partition()?;
1333 let exporter = crate::pipeline::export::ElementMarkdownExporter::default();
1334 Ok(exporter.export(&elements))
1335 }
1336
1337 /// Export the document to a contextual text format for LLM consumption.
1338 ///
1339 /// Delegates to [`crate::ai::export_to_contextual`].
1340 #[allow(deprecated)]
1341 pub fn to_contextual(&self) -> crate::error::Result<String> {
1342 crate::ai::export_to_contextual(self)
1343 }
1344
/// Renders the document as structured JSON.
///
/// Only compiled with the `semantic` feature. Thin facade over
/// [`crate::ai::export_to_json`].
#[cfg(feature = "semantic")]
#[allow(deprecated)]
pub fn to_json(&self) -> crate::error::Result<String> {
    let json = crate::ai::export_to_json(self)?;
    Ok(json)
}
1353
1354 /// Extract and chunk the document into RAG-ready chunks with full metadata.
1355 ///
1356 /// Uses default [`HybridChunkConfig`](crate::pipeline::HybridChunkConfig)
1357 /// (512 tokens, `AnyInlineContent` merge policy). Returns serializable
1358 /// [`RagChunk`](crate::pipeline::RagChunk)s with page numbers, bounding boxes,
1359 /// element types, and heading context — everything a vector store needs.
1360 ///
1361 /// # Example
1362 ///
1363 /// ```rust,no_run
1364 /// use oxidize_pdf::parser::{PdfDocument, PdfReader};
1365 ///
1366 /// let doc = PdfDocument::open("document.pdf")?;
1367 /// let chunks = doc.rag_chunks()?;
1368 /// for chunk in &chunks {
1369 /// println!("Chunk {}: pages {:?}, ~{} tokens",
1370 /// chunk.chunk_index, chunk.page_numbers, chunk.token_estimate);
1371 /// }
1372 /// # Ok::<(), Box<dyn std::error::Error>>(())
1373 /// ```
1374 pub fn rag_chunks(&self) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1375 self.rag_chunks_with(crate::pipeline::HybridChunkConfig::default())
1376 }
1377
1378 /// Extract and chunk the document with a custom chunking configuration.
1379 ///
1380 /// Use this when the default 512-token limit is too large or too small for your
1381 /// vector store or embedding model. All other metadata (pages, bounding boxes,
1382 /// element types, heading context) is identical to [`rag_chunks()`](Self::rag_chunks).
1383 ///
1384 /// # Example
1385 ///
1386 /// ```rust,no_run
1387 /// use oxidize_pdf::parser::{PdfDocument, PdfReader};
1388 /// use oxidize_pdf::pipeline::HybridChunkConfig;
1389 ///
1390 /// let doc = PdfDocument::open("document.pdf")?;
1391 /// let config = HybridChunkConfig {
1392 /// max_tokens: 256,
1393 /// ..HybridChunkConfig::default()
1394 /// };
1395 /// let chunks = doc.rag_chunks_with(config)?;
1396 /// println!("Got {} chunks at 256-token limit", chunks.len());
1397 /// # Ok::<(), Box<dyn std::error::Error>>(())
1398 /// ```
1399 pub fn rag_chunks_with(
1400 &self,
1401 config: crate::pipeline::HybridChunkConfig,
1402 ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1403 let elements = self.partition()?;
1404 let chunker = crate::pipeline::HybridChunker::new(config);
1405 let hybrid_chunks = chunker.chunk(&elements);
1406 Ok(self.build_rag_chunks(&hybrid_chunks, None))
1407 }
1408
1409 /// Build RAG chunks stamped with source-document metadata.
1410 ///
1411 /// Auto-fills `title`/`author`/`creation_date`/`total_pages` from the info
1412 /// dictionary (only where the caller left them `None`); the caller-supplied
1413 /// `source` provides `filename`/`doc_hash` (and may override any auto-filled
1414 /// field). `doc_hash`, when set, becomes the stable prefix of every
1415 /// `chunk_id`. Same chunking pipeline as [`rag_chunks`](Self::rag_chunks).
1416 ///
1417 /// # Example
1418 ///
1419 /// ```rust,no_run
1420 /// use oxidize_pdf::parser::PdfDocument;
1421 /// use oxidize_pdf::pipeline::DocumentSource;
1422 ///
1423 /// let doc = PdfDocument::open("document.pdf")?;
1424 /// let mut source = DocumentSource::default();
1425 /// source.filename = Some("document.pdf".to_string());
1426 /// source.doc_hash = Some("sha256-prefix".to_string());
1427 /// let chunks = doc.rag_chunks_with_source(source)?;
1428 /// # Ok::<(), Box<dyn std::error::Error>>(())
1429 /// ```
1430 pub fn rag_chunks_with_source(
1431 &self,
1432 source: crate::pipeline::DocumentSource,
1433 ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1434 self.rag_chunks_with_source_and_config(
1435 source,
1436 crate::pipeline::HybridChunkConfig::default(),
1437 )
1438 }
1439
1440 /// Like [`rag_chunks_with_source`](Self::rag_chunks_with_source) but with a
1441 /// custom chunking configuration — for callers that need both
1442 /// source-document stamping and a non-default token budget.
1443 ///
1444 /// # Example
1445 ///
1446 /// ```rust,no_run
1447 /// use oxidize_pdf::parser::PdfDocument;
1448 /// use oxidize_pdf::pipeline::{DocumentSource, HybridChunkConfig};
1449 ///
1450 /// let doc = PdfDocument::open("document.pdf")?;
1451 /// let source = DocumentSource::with_file(Some("document.pdf".into()), None);
1452 /// let config = HybridChunkConfig { max_tokens: 256, ..Default::default() };
1453 /// let chunks = doc.rag_chunks_with_source_and_config(source, config)?;
1454 /// # Ok::<(), Box<dyn std::error::Error>>(())
1455 /// ```
1456 pub fn rag_chunks_with_source_and_config(
1457 &self,
1458 mut source: crate::pipeline::DocumentSource,
1459 config: crate::pipeline::HybridChunkConfig,
1460 ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1461 self.autofill_source(&mut source);
1462 let elements = self.partition()?;
1463 let chunker = crate::pipeline::HybridChunker::new(config);
1464 let hybrid_chunks = chunker.chunk(&elements);
1465 Ok(self.build_rag_chunks(&hybrid_chunks, Some(source)))
1466 }
1467
1468 /// Fill `title`/`author`/`creation_date`/`total_pages` from the info
1469 /// dictionary where the caller left them `None`.
1470 fn autofill_source(&self, source: &mut crate::pipeline::DocumentSource) {
1471 if let Ok(meta) = self.metadata() {
1472 source.title = source.title.take().or(meta.title);
1473 source.author = source.author.take().or(meta.author);
1474 source.creation_date = source.creation_date.take().or(meta.creation_date);
1475 source.total_pages = source.total_pages.or(meta.page_count);
1476 }
1477 if source.total_pages.is_none() {
1478 source.total_pages = self.page_count().ok();
1479 }
1480 }
1481
/// Runs a custom [`AnalysisPipeline`](crate::pipeline::AnalysisPipeline)
/// over the document.
///
/// The steps are, in order:
///
/// 1. clone the pipeline's optional source and auto-fill it from the
///    document metadata;
/// 2. partition the document into elements;
/// 3. if a classifier is configured, label the elements;
/// 4. group the elements with the pipeline's chunking strategy;
/// 5. build linked `RagChunk`s (ids, prev/next, metadata, optional source)
///    exactly as the other `rag_chunks*` entry points do;
/// 6. with the `semantic` feature, run every enricher over each chunk's
///    `extra` bag.
///
/// `AnalysisPipeline::new()` reproduces [`rag_chunks`](Self::rag_chunks).
///
/// **Stability:** requires `unstable-spi`; exempt from semver until promoted.
///
/// # Example
///
/// ```rust,no_run
/// # use oxidize_pdf::parser::PdfDocument;
/// # use oxidize_pdf::pipeline::AnalysisPipeline;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = PdfDocument::open("document.pdf")?;
/// // Default pipeline == rag_chunks(); swap in a custom strategy/classifier/
/// // enricher via the builder to extend it.
/// let chunks = doc.rag_chunks_with_pipeline(&AnalysisPipeline::new())?;
/// println!("{} chunks", chunks.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "unstable-spi")]
pub fn rag_chunks_with_pipeline(
    &self,
    pipeline: &crate::pipeline::AnalysisPipeline,
) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
    // Auto-fill a private copy so the caller's pipeline is left untouched.
    let mut source = pipeline.source.clone();
    if let Some(src) = &mut source {
        self.autofill_source(src);
    }

    let mut elements = self.partition()?;

    if let Some(classifier) = pipeline.classifier.as_deref() {
        // Pass 1 (read-only): the classifier may look at neighbouring
        // elements through `ClassifyContext`, so all labels are computed
        // before any element is modified.
        let labels: Vec<Option<crate::pipeline::ClassLabel>> = elements
            .iter()
            .enumerate()
            .map(|(index, element)| {
                let ctx = crate::pipeline::ClassifyContext {
                    elements: &elements,
                    index,
                };
                classifier.classify(element, &ctx)
            })
            .collect();

        // Pass 2 (mutating): store each label that was produced.
        for (element, label) in elements.iter_mut().zip(labels) {
            let Some(label) = label else { continue };
            element.metadata_mut().class_label = Some(label.0.into_owned());
        }
    }

    let max_tokens = pipeline.max_tokens;
    let hybrid: Vec<crate::pipeline::HybridChunk> = pipeline
        .chunking
        .chunk(&elements)
        .into_iter()
        .map(|group| crate::pipeline::HybridChunk::from_group(group, max_tokens))
        .collect();

    // The binding is mutated only by the `semantic`-gated enricher pass
    // below; without that feature `mut` is unused, hence the allow.
    #[allow(unused_mut)]
    let mut chunks = self.build_rag_chunks(&hybrid, source);

    #[cfg(feature = "semantic")]
    if !pipeline.enrichers.is_empty() {
        // Each chunk is paired with the hybrid chunk it was built from, which
        // supplies the source elements. Text and heading path are copied out
        // first so no immutable borrow of the chunk is alive while its
        // metadata is handed out mutably.
        for (chunk, origin) in chunks.iter_mut().zip(hybrid.iter()) {
            let text = chunk.text.clone();
            let heading_path = chunk.metadata.heading_path.clone();
            let ctx = crate::pipeline::EnrichContext {
                text: &text,
                elements: origin.elements(),
                heading_path: &heading_path,
            };
            for enricher in &pipeline.enrichers {
                enricher.enrich(&ctx, &mut chunk.metadata);
            }
        }
    }

    Ok(chunks)
}
1564
1565 /// Build linked [`RagChunk`]s from hybrid chunks, optionally stamping a
1566 /// [`DocumentSource`](crate::pipeline::DocumentSource), then wiring
1567 /// prev/next ids. Shared by all `rag_chunks*` entry points (DRY).
1568 fn build_rag_chunks(
1569 &self,
1570 hybrid_chunks: &[crate::pipeline::HybridChunk],
1571 source: Option<crate::pipeline::DocumentSource>,
1572 ) -> Vec<crate::pipeline::RagChunk> {
1573 let mut chunks: Vec<crate::pipeline::RagChunk> = match &source {
1574 Some(s) => hybrid_chunks
1575 .iter()
1576 .enumerate()
1577 .map(|(i, hc)| crate::pipeline::RagChunk::from_hybrid_chunk_with_source(i, hc, s))
1578 .collect(),
1579 None => hybrid_chunks
1580 .iter()
1581 .enumerate()
1582 .map(|(i, hc)| crate::pipeline::RagChunk::from_hybrid_chunk(i, hc))
1583 .collect(),
1584 };
1585 crate::pipeline::chunk_metadata::link_chunks(&mut chunks);
1586 chunks
1587 }
1588
1589 /// Extract and chunk the document using a pre-configured extraction profile.
1590 ///
1591 /// Combines [`partition_with_profile`](Self::partition_with_profile) with
1592 /// [`HybridChunker`](crate::pipeline::HybridChunker) using default chunking
1593 /// settings. Use [`rag_chunks_with`](Self::rag_chunks_with) when you need
1594 /// to tune `max_tokens` or `overlap_tokens`.
1595 ///
1596 /// # Example
1597 ///
1598 /// ```rust,no_run
1599 /// use oxidize_pdf::parser::PdfDocument;
1600 /// use oxidize_pdf::pipeline::ExtractionProfile;
1601 ///
1602 /// let doc = PdfDocument::open("document.pdf")?;
1603 /// let chunks = doc.rag_chunks_with_profile(ExtractionProfile::Rag)?;
1604 /// println!("Got {} RAG chunks", chunks.len());
1605 /// # Ok::<(), Box<dyn std::error::Error>>(())
1606 /// ```
1607 pub fn rag_chunks_with_profile(
1608 &self,
1609 profile: crate::pipeline::ExtractionProfile,
1610 ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1611 let elements = self.partition_with_profile(profile)?;
1612 let chunker = crate::pipeline::HybridChunker::default();
1613 let hybrid_chunks = chunker.chunk(&elements);
1614 Ok(self.build_rag_chunks(&hybrid_chunks, None))
1615 }
1616
1617 /// Combine a pre-configured extraction profile with a custom chunking config.
1618 ///
1619 /// Use this when you need both profile-tuned partitioning (e.g. `Rag` with
1620 /// XYCut reading order) and a non-default chunk size.
1621 ///
1622 /// # Example
1623 ///
1624 /// ```rust,no_run
1625 /// use oxidize_pdf::parser::PdfDocument;
1626 /// use oxidize_pdf::pipeline::{ExtractionProfile, HybridChunkConfig};
1627 ///
1628 /// let doc = PdfDocument::open("document.pdf")?;
1629 /// let config = HybridChunkConfig { max_tokens: 256, ..Default::default() };
1630 /// let chunks = doc.rag_chunks_with_profile_config(ExtractionProfile::Rag, config)?;
1631 /// # Ok::<(), Box<dyn std::error::Error>>(())
1632 /// ```
1633 pub fn rag_chunks_with_profile_config(
1634 &self,
1635 profile: crate::pipeline::ExtractionProfile,
1636 config: crate::pipeline::HybridChunkConfig,
1637 ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1638 let elements = self.partition_with_profile(profile)?;
1639 let chunker = crate::pipeline::HybridChunker::new(config);
1640 let hybrid_chunks = chunker.chunk(&elements);
1641 Ok(self.build_rag_chunks(&hybrid_chunks, None))
1642 }
1643
/// Serializes the result of [`rag_chunks`](Self::rag_chunks) to a JSON
/// string ready for vector store ingestion.
///
/// # Feature flags
///
/// Requires the `semantic` feature: `oxidize-pdf = { features = ["semantic"] }`.
/// Without it this method is not compiled.
///
/// # Errors
///
/// Propagates chunking errors, and maps a serialization failure to
/// [`ParseError::SerializationError`] carrying the serializer's message.
#[cfg(feature = "semantic")]
pub fn rag_chunks_json(&self) -> ParseResult<String> {
    let chunks = self.rag_chunks()?;
    match serde_json::to_string(&chunks) {
        Ok(json) => Ok(json),
        Err(err) => Err(ParseError::SerializationError(err.to_string())),
    }
}
1655
1656 /// Split the document text into chunks of approximately `target_tokens` size.
1657 ///
1658 /// Uses a default overlap of 10% of the target token count.
1659 #[deprecated(
1660 since = "2.2.0",
1661 note = "Use rag_chunks() for structure-aware RAG chunking"
1662 )]
1663 #[allow(deprecated)]
1664 pub fn chunk(
1665 &self,
1666 target_tokens: usize,
1667 ) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
1668 let overlap = target_tokens / 10;
1669 self.chunk_with(target_tokens, overlap)
1670 }
1671
1672 /// Split the document text into chunks with explicit size and overlap control.
1673 #[deprecated(
1674 since = "2.2.0",
1675 note = "Use rag_chunks_with() for structure-aware RAG chunking"
1676 )]
1677 pub fn chunk_with(
1678 &self,
1679 target_tokens: usize,
1680 overlap: usize,
1681 ) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
1682 let chunker = crate::ai::DocumentChunker::new(target_tokens, overlap);
1683 let extracted = self.extract_text()?;
1684 let page_texts: Vec<(usize, String)> = extracted
1685 .iter()
1686 .enumerate()
1687 .map(|(i, t)| (i + 1, t.text.clone()))
1688 .collect();
1689 chunker
1690 .chunk_text_with_pages(&page_texts)
1691 .map_err(|e| crate::error::PdfError::InvalidStructure(e.to_string()))
1692 }
1693
1694 /// Partition the document into typed elements using default configuration.
1695 ///
1696 /// Extracts text with layout preservation, then classifies fragments into
1697 /// [`Element`](crate::pipeline::Element) variants (Title, Paragraph, Table, etc.).
1698 pub fn partition(&self) -> ParseResult<Vec<crate::pipeline::Element>> {
1699 self.partition_with(crate::pipeline::PartitionConfig::default())
1700 }
1701
1702 /// Partition the document into typed elements with custom configuration.
1703 pub fn partition_with(
1704 &self,
1705 config: crate::pipeline::PartitionConfig,
1706 ) -> ParseResult<Vec<crate::pipeline::Element>> {
1707 let options = crate::text::ExtractionOptions {
1708 preserve_layout: true,
1709 reconstruct_paragraphs: true,
1710 ..Default::default()
1711 };
1712 self.do_partition_pages(options, config)
1713 }
1714
1715 /// Partition the document using a pre-configured extraction profile.
1716 pub fn partition_with_profile(
1717 &self,
1718 profile: crate::pipeline::ExtractionProfile,
1719 ) -> ParseResult<Vec<crate::pipeline::Element>> {
1720 let profile_cfg = profile.config();
1721 let options = crate::text::ExtractionOptions {
1722 preserve_layout: true,
1723 reconstruct_paragraphs: true,
1724 space_threshold: profile_cfg.extraction.space_threshold,
1725 detect_columns: profile_cfg.extraction.detect_columns,
1726 ..crate::text::ExtractionOptions::default()
1727 };
1728 self.do_partition_pages(options, profile_cfg.partition)
1729 }
1730
1731 fn do_partition_pages(
1732 &self,
1733 options: crate::text::ExtractionOptions,
1734 config: crate::pipeline::PartitionConfig,
1735 ) -> ParseResult<Vec<crate::pipeline::Element>> {
1736 // Read the gating flags before `config` is moved into the partitioner,
1737 // so we avoid cloning the config just to inspect two bools.
1738 let extract_graphics = config.detect_tables && config.prefer_ruling_tables;
1739
1740 // The reconstructed `pages` (extracted with `reconstruct_paragraphs = true`)
1741 // merge per-cell fragments into paragraph-granular fragments (issue #261),
1742 // which the ruling-based table detector cannot map back to grid cells. When
1743 // a page actually has a drawn table grid we re-extract just that page with
1744 // `reconstruct_paragraphs = false` to recover cell-granular fragments for
1745 // the detector; the reconstructed fragments still drive prose
1746 // classification. Inherit every other option (notably `space_threshold`
1747 // and `detect_columns`, which profiles override) so cell text is assembled
1748 // identically to the primary pass. Built before `options` is moved into
1749 // `extract_text_with_options`.
1750 let mut raw_options = options.clone();
1751 raw_options.reconstruct_paragraphs = false;
1752
1753 let pages = self.extract_text_with_options(options)?;
1754
1755 let partitioner = crate::pipeline::Partitioner::new(config);
1756 let mut graphics_extractor = crate::graphics::extraction::GraphicsExtractor::default();
1757 // Extracting per table-bearing page (rather than a second whole-document
1758 // pass) keeps the cost proportional to pages that need it and zero for
1759 // table-free documents even with `prefer_ruling_tables` on.
1760 let mut raw_extractor = crate::text::TextExtractor::with_options(raw_options);
1761
1762 let mut all_elements = Vec::new();
1763 for (page_idx, page_text) in pages.iter().enumerate() {
1764 let page_idx_u32 = u32::try_from(page_idx).map_err(|_| ParseError::SyntaxError {
1765 position: 0,
1766 message: format!("Page index {} exceeds u32 range", page_idx),
1767 })?;
1768 let page_height = self
1769 .get_page(page_idx_u32)
1770 .map(|p| p.height())
1771 .unwrap_or(842.0);
1772 let page_graphics = if extract_graphics {
1773 graphics_extractor.extract_from_page(self, page_idx).ok()
1774 } else {
1775 None
1776 };
1777 // Re-extract cell-granular fragments only for pages with a drawn grid.
1778 let raw_page = if page_graphics
1779 .as_ref()
1780 .is_some_and(|g| g.has_table_structure())
1781 {
1782 raw_extractor.extract_from_page(self, page_idx_u32).ok()
1783 } else {
1784 None
1785 };
1786 let raw_fragments = raw_page.as_ref().map(|pt| pt.fragments.as_slice());
1787 let elements = partitioner.partition_fragments_with_graphics_raw(
1788 &page_text.fragments,
1789 raw_fragments,
1790 page_graphics.as_ref(),
1791 page_idx_u32,
1792 page_height,
1793 );
1794 all_elements.extend(elements);
1795 }
1796
1797 Ok(all_elements)
1798 }
1799
1800 /// Partition the document into typed elements and build a relationship graph.
1801 ///
1802 /// Returns a tuple of `(elements, graph)` where the graph captures parent/child
1803 /// and next/prev relationships between elements by index.
1804 ///
1805 /// # Example
1806 ///
1807 /// ```rust,no_run
1808 /// use oxidize_pdf::parser::PdfDocument;
1809 /// use oxidize_pdf::pipeline::PartitionConfig;
1810 ///
1811 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
1812 /// let doc = PdfDocument::open("document.pdf")?;
1813 /// let (elements, graph) = doc.partition_graph(PartitionConfig::default())?;
1814 ///
1815 /// for title_idx in graph.top_level_sections() {
1816 /// println!("Section: {}", elements[title_idx].text());
1817 /// for child_idx in graph.elements_in_section(title_idx) {
1818 /// println!(" {}", elements[child_idx].text());
1819 /// }
1820 /// }
1821 /// # Ok(())
1822 /// # }
1823 /// ```
1824 pub fn partition_graph(
1825 &self,
1826 config: crate::pipeline::PartitionConfig,
1827 ) -> ParseResult<(Vec<crate::pipeline::Element>, crate::pipeline::ElementGraph)> {
1828 let elements = self.partition_with(config)?;
1829 let graph = crate::pipeline::ElementGraph::build(&elements);
1830 Ok((elements, graph))
1831 }
1832}
1833
1834impl PdfDocument<File> {
1835 /// Open a PDF file by path — the simplest way to start working with a PDF.
1836 ///
1837 /// This is a convenience method that combines `PdfReader::open()` and
1838 /// `PdfDocument::new()` into a single call.
1839 ///
1840 /// # Example
1841 ///
1842 /// ```rust,no_run
1843 /// use oxidize_pdf::parser::PdfDocument;
1844 ///
1845 /// let doc = PdfDocument::open("report.pdf").unwrap();
1846 /// let text = doc.extract_text().unwrap();
1847 /// let markdown = doc.to_markdown().unwrap();
1848 /// ```
1849 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
1850 PdfReader::open_document(path)
1851 }
1852}
1853
1854#[cfg(test)]
1855mod tests {
1856 use super::*;
1857 use crate::parser::objects::{PdfObject, PdfString};
1858 use std::io::Cursor;
1859
// Test helper: builds the bytes of a minimal single-page PDF 1.4 file in
// memory (catalog, page tree, one page, xref table and trailer).
fn create_minimal_pdf() -> Vec<u8> {
    // Header followed by objects 1 (catalog), 2 (pages) and 3 (page).
    const BODY: &[&[u8]] = &[
        b"%PDF-1.4\n",
        b"1 0 obj\n",
        b"<< /Type /Catalog /Pages 2 0 R >>\n",
        b"endobj\n",
        b"2 0 obj\n",
        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n",
        b"endobj\n",
        b"3 0 obj\n",
        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>\n",
        b"endobj\n",
    ];

    // Cross-reference table (the byte offsets 9, 58 and 115 match the object
    // positions produced by BODY) and the fixed part of the trailer.
    const XREF_AND_TRAILER: &[&[u8]] = &[
        b"xref\n",
        b"0 4\n",
        b"0000000000 65535 f \n",
        b"0000000009 00000 n \n",
        b"0000000058 00000 n \n",
        b"0000000115 00000 n \n",
        b"trailer\n",
        b"<< /Size 4 /Root 1 0 R >>\n",
        b"startxref\n",
    ];

    let mut pdf = BODY.concat();

    // The xref table starts right after the last object.
    let xref_pos = pdf.len();
    for part in XREF_AND_TRAILER {
        pdf.extend_from_slice(part);
    }

    pdf.extend_from_slice(format!("{xref_pos}\n").as_bytes());
    pdf.extend_from_slice(b"%%EOF\n");

    pdf
}
1902
1903 // Helper to create a PDF with metadata
1904 fn create_pdf_with_metadata() -> Vec<u8> {
1905 let mut pdf = Vec::new();
1906
1907 // PDF header
1908 pdf.extend_from_slice(b"%PDF-1.5\n");
1909
1910 // Record positions for xref
1911 let obj1_pos = pdf.len();
1912
1913 // Catalog object
1914 pdf.extend_from_slice(b"1 0 obj\n");
1915 pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
1916 pdf.extend_from_slice(b"endobj\n");
1917
1918 let obj2_pos = pdf.len();
1919
1920 // Pages object
1921 pdf.extend_from_slice(b"2 0 obj\n");
1922 pdf.extend_from_slice(b"<< /Type /Pages /Kids [] /Count 0 >>\n");
1923 pdf.extend_from_slice(b"endobj\n");
1924
1925 let obj3_pos = pdf.len();
1926
1927 // Info object
1928 pdf.extend_from_slice(b"3 0 obj\n");
1929 pdf.extend_from_slice(
1930 b"<< /Title (Test Document) /Author (Test Author) /Subject (Test Subject) >>\n",
1931 );
1932 pdf.extend_from_slice(b"endobj\n");
1933
1934 // Cross-reference table
1935 let xref_pos = pdf.len();
1936 pdf.extend_from_slice(b"xref\n");
1937 pdf.extend_from_slice(b"0 4\n");
1938 pdf.extend_from_slice(b"0000000000 65535 f \n");
1939 pdf.extend_from_slice(format!("{obj1_pos:010} 00000 n \n").as_bytes());
1940 pdf.extend_from_slice(format!("{obj2_pos:010} 00000 n \n").as_bytes());
1941 pdf.extend_from_slice(format!("{obj3_pos:010} 00000 n \n").as_bytes());
1942
1943 // Trailer
1944 pdf.extend_from_slice(b"trailer\n");
1945 pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R /Info 3 0 R >>\n");
1946 pdf.extend_from_slice(b"startxref\n");
1947 pdf.extend_from_slice(format!("{xref_pos}\n").as_bytes());
1948 pdf.extend_from_slice(b"%%EOF\n");
1949
1950 pdf
1951 }
1952
1953 #[test]
1954 fn test_pdf_document_new() {
1955 let pdf_data = create_minimal_pdf();
1956 let cursor = Cursor::new(pdf_data);
1957 let reader = PdfReader::new(cursor).unwrap();
1958 let document = PdfDocument::new(reader);
1959
1960 // Verify document is created with empty caches
1961 assert!(document.page_tree.borrow().is_none());
1962 assert!(document.metadata_cache.borrow().is_none());
1963 }
1964
1965 #[test]
1966 fn test_version() {
1967 let pdf_data = create_minimal_pdf();
1968 let cursor = Cursor::new(pdf_data);
1969 let reader = PdfReader::new(cursor).unwrap();
1970 let document = PdfDocument::new(reader);
1971
1972 let version = document.version().unwrap();
1973 assert_eq!(version, "1.4");
1974 }
1975
1976 #[test]
1977 fn test_page_count() {
1978 let pdf_data = create_minimal_pdf();
1979 let cursor = Cursor::new(pdf_data);
1980 let reader = PdfReader::new(cursor).unwrap();
1981 let document = PdfDocument::new(reader);
1982
1983 let count = document.page_count().unwrap();
1984 assert_eq!(count, 1);
1985 }
1986
1987 #[test]
1988 fn test_metadata() {
1989 let pdf_data = create_pdf_with_metadata();
1990 let cursor = Cursor::new(pdf_data);
1991 let reader = PdfReader::new(cursor).unwrap();
1992 let document = PdfDocument::new(reader);
1993
1994 let metadata = document.metadata().unwrap();
1995 assert_eq!(metadata.title, Some("Test Document".to_string()));
1996 assert_eq!(metadata.author, Some("Test Author".to_string()));
1997 assert_eq!(metadata.subject, Some("Test Subject".to_string()));
1998
1999 // Verify caching works
2000 let metadata2 = document.metadata().unwrap();
2001 assert_eq!(metadata.title, metadata2.title);
2002 }
2003
2004 #[test]
2005 fn test_get_page() {
2006 let pdf_data = create_minimal_pdf();
2007 let cursor = Cursor::new(pdf_data);
2008 let reader = PdfReader::new(cursor).unwrap();
2009 let document = PdfDocument::new(reader);
2010
2011 // Get first page
2012 let page = document.get_page(0).unwrap();
2013 assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
2014
2015 // Verify caching works
2016 let page2 = document.get_page(0).unwrap();
2017 assert_eq!(page.media_box, page2.media_box);
2018 }
2019
2020 #[test]
2021 fn test_get_page_out_of_bounds() {
2022 let pdf_data = create_minimal_pdf();
2023 let cursor = Cursor::new(pdf_data);
2024 let reader = PdfReader::new(cursor).unwrap();
2025 let document = PdfDocument::new(reader);
2026
2027 // Try to get page that doesn't exist
2028 let result = document.get_page(10);
2029 // With fallback lookup, this might succeed or fail gracefully
2030 if result.is_err() {
2031 assert!(result.unwrap_err().to_string().contains("Page"));
2032 } else {
2033 // If succeeds, should return a valid page
2034 let _page = result.unwrap();
2035 }
2036 }
2037
2038 #[test]
2039 fn test_resource_manager_caching() {
2040 let resources = ResourceManager::new();
2041
2042 // Test caching an object
2043 let obj_ref = (1, 0);
2044 let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
2045
2046 assert!(resources.get_cached(obj_ref).is_none());
2047
2048 resources.cache_object(obj_ref, obj.clone());
2049
2050 let cached = resources.get_cached(obj_ref).unwrap();
2051 assert_eq!(cached, obj);
2052
2053 // Test clearing cache
2054 resources.clear_cache();
2055 assert!(resources.get_cached(obj_ref).is_none());
2056 }
2057
2058 #[test]
2059 fn test_get_object() {
2060 let pdf_data = create_minimal_pdf();
2061 let cursor = Cursor::new(pdf_data);
2062 let reader = PdfReader::new(cursor).unwrap();
2063 let document = PdfDocument::new(reader);
2064
2065 // Get catalog object
2066 let catalog = document.get_object(1, 0).unwrap();
2067 if let PdfObject::Dictionary(dict) = catalog {
2068 if let Some(PdfObject::Name(name)) = dict.get("Type") {
2069 assert_eq!(name.0, "Catalog");
2070 } else {
2071 panic!("Expected /Type name");
2072 }
2073 } else {
2074 panic!("Expected dictionary object");
2075 }
2076 }
2077
2078 #[test]
2079 fn test_resolve_reference() {
2080 let pdf_data = create_minimal_pdf();
2081 let cursor = Cursor::new(pdf_data);
2082 let reader = PdfReader::new(cursor).unwrap();
2083 let document = PdfDocument::new(reader);
2084
2085 // Create a reference to the catalog
2086 let ref_obj = PdfObject::Reference(1, 0);
2087
2088 // Resolve it
2089 let resolved = document.resolve(&ref_obj).unwrap();
2090 if let PdfObject::Dictionary(dict) = resolved {
2091 if let Some(PdfObject::Name(name)) = dict.get("Type") {
2092 assert_eq!(name.0, "Catalog");
2093 } else {
2094 panic!("Expected /Type name");
2095 }
2096 } else {
2097 panic!("Expected dictionary object");
2098 }
2099 }
2100
2101 #[test]
2102 fn test_resolve_non_reference() {
2103 let pdf_data = create_minimal_pdf();
2104 let cursor = Cursor::new(pdf_data);
2105 let reader = PdfReader::new(cursor).unwrap();
2106 let document = PdfDocument::new(reader);
2107
2108 // Try to resolve a non-reference object
2109 let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
2110 let resolved = document.resolve(&obj).unwrap();
2111
2112 // Should return the same object
2113 assert_eq!(resolved, obj);
2114 }
2115
2116 #[test]
2117 fn test_invalid_pdf_data() {
2118 let invalid_data = b"This is not a PDF";
2119 let cursor = Cursor::new(invalid_data.to_vec());
2120 let result = PdfReader::new(cursor);
2121
2122 assert!(result.is_err());
2123 }
2124
2125 #[test]
2126 fn test_empty_page_tree() {
2127 // Create PDF with empty page tree
2128 let pdf_data = create_pdf_with_metadata(); // This has 0 pages
2129 let cursor = Cursor::new(pdf_data);
2130 let reader = PdfReader::new(cursor).unwrap();
2131 let document = PdfDocument::new(reader);
2132
2133 let count = document.page_count().unwrap();
2134 assert_eq!(count, 0);
2135
2136 // Try to get a page from empty document
2137 let result = document.get_page(0);
2138 assert!(result.is_err());
2139 }
2140
2141 #[test]
2142 fn test_extract_text_empty_document() {
2143 let pdf_data = create_pdf_with_metadata();
2144 let cursor = Cursor::new(pdf_data);
2145 let reader = PdfReader::new(cursor).unwrap();
2146 let document = PdfDocument::new(reader);
2147
2148 let text = document.extract_text().unwrap();
2149 assert!(text.is_empty());
2150 }
2151
2152 #[test]
2153 fn test_concurrent_access() {
2154 let pdf_data = create_minimal_pdf();
2155 let cursor = Cursor::new(pdf_data);
2156 let reader = PdfReader::new(cursor).unwrap();
2157 let document = PdfDocument::new(reader);
2158
2159 // Access multiple things concurrently
2160 let version = document.version().unwrap();
2161 let count = document.page_count().unwrap();
2162 let page = document.get_page(0).unwrap();
2163
2164 assert_eq!(version, "1.4");
2165 assert_eq!(count, 1);
2166 assert_eq!(page.media_box[2], 612.0);
2167 }
2168
2169 // Additional comprehensive tests
2170 mod comprehensive_tests {
2171 use super::*;
2172
2173 #[test]
2174 fn test_resource_manager_default() {
2175 let resources = ResourceManager::default();
2176 assert!(resources.get_cached((1, 0)).is_none());
2177 }
2178
2179 #[test]
2180 fn test_resource_manager_multiple_objects() {
2181 let resources = ResourceManager::new();
2182
2183 // Cache multiple objects
2184 resources.cache_object((1, 0), PdfObject::Integer(42));
2185 resources.cache_object((2, 0), PdfObject::Boolean(true));
2186 resources.cache_object(
2187 (3, 0),
2188 PdfObject::String(PdfString("test".as_bytes().to_vec())),
2189 );
2190
2191 // Verify all are cached
2192 assert!(resources.get_cached((1, 0)).is_some());
2193 assert!(resources.get_cached((2, 0)).is_some());
2194 assert!(resources.get_cached((3, 0)).is_some());
2195
2196 // Clear and verify empty
2197 resources.clear_cache();
2198 assert!(resources.get_cached((1, 0)).is_none());
2199 assert!(resources.get_cached((2, 0)).is_none());
2200 assert!(resources.get_cached((3, 0)).is_none());
2201 }
2202
2203 #[test]
2204 fn test_resource_manager_object_overwrite() {
2205 let resources = ResourceManager::new();
2206
2207 // Cache an object
2208 resources.cache_object((1, 0), PdfObject::Integer(42));
2209 assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Integer(42)));
2210
2211 // Overwrite with different object
2212 resources.cache_object((1, 0), PdfObject::Boolean(true));
2213 assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Boolean(true)));
2214 }
2215
2216 #[test]
2217 fn test_get_object_caching() {
2218 let pdf_data = create_minimal_pdf();
2219 let cursor = Cursor::new(pdf_data);
2220 let reader = PdfReader::new(cursor).unwrap();
2221 let document = PdfDocument::new(reader);
2222
2223 // Get object first time (should cache)
2224 let obj1 = document.get_object(1, 0).unwrap();
2225
2226 // Get same object again (should use cache)
2227 let obj2 = document.get_object(1, 0).unwrap();
2228
2229 // Objects should be identical
2230 assert_eq!(obj1, obj2);
2231
2232 // Verify it's cached
2233 assert!(document.resources.get_cached((1, 0)).is_some());
2234 }
2235
2236 #[test]
2237 fn test_get_object_different_generations() {
2238 let pdf_data = create_minimal_pdf();
2239 let cursor = Cursor::new(pdf_data);
2240 let reader = PdfReader::new(cursor).unwrap();
2241 let document = PdfDocument::new(reader);
2242
2243 // Get object with generation 0
2244 let _obj1 = document.get_object(1, 0).unwrap();
2245
2246 // Try to get same object with different generation (should fail)
2247 let result = document.get_object(1, 1);
2248 assert!(result.is_err());
2249
2250 // Original should still be cached
2251 assert!(document.resources.get_cached((1, 0)).is_some());
2252 }
2253
2254 #[test]
2255 fn test_get_object_nonexistent() {
2256 let pdf_data = create_minimal_pdf();
2257 let cursor = Cursor::new(pdf_data);
2258 let reader = PdfReader::new(cursor).unwrap();
2259 let document = PdfDocument::new(reader);
2260
2261 // Try to get non-existent object
2262 let result = document.get_object(999, 0);
2263 assert!(result.is_err());
2264 }
2265
2266 #[test]
2267 fn test_resolve_nested_references() {
2268 let pdf_data = create_minimal_pdf();
2269 let cursor = Cursor::new(pdf_data);
2270 let reader = PdfReader::new(cursor).unwrap();
2271 let document = PdfDocument::new(reader);
2272
2273 // Test resolving a reference
2274 let ref_obj = PdfObject::Reference(2, 0);
2275 let resolved = document.resolve(&ref_obj).unwrap();
2276
2277 // Should resolve to the pages object
2278 if let PdfObject::Dictionary(dict) = resolved {
2279 if let Some(PdfObject::Name(name)) = dict.get("Type") {
2280 assert_eq!(name.0, "Pages");
2281 }
2282 }
2283 }
2284
2285 #[test]
2286 fn test_resolve_various_object_types() {
2287 let pdf_data = create_minimal_pdf();
2288 let cursor = Cursor::new(pdf_data);
2289 let reader = PdfReader::new(cursor).unwrap();
2290 let document = PdfDocument::new(reader);
2291
2292 // Test resolving different object types
2293 let test_objects = vec![
2294 PdfObject::Integer(42),
2295 PdfObject::Boolean(true),
2296 PdfObject::String(PdfString("test".as_bytes().to_vec())),
2297 PdfObject::Real(3.14),
2298 PdfObject::Null,
2299 ];
2300
2301 for obj in test_objects {
2302 let resolved = document.resolve(&obj).unwrap();
2303 assert_eq!(resolved, obj);
2304 }
2305 }
2306
2307 #[test]
2308 fn test_get_page_cached() {
2309 let pdf_data = create_minimal_pdf();
2310 let cursor = Cursor::new(pdf_data);
2311 let reader = PdfReader::new(cursor).unwrap();
2312 let document = PdfDocument::new(reader);
2313
2314 // Get page first time
2315 let page1 = document.get_page(0).unwrap();
2316
2317 // Get same page again
2318 let page2 = document.get_page(0).unwrap();
2319
2320 // Should be identical
2321 assert_eq!(page1.media_box, page2.media_box);
2322 assert_eq!(page1.rotation, page2.rotation);
2323 assert_eq!(page1.obj_ref, page2.obj_ref);
2324 }
2325
2326 #[test]
2327 fn test_metadata_caching() {
2328 let pdf_data = create_pdf_with_metadata();
2329 let cursor = Cursor::new(pdf_data);
2330 let reader = PdfReader::new(cursor).unwrap();
2331 let document = PdfDocument::new(reader);
2332
2333 // Get metadata first time
2334 let meta1 = document.metadata().unwrap();
2335
2336 // Get metadata again
2337 let meta2 = document.metadata().unwrap();
2338
2339 // Should be identical
2340 assert_eq!(meta1.title, meta2.title);
2341 assert_eq!(meta1.author, meta2.author);
2342 assert_eq!(meta1.subject, meta2.subject);
2343 assert_eq!(meta1.version, meta2.version);
2344 }
2345
2346 #[test]
2347 fn test_page_tree_initialization() {
2348 let pdf_data = create_minimal_pdf();
2349 let cursor = Cursor::new(pdf_data);
2350 let reader = PdfReader::new(cursor).unwrap();
2351 let document = PdfDocument::new(reader);
2352
2353 // Initially page tree should be None
2354 assert!(document.page_tree.borrow().is_none());
2355
2356 // After getting page count, page tree should be initialized
2357 let _count = document.page_count().unwrap();
2358 // Note: page_tree is private, so we can't directly check it
2359 // But we can verify it works by getting a page
2360 let _page = document.get_page(0).unwrap();
2361 }
2362
2363 #[test]
2364 fn test_get_page_resources() {
2365 let pdf_data = create_minimal_pdf();
2366 let cursor = Cursor::new(pdf_data);
2367 let reader = PdfReader::new(cursor).unwrap();
2368 let document = PdfDocument::new(reader);
2369
2370 let page = document.get_page(0).unwrap();
2371 let resources = document.get_page_resources(&page).unwrap();
2372
2373 // The minimal PDF has empty resources
2374 assert!(resources.is_some());
2375 }
2376
2377 #[test]
2378 fn test_get_page_content_streams_empty() {
2379 let pdf_data = create_minimal_pdf();
2380 let cursor = Cursor::new(pdf_data);
2381 let reader = PdfReader::new(cursor).unwrap();
2382 let document = PdfDocument::new(reader);
2383
2384 let page = document.get_page(0).unwrap();
2385 let streams = document.get_page_content_streams(&page).unwrap();
2386
2387 // Minimal PDF has no content streams
2388 assert!(streams.is_empty());
2389 }
2390
2391 #[test]
2392 fn test_extract_text_from_page() {
2393 let pdf_data = create_minimal_pdf();
2394 let cursor = Cursor::new(pdf_data);
2395 let reader = PdfReader::new(cursor).unwrap();
2396 let document = PdfDocument::new(reader);
2397
2398 let result = document.extract_text_from_page(0);
2399 // Should succeed even with empty page
2400 assert!(result.is_ok());
2401 }
2402
2403 #[test]
2404 fn test_extract_text_from_page_out_of_bounds() {
2405 let pdf_data = create_minimal_pdf();
2406 let cursor = Cursor::new(pdf_data);
2407 let reader = PdfReader::new(cursor).unwrap();
2408 let document = PdfDocument::new(reader);
2409
2410 let result = document.extract_text_from_page(999);
2411 // With fallback lookup, this might succeed or fail gracefully
2412 if result.is_err() {
2413 assert!(result.unwrap_err().to_string().contains("Page"));
2414 } else {
2415 // If succeeds, should return empty or valid text
2416 let _text = result.unwrap();
2417 }
2418 }
2419
2420 #[test]
2421 fn test_extract_text_with_options() {
2422 let pdf_data = create_minimal_pdf();
2423 let cursor = Cursor::new(pdf_data);
2424 let reader = PdfReader::new(cursor).unwrap();
2425 let document = PdfDocument::new(reader);
2426
2427 let options = crate::text::ExtractionOptions {
2428 preserve_layout: true,
2429 space_threshold: 0.5,
2430 newline_threshold: 15.0,
2431 ..Default::default()
2432 };
2433
2434 let result = document.extract_text_with_options(options);
2435 assert!(result.is_ok());
2436 }
2437
2438 #[test]
2439 fn test_version_different_pdf_versions() {
2440 // Test with different PDF versions
2441 let versions = vec!["1.3", "1.4", "1.5", "1.6", "1.7"];
2442
2443 for version in versions {
2444 let mut pdf_data = Vec::new();
2445
2446 // PDF header
2447 pdf_data.extend_from_slice(format!("%PDF-{version}\n").as_bytes());
2448
2449 // Track positions for xref
2450 let obj1_pos = pdf_data.len();
2451
2452 // Catalog object
2453 pdf_data.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
2454
2455 let obj2_pos = pdf_data.len();
2456
2457 // Pages object
2458 pdf_data
2459 .extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
2460
2461 // Cross-reference table
2462 let xref_pos = pdf_data.len();
2463 pdf_data.extend_from_slice(b"xref\n");
2464 pdf_data.extend_from_slice(b"0 3\n");
2465 pdf_data.extend_from_slice(b"0000000000 65535 f \n");
2466 pdf_data.extend_from_slice(format!("{obj1_pos:010} 00000 n \n").as_bytes());
2467 pdf_data.extend_from_slice(format!("{obj2_pos:010} 00000 n \n").as_bytes());
2468
2469 // Trailer
2470 pdf_data.extend_from_slice(b"trailer\n");
2471 pdf_data.extend_from_slice(b"<< /Size 3 /Root 1 0 R >>\n");
2472 pdf_data.extend_from_slice(b"startxref\n");
2473 pdf_data.extend_from_slice(format!("{xref_pos}\n").as_bytes());
2474 pdf_data.extend_from_slice(b"%%EOF\n");
2475
2476 let cursor = Cursor::new(pdf_data);
2477 let reader = PdfReader::new(cursor).unwrap();
2478 let document = PdfDocument::new(reader);
2479
2480 let pdf_version = document.version().unwrap();
2481 assert_eq!(pdf_version, version);
2482 }
2483 }
2484
2485 #[test]
2486 fn test_page_count_zero() {
2487 let pdf_data = create_pdf_with_metadata(); // Has 0 pages
2488 let cursor = Cursor::new(pdf_data);
2489 let reader = PdfReader::new(cursor).unwrap();
2490 let document = PdfDocument::new(reader);
2491
2492 let count = document.page_count().unwrap();
2493 assert_eq!(count, 0);
2494 }
2495
2496 #[test]
2497 fn test_multiple_object_access() {
2498 let pdf_data = create_minimal_pdf();
2499 let cursor = Cursor::new(pdf_data);
2500 let reader = PdfReader::new(cursor).unwrap();
2501 let document = PdfDocument::new(reader);
2502
2503 // Access multiple objects
2504 let catalog = document.get_object(1, 0).unwrap();
2505 let pages = document.get_object(2, 0).unwrap();
2506 let page = document.get_object(3, 0).unwrap();
2507
2508 // Verify they're all different objects
2509 assert_ne!(catalog, pages);
2510 assert_ne!(pages, page);
2511 assert_ne!(catalog, page);
2512 }
2513
2514 #[test]
2515 fn test_error_handling_invalid_object_reference() {
2516 let pdf_data = create_minimal_pdf();
2517 let cursor = Cursor::new(pdf_data);
2518 let reader = PdfReader::new(cursor).unwrap();
2519 let document = PdfDocument::new(reader);
2520
2521 // Try to resolve an invalid reference
2522 let invalid_ref = PdfObject::Reference(999, 0);
2523 let result = document.resolve(&invalid_ref);
2524 assert!(result.is_err());
2525 }
2526
2527 #[test]
2528 fn test_concurrent_metadata_access() {
2529 let pdf_data = create_pdf_with_metadata();
2530 let cursor = Cursor::new(pdf_data);
2531 let reader = PdfReader::new(cursor).unwrap();
2532 let document = PdfDocument::new(reader);
2533
2534 // Access metadata and other properties concurrently
2535 let metadata = document.metadata().unwrap();
2536 let version = document.version().unwrap();
2537 let count = document.page_count().unwrap();
2538
2539 assert_eq!(metadata.title, Some("Test Document".to_string()));
2540 assert_eq!(version, "1.5");
2541 assert_eq!(count, 0);
2542 }
2543
2544 #[test]
2545 fn test_page_properties_comprehensive() {
2546 let pdf_data = create_minimal_pdf();
2547 let cursor = Cursor::new(pdf_data);
2548 let reader = PdfReader::new(cursor).unwrap();
2549 let document = PdfDocument::new(reader);
2550
2551 let page = document.get_page(0).unwrap();
2552
2553 // Test all page properties
2554 assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
2555 assert_eq!(page.crop_box, None);
2556 assert_eq!(page.rotation, 0);
2557 assert_eq!(page.obj_ref, (3, 0));
2558
2559 // Test width/height calculation
2560 assert_eq!(page.width(), 612.0);
2561 assert_eq!(page.height(), 792.0);
2562 }
2563
2564 #[test]
2565 fn test_memory_usage_efficiency() {
2566 let pdf_data = create_minimal_pdf();
2567 let cursor = Cursor::new(pdf_data);
2568 let reader = PdfReader::new(cursor).unwrap();
2569 let document = PdfDocument::new(reader);
2570
2571 // Access same page multiple times
2572 for _ in 0..10 {
2573 let _page = document.get_page(0).unwrap();
2574 }
2575
2576 // Should only have one copy in cache
2577 let page_count = document.page_count().unwrap();
2578 assert_eq!(page_count, 1);
2579 }
2580
2581 #[test]
2582 fn test_reader_borrow_safety() {
2583 let pdf_data = create_minimal_pdf();
2584 let cursor = Cursor::new(pdf_data);
2585 let reader = PdfReader::new(cursor).unwrap();
2586 let document = PdfDocument::new(reader);
2587
2588 // Multiple concurrent borrows should work
2589 let version = document.version().unwrap();
2590 let count = document.page_count().unwrap();
2591 let metadata = document.metadata().unwrap();
2592
2593 assert_eq!(version, "1.4");
2594 assert_eq!(count, 1);
2595 assert!(metadata.title.is_none());
2596 }
2597
2598 #[test]
2599 fn test_cache_consistency() {
2600 let pdf_data = create_minimal_pdf();
2601 let cursor = Cursor::new(pdf_data);
2602 let reader = PdfReader::new(cursor).unwrap();
2603 let document = PdfDocument::new(reader);
2604
2605 // Get object and verify caching
2606 let obj1 = document.get_object(1, 0).unwrap();
2607 let cached = document.resources.get_cached((1, 0)).unwrap();
2608
2609 assert_eq!(obj1, cached);
2610
2611 // Clear cache and get object again
2612 document.resources.clear_cache();
2613 let obj2 = document.get_object(1, 0).unwrap();
2614
2615 // Should be same content but loaded fresh
2616 assert_eq!(obj1, obj2);
2617 }
2618 }
2619
2620 #[test]
2621 fn test_resource_manager_new() {
2622 let resources = ResourceManager::new();
2623 assert!(resources.get_cached((1, 0)).is_none());
2624 }
2625
2626 #[test]
2627 fn test_resource_manager_cache_and_get() {
2628 let resources = ResourceManager::new();
2629
2630 // Cache an object
2631 let obj = PdfObject::Integer(42);
2632 resources.cache_object((10, 0), obj.clone());
2633
2634 // Should be retrievable
2635 let cached = resources.get_cached((10, 0));
2636 assert!(cached.is_some());
2637 assert_eq!(cached.unwrap(), obj);
2638
2639 // Non-existent object
2640 assert!(resources.get_cached((11, 0)).is_none());
2641 }
2642
2643 #[test]
2644 fn test_resource_manager_clear_cache() {
2645 let resources = ResourceManager::new();
2646
2647 // Cache multiple objects
2648 resources.cache_object((1, 0), PdfObject::Integer(1));
2649 resources.cache_object((2, 0), PdfObject::Integer(2));
2650 resources.cache_object((3, 0), PdfObject::Integer(3));
2651
2652 // Verify they're cached
2653 assert!(resources.get_cached((1, 0)).is_some());
2654 assert!(resources.get_cached((2, 0)).is_some());
2655 assert!(resources.get_cached((3, 0)).is_some());
2656
2657 // Clear cache
2658 resources.clear_cache();
2659
2660 // Should all be gone
2661 assert!(resources.get_cached((1, 0)).is_none());
2662 assert!(resources.get_cached((2, 0)).is_none());
2663 assert!(resources.get_cached((3, 0)).is_none());
2664 }
2665
2666 #[test]
2667 fn test_resource_manager_overwrite_cached() {
2668 let resources = ResourceManager::new();
2669
2670 // Cache initial object
2671 resources.cache_object((1, 0), PdfObject::Integer(42));
2672 assert_eq!(
2673 resources.get_cached((1, 0)).unwrap(),
2674 PdfObject::Integer(42)
2675 );
2676
2677 // Overwrite with new object
2678 resources.cache_object((1, 0), PdfObject::Integer(100));
2679 assert_eq!(
2680 resources.get_cached((1, 0)).unwrap(),
2681 PdfObject::Integer(100)
2682 );
2683 }
2684
2685 #[test]
2686 fn test_resource_manager_multiple_generations() {
2687 let resources = ResourceManager::new();
2688
2689 // Cache objects with different generations
2690 resources.cache_object((1, 0), PdfObject::Integer(10));
2691 resources.cache_object((1, 1), PdfObject::Integer(11));
2692 resources.cache_object((1, 2), PdfObject::Integer(12));
2693
2694 // Each should be distinct
2695 assert_eq!(
2696 resources.get_cached((1, 0)).unwrap(),
2697 PdfObject::Integer(10)
2698 );
2699 assert_eq!(
2700 resources.get_cached((1, 1)).unwrap(),
2701 PdfObject::Integer(11)
2702 );
2703 assert_eq!(
2704 resources.get_cached((1, 2)).unwrap(),
2705 PdfObject::Integer(12)
2706 );
2707 }
2708
2709 #[test]
2710 fn test_resource_manager_cache_complex_objects() {
2711 let resources = ResourceManager::new();
2712
2713 // Cache different object types
2714 resources.cache_object((1, 0), PdfObject::Boolean(true));
2715 resources.cache_object((2, 0), PdfObject::Real(3.14159));
2716 resources.cache_object(
2717 (3, 0),
2718 PdfObject::String(PdfString::new(b"Hello PDF".to_vec())),
2719 );
2720 resources.cache_object((4, 0), PdfObject::Name(PdfName::new("Type".to_string())));
2721
2722 let mut dict = PdfDictionary::new();
2723 dict.insert(
2724 "Key".to_string(),
2725 PdfObject::String(PdfString::new(b"Value".to_vec())),
2726 );
2727 resources.cache_object((5, 0), PdfObject::Dictionary(dict));
2728
2729 let array = vec![PdfObject::Integer(1), PdfObject::Integer(2)];
2730 resources.cache_object((6, 0), PdfObject::Array(PdfArray(array)));
2731
2732 // Verify all cached correctly
2733 assert_eq!(
2734 resources.get_cached((1, 0)).unwrap(),
2735 PdfObject::Boolean(true)
2736 );
2737 assert_eq!(
2738 resources.get_cached((2, 0)).unwrap(),
2739 PdfObject::Real(3.14159)
2740 );
2741 assert_eq!(
2742 resources.get_cached((3, 0)).unwrap(),
2743 PdfObject::String(PdfString::new(b"Hello PDF".to_vec()))
2744 );
2745 assert_eq!(
2746 resources.get_cached((4, 0)).unwrap(),
2747 PdfObject::Name(PdfName::new("Type".to_string()))
2748 );
2749 assert!(matches!(
2750 resources.get_cached((5, 0)).unwrap(),
2751 PdfObject::Dictionary(_)
2752 ));
2753 assert!(matches!(
2754 resources.get_cached((6, 0)).unwrap(),
2755 PdfObject::Array(_)
2756 ));
2757 }
2758
2759 // Tests for PdfDocument removed due to API incompatibilities
2760 // The methods tested don't exist in the current implementation
2761
2762 /*
2763 #[test]
2764 fn test_pdf_document_new_initialization() {
2765 // Create a minimal PDF for testing
2766 let data = b"%PDF-1.4
2767 1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2768 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2769 3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2770 xref
2771 0 4
2772 0000000000 65535 f
2773 0000000009 00000 n
2774 0000000052 00000 n
2775 0000000101 00000 n
2776 trailer<</Size 4/Root 1 0 R>>
2777 startxref
2778 164
2779 %%EOF";
2780 let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2781 let document = PdfDocument::new(reader);
2782
2783 // Document should be created successfully
2784 // Initially no page tree loaded
2785 assert!(document.page_tree.borrow().is_none());
2786 assert!(document.metadata_cache.borrow().is_none());
2787 }
2788
2789 #[test]
2790 fn test_pdf_document_version() {
2791 // Create a minimal PDF for testing
2792 let data = b"%PDF-1.4
2793 1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2794 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2795 3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2796 xref
2797 0 4
2798 0000000000 65535 f
2799 0000000009 00000 n
2800 0000000052 00000 n
2801 0000000101 00000 n
2802 trailer<</Size 4/Root 1 0 R>>
2803 startxref
2804 164
2805 %%EOF";
2806 let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2807 let document = PdfDocument::new(reader);
2808
2809 let version = document.version().unwrap();
2810 assert!(!version.is_empty());
2811 // Most PDFs are version 1.4 to 1.7
2812 assert!(version.starts_with("1.") || version.starts_with("2."));
2813 }
2814
2815 #[test]
2816 fn test_pdf_document_page_count() {
2817 // Create a minimal PDF for testing
2818 let data = b"%PDF-1.4
2819 1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2820 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2821 3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2822 xref
2823 0 4
2824 0000000000 65535 f
2825 0000000009 00000 n
2826 0000000052 00000 n
2827 0000000101 00000 n
2828 trailer<</Size 4/Root 1 0 R>>
2829 startxref
2830 164
2831 %%EOF";
2832 let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2833 let document = PdfDocument::new(reader);
2834
2835 let count = document.page_count().unwrap();
2836 assert!(count > 0);
2837 }
2838
2839 #[test]
2840 fn test_pdf_document_metadata() {
2841 // Create a minimal PDF for testing
2842 let data = b"%PDF-1.4
2843 1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2844 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2845 3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2846 xref
2847 0 4
2848 0000000000 65535 f
2849 0000000009 00000 n
2850 0000000052 00000 n
2851 0000000101 00000 n
2852 trailer<</Size 4/Root 1 0 R>>
2853 startxref
2854 164
2855 %%EOF";
2856 let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2857 let document = PdfDocument::new(reader);
2858
2859 let metadata = document.metadata().unwrap();
2860 // Metadata should be cached after first access
2861 assert!(document.metadata_cache.borrow().is_some());
2862
2863 // Second call should use cache
2864 let metadata2 = document.metadata().unwrap();
2865 assert_eq!(metadata.title, metadata2.title);
2866 }
2867
#[test]
fn test_pdf_document_get_page() {
    // Single-page fixture with a 612x792 MediaBox. Keep the literal's bytes
    // unchanged: the xref table records offsets 9, 52, 101 and startxref 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // Page index 0 must resolve and report positive dimensions.
    let first_page = document.get_page(0).unwrap();
    assert!(first_page.width() > 0.0);
    assert!(first_page.height() > 0.0);

    // Fetching a page is expected to leave the page tree populated.
    assert!(document.page_tree.borrow().is_some());
}
2896
#[test]
fn test_pdf_document_get_page_out_of_bounds() {
    // Single-page fixture. Keep the literal's bytes unchanged: the xref table
    // records object offsets 9, 52 and 101, and startxref points at 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // An index ten past the reported page count must be rejected with an error.
    let past_the_end = document.page_count().unwrap() + 10;
    assert!(document.get_page(past_the_end).is_err());
}
2923
2924
#[test]
fn test_pdf_document_get_object() {
    // Single-page fixture whose object 1 0 is the catalog. Keep the literal's
    // bytes unchanged: the xref table records offsets 9, 52, 101 and startxref 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // Resolving object 1 0 must succeed.
    let catalog_lookup = document.get_object(1, 0);
    assert!(catalog_lookup.is_ok());

    // The resolved object is expected to be present in the resource cache afterwards.
    let cached_entry = document.resources.get_cached((1, 0));
    assert!(cached_entry.is_some());
}
2952
2953
2954
#[test]
fn test_pdf_document_extract_text_from_page() {
    // Single-page fixture. Keep the literal's bytes unchanged: the xref table
    // records object offsets 9, 52 and 101, and startxref points at 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // The fixture page declares no content; extraction from page 0 must still
    // complete without an error.
    assert!(document.extract_text_from_page(0).is_ok());
}
2980
#[test]
fn test_pdf_document_extract_all_text() {
    // Single-page fixture. Keep the literal's bytes unchanged: the xref table
    // records object offsets 9, 52 and 101, and startxref points at 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // Run the whole-document extraction first, then ask for the page count.
    let per_page_text = document.extract_text().unwrap();
    let total_pages = document.page_count().unwrap();

    // Extraction must yield exactly one entry per page.
    // NOTE(review): this compares a `len()` (usize) directly with the value from
    // `page_count()`; confirm the two share an integer type.
    assert_eq!(per_page_text.len(), total_pages);
}
3007
3008
#[test]
fn test_pdf_document_ensure_page_tree() {
    // Single-page fixture. Keep the literal's bytes unchanged: the xref table
    // records object offsets 9, 52 and 101, and startxref points at 164.
    let pdf_bytes = b"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
164
%%EOF";
    let cursor = std::io::Cursor::new(pdf_bytes.to_vec());
    let document = PdfDocument::new(PdfReader::new(cursor).unwrap());

    // A freshly constructed document is expected to have no page tree yet.
    assert!(document.page_tree.borrow().is_none());

    // The first ensure call must populate it.
    document.ensure_page_tree().unwrap();
    assert!(document.page_tree.borrow().is_some());

    // Calling ensure again on a loaded tree must also succeed.
    document.ensure_page_tree().unwrap();
}
3039
#[test]
fn test_resource_manager_concurrent_access() {
    let manager = ResourceManager::new();

    // Interleave stores and reads on one thread: a value read before a later
    // store must still be usable afterwards.
    manager.cache_object((1, 0), PdfObject::Integer(1));
    let first_read = manager.get_cached((1, 0));

    manager.cache_object((2, 0), PdfObject::Integer(2));
    let second_read = manager.get_cached((2, 0));

    // Each read must yield the value stored under its own key.
    assert_eq!(first_read.unwrap(), PdfObject::Integer(1));
    assert_eq!(second_read.unwrap(), PdfObject::Integer(2));
}
3055
#[test]
fn test_resource_manager_large_cache() {
    let manager = ResourceManager::new();

    // Fill the cache with 1000 entries keyed (n, 0), each holding n as an integer.
    for object_number in 0..1000 {
        manager.cache_object((object_number, 0), PdfObject::Integer(object_number as i64));
    }

    // Probe a middle, the last, and the first entry, in that order.
    for probe in [500, 999, 0] {
        assert_eq!(
            manager.get_cached((probe, 0)).unwrap(),
            PdfObject::Integer(probe as i64)
        );
    }

    // After clearing, a previously stored key must no longer resolve.
    manager.clear_cache();
    assert!(manager.get_cached((500, 0)).is_none());
}
3074 */
3075}