oxidize_pdf/parser/
document.rs

1//! PDF Document wrapper - High-level interface for PDF parsing and manipulation
2//!
3//! This module provides a robust, high-level interface for working with PDF documents.
4//! It solves Rust's borrow checker challenges through careful use of interior mutability
5//! (RefCell) and separation of concerns between parsing, caching, and page access.
6//!
7//! # Architecture
8//!
9//! The module uses a layered architecture:
10//! - **PdfDocument**: Main entry point with RefCell-based state management
11//! - **ResourceManager**: Centralized object caching with interior mutability
12//! - **PdfReader**: Low-level file access (wrapped in RefCell)
13//! - **PageTree**: Lazy-loaded page navigation
14//!
15//! # Key Features
16//!
17//! - **Automatic caching**: Objects are cached after first access
18//! - **Resource management**: Shared resources are handled efficiently
19//! - **Page navigation**: Fast access to any page in the document
20//! - **Reference resolution**: Automatic resolution of indirect references
21//! - **Text extraction**: Built-in support for extracting text from pages
22//!
23//! # Example
24//!
25//! ```rust,no_run
26//! use oxidize_pdf::parser::{PdfDocument, PdfReader};
27//!
28//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
29//! // Open a PDF document
30//! let reader = PdfReader::open("document.pdf")?;
31//! let document = PdfDocument::new(reader);
32//!
33//! // Get document information
34//! let page_count = document.page_count()?;
35//! let metadata = document.metadata()?;
36//! println!("Title: {:?}", metadata.title);
37//! println!("Pages: {}", page_count);
38//!
39//! // Access a specific page
40//! let page = document.get_page(0)?;
41//! println!("Page size: {}x{}", page.width(), page.height());
42//!
43//! // Extract text from all pages
44//! let extracted_text = document.extract_text()?;
45//! for (i, page_text) in extracted_text.iter().enumerate() {
46//!     println!("Page {}: {}", i + 1, page_text.text);
47//! }
48//! # Ok(())
49//! # }
50//! ```
51
52#[cfg(test)]
53use super::objects::{PdfArray, PdfName};
54use super::objects::{PdfDictionary, PdfObject};
55use super::page_tree::{PageTree, ParsedPage};
56use super::reader::PdfReader;
57use super::{ParseError, ParseOptions, ParseResult};
58use std::cell::RefCell;
59use std::collections::HashMap;
60use std::fs::File;
61use std::io::{Read, Seek};
62use std::path::Path;
63use std::rc::Rc;
64
65/// Resource manager for efficient PDF object caching.
66///
67/// The ResourceManager provides centralized caching of PDF objects to avoid
68/// repeated parsing and to share resources between different parts of the document.
69/// It uses RefCell for interior mutability, allowing multiple immutable references
70/// to the document while still being able to update the cache.
71///
72/// # Caching Strategy
73///
74/// - Objects are cached on first access
75/// - Cache persists for the lifetime of the document
76/// - Manual cache clearing is supported for memory management
77///
78/// # Example
79///
80/// ```rust,no_run
81/// use oxidize_pdf::parser::document::ResourceManager;
82///
83/// let resources = ResourceManager::new();
84///
85/// // Objects are cached automatically when accessed through PdfDocument
86/// // Manual cache management:
87/// resources.clear_cache(); // Free memory when needed
88/// ```
89pub struct ResourceManager {
90    /// Cached objects indexed by (object_number, generation_number)
91    object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
92}
93
94impl Default for ResourceManager {
95    fn default() -> Self {
96        Self::new()
97    }
98}
99
100impl ResourceManager {
101    /// Create a new resource manager
102    pub fn new() -> Self {
103        Self {
104            object_cache: RefCell::new(HashMap::new()),
105        }
106    }
107
108    /// Get an object from cache if available.
109    ///
110    /// # Arguments
111    ///
112    /// * `obj_ref` - Object reference (object_number, generation_number)
113    ///
114    /// # Returns
115    ///
116    /// Cloned object if cached, None otherwise.
117    ///
118    /// # Example
119    ///
120    /// ```rust,no_run
121    /// # use oxidize_pdf::parser::document::ResourceManager;
122    /// # let resources = ResourceManager::new();
123    /// if let Some(obj) = resources.get_cached((10, 0)) {
124    ///     println!("Object 10 0 R found in cache");
125    /// }
126    /// ```
127    pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
128        self.object_cache.borrow().get(&obj_ref).cloned()
129    }
130
131    /// Cache an object for future access.
132    ///
133    /// # Arguments
134    ///
135    /// * `obj_ref` - Object reference (object_number, generation_number)
136    /// * `obj` - The PDF object to cache
137    ///
138    /// # Example
139    ///
140    /// ```rust,no_run
141    /// # use oxidize_pdf::parser::document::ResourceManager;
142    /// # use oxidize_pdf::parser::objects::PdfObject;
143    /// # let resources = ResourceManager::new();
144    /// resources.cache_object((10, 0), PdfObject::Integer(42));
145    /// ```
146    pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
147        self.object_cache.borrow_mut().insert(obj_ref, obj);
148    }
149
150    /// Clear all cached objects to free memory.
151    ///
152    /// Use this when processing large documents to manage memory usage.
153    ///
154    /// # Example
155    ///
156    /// ```rust,no_run
157    /// # use oxidize_pdf::parser::document::ResourceManager;
158    /// # let resources = ResourceManager::new();
159    /// // After processing many pages
160    /// resources.clear_cache();
161    /// println!("Cache cleared to free memory");
162    /// ```
163    pub fn clear_cache(&self) {
164        self.object_cache.borrow_mut().clear();
165    }
166}
167
168/// High-level PDF document interface for parsing and manipulation.
169///
170/// `PdfDocument` provides a clean, safe API for working with PDF files.
171/// It handles the complexity of PDF structure, object references, and resource
172/// management behind a simple interface.
173///
174/// # Type Parameter
175///
176/// * `R` - The reader type (must implement Read + Seek)
177///
178/// # Architecture Benefits
179///
180/// - **RefCell Usage**: Allows multiple parts of the API to access the document
181/// - **Lazy Loading**: Pages and resources are loaded on demand
182/// - **Automatic Caching**: Frequently accessed objects are cached
183/// - **Safe API**: Borrow checker issues are handled internally
184///
185/// # Example
186///
187/// ```rust,no_run
188/// use oxidize_pdf::parser::{PdfDocument, PdfReader};
189/// use std::fs::File;
190///
191/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
192/// // From a file
193/// let reader = PdfReader::open("document.pdf")?;
194/// let document = PdfDocument::new(reader);
195///
196/// // From any Read + Seek source
197/// let file = File::open("document.pdf")?;
198/// let reader = PdfReader::new(file)?;
199/// let document = PdfDocument::new(reader);
200///
201/// // Use the document
202/// let page_count = document.page_count()?;
203/// for i in 0..page_count {
204///     let page = document.get_page(i)?;
205///     // Process page...
206/// }
207/// # Ok(())
208/// # }
209/// ```
210pub struct PdfDocument<R: Read + Seek> {
211    /// The underlying PDF reader wrapped for interior mutability
212    reader: RefCell<PdfReader<R>>,
213    /// Page tree navigator (lazily initialized)
214    page_tree: RefCell<Option<PageTree>>,
215    /// Shared resource manager for object caching
216    resources: Rc<ResourceManager>,
217    /// Cached document metadata to avoid repeated parsing
218    metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
219}
220
221impl<R: Read + Seek> PdfDocument<R> {
222    /// Create a new PDF document from a reader
223    pub fn new(reader: PdfReader<R>) -> Self {
224        Self {
225            reader: RefCell::new(reader),
226            page_tree: RefCell::new(None),
227            resources: Rc::new(ResourceManager::new()),
228            metadata_cache: RefCell::new(None),
229        }
230    }
231
232    /// Get the PDF version of the document.
233    ///
234    /// # Returns
235    ///
236    /// PDF version string (e.g., "1.4", "1.7", "2.0")
237    ///
238    /// # Example
239    ///
240    /// ```rust,no_run
241    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
242    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
243    /// # let reader = PdfReader::open("document.pdf")?;
244    /// # let document = PdfDocument::new(reader);
245    /// let version = document.version()?;
246    /// println!("PDF version: {}", version);
247    /// # Ok(())
248    /// # }
249    /// ```
250    pub fn version(&self) -> ParseResult<String> {
251        Ok(self.reader.borrow().version().to_string())
252    }
253
254    /// Get the parse options
255    pub fn options(&self) -> ParseOptions {
256        self.reader.borrow().options().clone()
257    }
258
259    /// Get the total number of pages in the document.
260    ///
261    /// # Returns
262    ///
263    /// The page count as an unsigned 32-bit integer.
264    ///
265    /// # Errors
266    ///
267    /// Returns an error if the page tree is malformed or missing.
268    ///
269    /// # Example
270    ///
271    /// ```rust,no_run
272    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
273    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
274    /// # let reader = PdfReader::open("document.pdf")?;
275    /// # let document = PdfDocument::new(reader);
276    /// let count = document.page_count()?;
277    /// println!("Document has {} pages", count);
278    ///
279    /// // Iterate through all pages
280    /// for i in 0..count {
281    ///     let page = document.get_page(i)?;
282    ///     // Process page...
283    /// }
284    /// # Ok(())
285    /// # }
286    /// ```
287    pub fn page_count(&self) -> ParseResult<u32> {
288        self.ensure_page_tree()?;
289        if let Some(pt) = self.page_tree.borrow().as_ref() {
290            Ok(pt.page_count())
291        } else {
292            // Fallback: should never reach here since ensure_page_tree() just ran
293            self.reader.borrow_mut().page_count()
294        }
295    }
296
297    /// Get document metadata including title, author, creation date, etc.
298    ///
299    /// Metadata is cached after first access for performance.
300    ///
301    /// # Returns
302    ///
303    /// A `DocumentMetadata` struct containing all available metadata fields.
304    ///
305    /// # Example
306    ///
307    /// ```rust,no_run
308    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
309    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
310    /// # let reader = PdfReader::open("document.pdf")?;
311    /// # let document = PdfDocument::new(reader);
312    /// let metadata = document.metadata()?;
313    ///
314    /// if let Some(title) = &metadata.title {
315    ///     println!("Title: {}", title);
316    /// }
317    /// if let Some(author) = &metadata.author {
318    ///     println!("Author: {}", author);
319    /// }
320    /// if let Some(creation_date) = &metadata.creation_date {
321    ///     println!("Created: {}", creation_date);
322    /// }
323    /// println!("PDF Version: {}", metadata.version);
324    /// # Ok(())
325    /// # }
326    /// ```
327    pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
328        // Check cache first
329        if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
330            return Ok(metadata.clone());
331        }
332
333        // Load metadata
334        let metadata = self.reader.borrow_mut().metadata()?;
335        self.metadata_cache.borrow_mut().replace(metadata.clone());
336        Ok(metadata)
337    }
338
339    /// Initialize the page tree if not already done.
340    ///
341    /// Builds a flat index of all leaf Page references by walking the tree once.
342    /// This provides O(1) page access and detects cycles and absurd /Count values.
343    fn ensure_page_tree(&self) -> ParseResult<()> {
344        if self.page_tree.borrow().is_none() {
345            let pages_dict = self.load_pages_dict()?;
346            let page_refs = {
347                let mut reader = self.reader.borrow_mut();
348                PageTree::flatten_page_tree(&mut *reader, &pages_dict)?
349            };
350            let page_tree = PageTree::new_with_flat_index(pages_dict, page_refs);
351            self.page_tree.borrow_mut().replace(page_tree);
352        }
353        Ok(())
354    }
355
356    /// Load the pages dictionary
357    fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
358        let mut reader = self.reader.borrow_mut();
359        let pages = reader.pages()?;
360        Ok(pages.clone())
361    }
362
363    /// Get a page by index (0-based).
364    ///
365    /// Pages are cached after first access. This method handles page tree
366    /// traversal and property inheritance automatically.
367    ///
368    /// # Arguments
369    ///
370    /// * `index` - Zero-based page index (0 to page_count-1)
371    ///
372    /// # Returns
373    ///
374    /// A complete `ParsedPage` with all properties and inherited resources.
375    ///
376    /// # Errors
377    ///
378    /// Returns an error if:
379    /// - Index is out of bounds
380    /// - Page tree is malformed
381    /// - Required page properties are missing
382    ///
383    /// # Example
384    ///
385    /// ```rust,no_run
386    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
387    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
388    /// # let reader = PdfReader::open("document.pdf")?;
389    /// # let document = PdfDocument::new(reader);
390    /// // Get the first page
391    /// let page = document.get_page(0)?;
392    ///
393    /// // Access page properties
394    /// println!("Page size: {}x{} points", page.width(), page.height());
395    /// println!("Rotation: {}°", page.rotation);
396    ///
397    /// // Get content streams
398    /// let streams = page.content_streams_with_document(&document)?;
399    /// println!("Page has {} content streams", streams.len());
400    /// # Ok(())
401    /// # }
402    /// ```
403    pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
404        self.ensure_page_tree()?;
405
406        // First check if page is already cached
407        if let Some(page_tree) = self.page_tree.borrow().as_ref() {
408            if let Some(page) = page_tree.get_cached_page(index) {
409                return Ok(page.clone());
410            }
411        }
412
413        // Try flat index O(1) lookup first
414        let (page_ref, has_flat_index) = {
415            let pt_borrow = self.page_tree.borrow();
416            let pt = pt_borrow.as_ref();
417            let ref_val = pt.and_then(|pt| pt.get_page_ref(index));
418            let has_index = pt.map_or(false, |pt| pt.page_count() > 0 || ref_val.is_some());
419            (ref_val, has_index)
420        };
421
422        let page = if let Some(page_ref) = page_ref {
423            self.load_page_by_ref(page_ref)?
424        } else if has_flat_index {
425            // Flat index exists but page not found — index is out of range
426            return Err(ParseError::SyntaxError {
427                position: 0,
428                message: format!(
429                    "Page index {} out of range (document has {} pages)",
430                    index,
431                    self.page_tree
432                        .borrow()
433                        .as_ref()
434                        .map_or(0, |pt| pt.page_count())
435                ),
436            });
437        } else {
438            // No flat index available — fallback to tree traversal
439            self.load_page_at_index(index)?
440        };
441
442        // Cache it
443        if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
444            page_tree.cache_page(index, page.clone());
445        }
446
447        Ok(page)
448    }
449
450    /// Load a specific page by index (legacy tree traversal fallback)
451    fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
452        // Get the pages root
453        let pages_dict = self.load_pages_dict()?;
454
455        // Navigate to the specific page
456        let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
457
458        Ok(page_info)
459    }
460
461    /// Load a page directly by its object reference (O(1) via flat index).
462    fn load_page_by_ref(&self, page_ref: (u32, u16)) -> ParseResult<ParsedPage> {
463        let obj = self.get_object(page_ref.0, page_ref.1)?;
464        let dict = obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
465            position: 0,
466            message: format!(
467                "Page object {} {} R is not a dictionary",
468                page_ref.0, page_ref.1
469            ),
470        })?;
471
472        let inherited = self.collect_inherited_attributes(dict);
473        self.create_parsed_page(page_ref, dict, Some(&inherited))
474    }
475
476    /// Walk up the /Parent chain to collect inheritable attributes (Resources, MediaBox, CropBox, Rotate).
477    /// Uses cycle detection to prevent infinite loops in malformed PDFs.
478    fn collect_inherited_attributes(&self, page_dict: &PdfDictionary) -> PdfDictionary {
479        let mut inherited = PdfDictionary::new();
480        let inheritable_keys = ["Resources", "MediaBox", "CropBox", "Rotate"];
481
482        // Collect from the page's own parent chain
483        let mut current_parent_ref = page_dict.get("Parent").and_then(|p| p.as_reference());
484        let mut visited: std::collections::HashSet<(u32, u16)> = std::collections::HashSet::new();
485
486        while let Some(parent_ref) = current_parent_ref {
487            if !visited.insert(parent_ref) {
488                break; // Cycle detected
489            }
490
491            match self.get_object(parent_ref.0, parent_ref.1) {
492                Ok(obj) => {
493                    if let Some(parent_dict) = obj.as_dict() {
494                        for key in &inheritable_keys {
495                            // Only inherit if the page itself doesn't have it
496                            // and we haven't already found it in a closer ancestor
497                            if !page_dict.contains_key(key) && !inherited.contains_key(key) {
498                                if let Some(val) = parent_dict.get(key) {
499                                    inherited.insert((*key).to_string(), val.clone());
500                                }
501                            }
502                        }
503                        current_parent_ref =
504                            parent_dict.get("Parent").and_then(|p| p.as_reference());
505                    } else {
506                        break;
507                    }
508                }
509                Err(_) => break,
510            }
511        }
512
513        inherited
514    }
515
516    /// Find a page in the page tree (iterative implementation for stack safety)
517    fn find_page_in_tree(
518        &self,
519        root_node: &PdfDictionary,
520        target_index: u32,
521        initial_current_index: u32,
522        initial_inherited: Option<&PdfDictionary>,
523    ) -> ParseResult<ParsedPage> {
524        // Work item for the traversal queue
525        #[derive(Debug)]
526        struct WorkItem {
527            node_dict: PdfDictionary,
528            node_ref: Option<(u32, u16)>,
529            current_index: u32,
530            inherited: Option<PdfDictionary>,
531        }
532
533        // Initialize work queue with root node
534        let mut work_queue = Vec::new();
535        work_queue.push(WorkItem {
536            node_dict: root_node.clone(),
537            node_ref: None,
538            current_index: initial_current_index,
539            inherited: initial_inherited.cloned(),
540        });
541
542        // Iterative traversal
543        while let Some(work_item) = work_queue.pop() {
544            let WorkItem {
545                node_dict,
546                node_ref,
547                current_index,
548                inherited,
549            } = work_item;
550
551            let node_type = node_dict
552                .get_type()
553                .or_else(|| {
554                    // If Type is missing, try to infer from content
555                    if node_dict.contains_key("Kids") && node_dict.contains_key("Count") {
556                        Some("Pages")
557                    } else if node_dict.contains_key("Contents")
558                        || node_dict.contains_key("MediaBox")
559                    {
560                        Some("Page")
561                    } else {
562                        None
563                    }
564                })
565                .or_else(|| {
566                    // If Type is missing, try to infer from structure
567                    if node_dict.contains_key("Kids") {
568                        Some("Pages")
569                    } else if node_dict.contains_key("Contents")
570                        || (node_dict.contains_key("MediaBox") && !node_dict.contains_key("Kids"))
571                    {
572                        Some("Page")
573                    } else {
574                        None
575                    }
576                })
577                .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
578
579            match node_type {
580                "Pages" => {
581                    // This is a page tree node
582                    let kids = node_dict
583                        .get("Kids")
584                        .and_then(|obj| obj.as_array())
585                        .or_else(|| {
586                            // If Kids is missing, use empty array
587                            tracing::debug!(
588                                "Warning: Missing Kids array in Pages node, using empty array"
589                            );
590                            Some(&super::objects::EMPTY_PDF_ARRAY)
591                        })
592                        .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
593
594                    // Merge inherited attributes
595                    let mut merged_inherited = inherited.unwrap_or_else(PdfDictionary::new);
596
597                    // Inheritable attributes
598                    for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
599                        if let Some(value) = node_dict.get(key) {
600                            if !merged_inherited.contains_key(key) {
601                                merged_inherited.insert(key.to_string(), value.clone());
602                            }
603                        }
604                    }
605
606                    // Process kids in reverse order (since we're using a stack/Vec::pop())
607                    // This ensures we process them in the correct order
608                    let mut current_idx = current_index;
609                    let mut pending_kids = Vec::new();
610
611                    for kid_ref in &kids.0 {
612                        let kid_ref =
613                            kid_ref
614                                .as_reference()
615                                .ok_or_else(|| ParseError::SyntaxError {
616                                    position: 0,
617                                    message: "Kids array must contain references".to_string(),
618                                })?;
619
620                        // Get the kid object
621                        let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
622                        let kid_dict = match kid_obj.as_dict() {
623                            Some(dict) => dict,
624                            None => {
625                                // Skip invalid page tree nodes in lenient mode
626                                tracing::debug!(
627                                    "Warning: Page tree node {} {} R is not a dictionary, skipping",
628                                    kid_ref.0,
629                                    kid_ref.1
630                                );
631                                current_idx += 1; // Count as processed but skip
632                                continue;
633                            }
634                        };
635
636                        let kid_type = kid_dict
637                            .get_type()
638                            .or_else(|| {
639                                // If Type is missing, try to infer from content
640                                if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
641                                    Some("Pages")
642                                } else if kid_dict.contains_key("Contents")
643                                    || kid_dict.contains_key("MediaBox")
644                                {
645                                    Some("Page")
646                                } else {
647                                    None
648                                }
649                            })
650                            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
651
652                        let count = if kid_type == "Pages" {
653                            kid_dict
654                                .get("Count")
655                                .and_then(|obj| obj.as_integer())
656                                .unwrap_or(1) // Fallback to 1 if Count is missing (defensive)
657                                as u32
658                        } else {
659                            1
660                        };
661
662                        if target_index < current_idx + count {
663                            // Found the right subtree/page
664                            if kid_type == "Page" {
665                                // This is the page we want
666                                return self.create_parsed_page(
667                                    kid_ref,
668                                    kid_dict,
669                                    Some(&merged_inherited),
670                                );
671                            } else {
672                                // Need to traverse this subtree - add to queue
673                                pending_kids.push(WorkItem {
674                                    node_dict: kid_dict.clone(),
675                                    node_ref: Some(kid_ref),
676                                    current_index: current_idx,
677                                    inherited: Some(merged_inherited.clone()),
678                                });
679                                break; // Found our target subtree, no need to continue
680                            }
681                        }
682
683                        current_idx += count;
684                    }
685
686                    // Add pending kids to work queue in reverse order for correct processing
687                    work_queue.extend(pending_kids.into_iter().rev());
688                }
689                "Page" => {
690                    // This is a page object
691                    if target_index != current_index {
692                        return Err(ParseError::SyntaxError {
693                            position: 0,
694                            message: "Page index mismatch".to_string(),
695                        });
696                    }
697
698                    // We need the reference for creating the parsed page
699                    if let Some(page_ref) = node_ref {
700                        return self.create_parsed_page(page_ref, &node_dict, inherited.as_ref());
701                    } else {
702                        return Err(ParseError::SyntaxError {
703                            position: 0,
704                            message: "Direct page object without reference".to_string(),
705                        });
706                    }
707                }
708                _ => {
709                    return Err(ParseError::SyntaxError {
710                        position: 0,
711                        message: format!("Invalid page tree node type: {node_type}"),
712                    });
713                }
714            }
715        }
716
717        // Try fallback: search for the page by direct object scanning
718        tracing::debug!(
719            "Warning: Page {} not found in tree, attempting direct lookup",
720            target_index
721        );
722
723        // Scan for Page objects directly (try first few hundred objects)
724        for obj_num in 1..500 {
725            if let Ok(obj) = self.reader.borrow_mut().get_object(obj_num, 0) {
726                if let Some(dict) = obj.as_dict() {
727                    if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
728                        if obj_type.0 == "Page" {
729                            // Found a page, check if it's the right index (approximate)
730                            return self.create_parsed_page((obj_num, 0), dict, None);
731                        }
732                    }
733                }
734            }
735        }
736
737        Err(ParseError::SyntaxError {
738            position: 0,
739            message: format!("Page {} not found in tree or document", target_index),
740        })
741    }
742
743    /// Create a ParsedPage from a page dictionary
744    fn create_parsed_page(
745        &self,
746        obj_ref: (u32, u16),
747        page_dict: &PdfDictionary,
748        inherited: Option<&PdfDictionary>,
749    ) -> ParseResult<ParsedPage> {
750        // Extract page attributes with fallback for missing MediaBox
751        let media_box = match self.get_rectangle(page_dict, inherited, "MediaBox")? {
752            Some(mb) => mb,
753            None => {
754                // Use default Letter size if MediaBox is missing
755                #[cfg(debug_assertions)]
756                tracing::debug!(
757                    "Warning: Page {} {} R missing MediaBox, using default Letter size",
758                    obj_ref.0,
759                    obj_ref.1
760                );
761                [0.0, 0.0, 612.0, 792.0]
762            }
763        };
764
765        let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
766
767        let rotation = self
768            .get_integer(page_dict, inherited, "Rotate")?
769            .unwrap_or(0) as i32;
770
771        // Resolve the effective /Resources into an owned dictionary so that
772        // `ParsedPage::get_resources()` always yields a dictionary, even when
773        // /Resources is given as an indirect reference (issue #286). The page's
774        // own /Resources takes precedence over inherited ones; when it is an
775        // inline dictionary `get_resources()` returns it directly from the page
776        // dict, so we only need a resolved fallback for the reference / inherited
777        // cases.
778        let inherited_resources = {
779            let own_is_inline_dict = page_dict
780                .get("Resources")
781                .map(|o| o.as_dict().is_some())
782                .unwrap_or(false);
783            if own_is_inline_dict {
784                None
785            } else {
786                page_dict
787                    .get("Resources")
788                    .or_else(|| inherited.and_then(|i| i.get("Resources")))
789                    .and_then(|r| self.resolve(r).ok())
790                    .and_then(|r| r.as_dict().cloned())
791            }
792        };
793
794        // Get annotations if present
795        let annotations = page_dict
796            .get("Annots")
797            .and_then(|obj| obj.as_array())
798            .cloned();
799
800        Ok(ParsedPage {
801            obj_ref,
802            dict: page_dict.clone(),
803            inherited_resources,
804            media_box,
805            crop_box,
806            rotation,
807            annotations,
808        })
809    }
810
811    /// Get a rectangle value
812    fn get_rectangle(
813        &self,
814        node: &PdfDictionary,
815        inherited: Option<&PdfDictionary>,
816        key: &str,
817    ) -> ParseResult<Option<[f64; 4]>> {
818        let array = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
819
820        if let Some(array) = array.and_then(|obj| obj.as_array()) {
821            if array.len() != 4 {
822                return Err(ParseError::SyntaxError {
823                    position: 0,
824                    message: format!("{key} must have 4 elements"),
825                });
826            }
827
828            // After length check, we know array has exactly 4 elements
829            // Safe to index directly without unwrap
830            let rect = [
831                array.0[0].as_real().unwrap_or(0.0),
832                array.0[1].as_real().unwrap_or(0.0),
833                array.0[2].as_real().unwrap_or(0.0),
834                array.0[3].as_real().unwrap_or(0.0),
835            ];
836
837            Ok(Some(rect))
838        } else {
839            Ok(None)
840        }
841    }
842
843    /// Get an integer value
844    fn get_integer(
845        &self,
846        node: &PdfDictionary,
847        inherited: Option<&PdfDictionary>,
848        key: &str,
849    ) -> ParseResult<Option<i64>> {
850        let value = node.get(key).or_else(|| inherited.and_then(|i| i.get(key)));
851
852        Ok(value.and_then(|obj| obj.as_integer()))
853    }
854
855    /// Get an object by its reference numbers.
856    ///
857    /// This method first checks the cache, then loads from the file if needed.
858    /// Objects are automatically cached after loading.
859    ///
860    /// # Arguments
861    ///
862    /// * `obj_num` - Object number
863    /// * `gen_num` - Generation number
864    ///
865    /// # Returns
866    ///
867    /// The resolved PDF object.
868    ///
869    /// # Errors
870    ///
871    /// Returns an error if:
872    /// - Object doesn't exist
873    /// - Object is part of an encrypted object stream
874    /// - File is corrupted
875    ///
876    /// # Example
877    ///
878    /// ```rust,no_run
879    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
880    /// # use oxidize_pdf::parser::objects::PdfObject;
881    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
882    /// # let reader = PdfReader::open("document.pdf")?;
883    /// # let document = PdfDocument::new(reader);
884    /// // Get object 10 0 R
885    /// let obj = document.get_object(10, 0)?;
886    ///
887    /// // Check object type
888    /// match obj {
889    ///     PdfObject::Dictionary(dict) => {
890    ///         println!("Object is a dictionary with {} entries", dict.0.len());
891    ///     }
892    ///     PdfObject::Stream(stream) => {
893    ///         println!("Object is a stream");
894    ///     }
895    ///     _ => {}
896    /// }
897    /// # Ok(())
898    /// # }
899    /// ```
900    pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
901        // Check resource cache first
902        if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
903            return Ok(obj);
904        }
905
906        // Load from reader
907        let obj = {
908            let mut reader = self.reader.borrow_mut();
909            reader.get_object(obj_num, gen_num)?.clone()
910        };
911
912        // Cache it
913        self.resources.cache_object((obj_num, gen_num), obj.clone());
914
915        Ok(obj)
916    }
917
918    /// Resolve a reference to get the actual object.
919    ///
920    /// If the input is a Reference, fetches the referenced object.
921    /// Otherwise returns a clone of the input object.
922    ///
923    /// # Arguments
924    ///
925    /// * `obj` - The object to resolve (may be a Reference or direct object)
926    ///
927    /// # Returns
928    ///
929    /// The resolved object (never a Reference).
930    ///
931    /// # Example
932    ///
933    /// ```rust,no_run
934    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
935    /// # use oxidize_pdf::parser::objects::PdfObject;
936    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
937    /// # let reader = PdfReader::open("document.pdf")?;
938    /// # let document = PdfDocument::new(reader);
939    /// # let page = document.get_page(0)?;
940    /// // Contents might be a reference or direct object
941    /// if let Some(contents) = page.dict.get("Contents") {
942    ///     let resolved = document.resolve(contents)?;
943    ///     match resolved {
944    ///         PdfObject::Stream(_) => println!("Single content stream"),
945    ///         PdfObject::Array(_) => println!("Multiple content streams"),
946    ///         _ => println!("Unexpected content type"),
947    ///     }
948    /// }
949    /// # Ok(())
950    /// # }
951    /// ```
952    pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
953        match obj {
954            PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
955            _ => Ok(obj.clone()),
956        }
957    }
958
959    /// Get content streams for a specific page.
960    ///
961    /// This method handles both single streams and arrays of streams,
962    /// automatically decompressing them according to their filters.
963    ///
964    /// # Arguments
965    ///
966    /// * `page` - The page to get content streams from
967    ///
968    /// # Returns
969    ///
970    /// Vector of decompressed content stream data ready for parsing.
971    ///
972    /// # Example
973    ///
974    /// ```rust,no_run
975    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
976    /// # use oxidize_pdf::parser::content::ContentParser;
977    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
978    /// # let reader = PdfReader::open("document.pdf")?;
979    /// # let document = PdfDocument::new(reader);
980    /// let page = document.get_page(0)?;
981    /// let streams = document.get_page_content_streams(&page)?;
982    ///
983    /// // Parse content streams
984    /// for stream_data in streams {
985    ///     let operations = ContentParser::parse(&stream_data)?;
986    ///     println!("Stream has {} operations", operations.len());
987    /// }
988    /// # Ok(())
989    /// # }
990    /// ```
991    /// Get page resources dictionary.
992    ///
993    /// This method returns the resources dictionary for a page, which may include
994    /// fonts, images (XObjects), patterns, color spaces, and other resources.
995    ///
996    /// # Arguments
997    ///
998    /// * `page` - The page to get resources from
999    ///
1000    /// # Returns
1001    ///
1002    /// Optional resources dictionary if the page has resources.
1003    ///
1004    /// # Example
1005    ///
1006    /// ```rust,no_run
1007    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader, PdfObject, PdfName};
1008    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1009    /// # let reader = PdfReader::open("document.pdf")?;
1010    /// # let document = PdfDocument::new(reader);
1011    /// let page = document.get_page(0)?;
1012    /// if let Some(resources) = document.get_page_resources(&page)? {
1013    ///     // Check for images (XObjects)
1014    ///     if let Some(PdfObject::Dictionary(xobjects)) = resources.0.get(&PdfName("XObject".to_string())) {
1015    ///         for (name, _) in xobjects.0.iter() {
1016    ///             println!("Found XObject: {}", name.0);
1017    ///         }
1018    ///     }
1019    /// }
1020    /// # Ok(())
1021    /// # }
1022    /// ```
1023    pub fn get_page_resources<'a>(
1024        &self,
1025        page: &'a ParsedPage,
1026    ) -> ParseResult<Option<&'a PdfDictionary>> {
1027        Ok(page.get_resources())
1028    }
1029
1030    pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
1031        let mut streams = Vec::new();
1032        let options = self.options();
1033
1034        if let Some(contents) = page.dict.get("Contents") {
1035            let resolved_contents = self.resolve(contents)?;
1036
1037            match &resolved_contents {
1038                PdfObject::Stream(stream) => {
1039                    streams.push(stream.decode(&options)?);
1040                }
1041                PdfObject::Array(array) => {
1042                    for item in &array.0 {
1043                        let resolved = self.resolve(item)?;
1044                        if let PdfObject::Stream(stream) = resolved {
1045                            streams.push(stream.decode(&options)?);
1046                        }
1047                    }
1048                }
1049                _ => {
1050                    return Err(ParseError::SyntaxError {
1051                        position: 0,
1052                        message: "Contents must be a stream or array of streams".to_string(),
1053                    })
1054                }
1055            }
1056        }
1057
1058        Ok(streams)
1059    }
1060
1061    /// Extract text from all pages in the document.
1062    ///
1063    /// Uses the default text extraction settings. For custom settings,
1064    /// use `extract_text_with_options`.
1065    ///
1066    /// # Returns
1067    ///
1068    /// A vector of `ExtractedText`, one for each page in the document.
1069    ///
1070    /// # Example
1071    ///
1072    /// ```rust,no_run
1073    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1074    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1075    /// # let reader = PdfReader::open("document.pdf")?;
1076    /// # let document = PdfDocument::new(reader);
1077    /// let extracted_pages = document.extract_text()?;
1078    ///
1079    /// for (page_num, page_text) in extracted_pages.iter().enumerate() {
1080    ///     println!("=== Page {} ===", page_num + 1);
1081    ///     println!("{}", page_text.text);
1082    ///     println!();
1083    /// }
1084    /// # Ok(())
1085    /// # }
1086    /// ```
1087    pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
1088        let mut extractor = crate::text::TextExtractor::new();
1089        extractor.extract_from_document(self)
1090    }
1091
1092    /// Extract text from a specific page.
1093    ///
1094    /// # Arguments
1095    ///
1096    /// * `page_index` - Zero-based page index
1097    ///
1098    /// # Returns
1099    ///
1100    /// Extracted text with optional position information.
1101    ///
1102    /// # Example
1103    ///
1104    /// ```rust,no_run
1105    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1106    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1107    /// # let reader = PdfReader::open("document.pdf")?;
1108    /// # let document = PdfDocument::new(reader);
1109    /// // Extract text from first page only
1110    /// let page_text = document.extract_text_from_page(0)?;
1111    /// println!("First page text: {}", page_text.text);
1112    ///
1113    /// // Access text fragments with positions (if preserved)
1114    /// for fragment in &page_text.fragments {
1115    ///     println!("'{}' at ({}, {})", fragment.text, fragment.x, fragment.y);
1116    /// }
1117    /// # Ok(())
1118    /// # }
1119    /// ```
1120    pub fn extract_text_from_page(
1121        &self,
1122        page_index: u32,
1123    ) -> ParseResult<crate::text::ExtractedText> {
1124        let mut extractor = crate::text::TextExtractor::new();
1125        extractor.extract_from_page(self, page_index)
1126    }
1127
1128    /// Extract text from a specific page with custom options.
1129    ///
1130    /// This method combines the functionality of [`extract_text_from_page`] and
1131    /// [`extract_text_with_options`], allowing fine control over extraction
1132    /// behavior for a single page.
1133    ///
1134    /// # Arguments
1135    ///
1136    /// * `page_index` - Zero-based page index
1137    /// * `options` - Text extraction configuration
1138    ///
1139    /// # Returns
1140    ///
1141    /// Extracted text with optional position information.
1142    ///
1143    /// # Example
1144    ///
1145    /// ```rust,no_run
1146    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1147    /// # use oxidize_pdf::text::ExtractionOptions;
1148    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1149    /// # let reader = PdfReader::open("document.pdf")?;
1150    /// # let document = PdfDocument::new(reader);
1151    /// // Use higher space threshold for PDFs with micro-adjustments
1152    /// let options = ExtractionOptions {
1153    ///     space_threshold: 0.4,
1154    ///     ..Default::default()
1155    /// };
1156    ///
1157    /// let page_text = document.extract_text_from_page_with_options(0, options)?;
1158    /// println!("Text: {}", page_text.text);
1159    /// # Ok(())
1160    /// # }
1161    /// ```
1162    pub fn extract_text_from_page_with_options(
1163        &self,
1164        page_index: u32,
1165        options: crate::text::ExtractionOptions,
1166    ) -> ParseResult<crate::text::ExtractedText> {
1167        let mut extractor = crate::text::TextExtractor::with_options(options);
1168        extractor.extract_from_page(self, page_index)
1169    }
1170
1171    /// Extract text with custom extraction options.
1172    ///
1173    /// Allows fine control over text extraction behavior including
1174    /// layout preservation, spacing thresholds, and more.
1175    ///
1176    /// # Arguments
1177    ///
1178    /// * `options` - Text extraction configuration
1179    ///
1180    /// # Returns
1181    ///
1182    /// A vector of `ExtractedText`, one for each page.
1183    ///
1184    /// # Example
1185    ///
1186    /// ```rust,no_run
1187    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1188    /// # use oxidize_pdf::text::ExtractionOptions;
1189    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1190    /// # let reader = PdfReader::open("document.pdf")?;
1191    /// # let document = PdfDocument::new(reader);
1192    /// // Configure extraction to preserve layout
1193    /// let options = ExtractionOptions {
1194    ///     preserve_layout: true,
1195    ///     space_threshold: 0.3,
1196    ///     newline_threshold: 10.0,
1197    ///     ..Default::default()
1198    /// };
1199    ///
1200    /// let extracted_pages = document.extract_text_with_options(options)?;
1201    ///
1202    /// // Text fragments will include position information
1203    /// for page_text in extracted_pages {
1204    ///     for fragment in &page_text.fragments {
1205    ///         println!("{:?}", fragment);
1206    ///     }
1207    /// }
1208    /// # Ok(())
1209    /// # }
1210    /// ```
1211    pub fn extract_text_with_options(
1212        &self,
1213        options: crate::text::ExtractionOptions,
1214    ) -> ParseResult<Vec<crate::text::ExtractedText>> {
1215        let mut extractor = crate::text::TextExtractor::with_options(options);
1216        extractor.extract_from_document(self)
1217    }
1218
1219    /// Get annotations from a specific page.
1220    ///
1221    /// Returns a vector of annotation dictionaries for the specified page.
1222    /// Each annotation dictionary contains properties like Type, Rect, Contents, etc.
1223    ///
1224    /// # Arguments
1225    ///
1226    /// * `page_index` - Zero-based page index
1227    ///
1228    /// # Returns
1229    ///
1230    /// A vector of PdfDictionary objects representing annotations, or an empty vector
1231    /// if the page has no annotations.
1232    ///
1233    /// # Example
1234    ///
1235    /// ```rust,no_run
1236    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1237    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1238    /// # let reader = PdfReader::open("document.pdf")?;
1239    /// # let document = PdfDocument::new(reader);
1240    /// let annotations = document.get_page_annotations(0)?;
1241    /// for annot in &annotations {
1242    ///     if let Some(contents) = annot.get("Contents").and_then(|c| c.as_string()) {
1243    ///         println!("Annotation: {:?}", contents);
1244    ///     }
1245    /// }
1246    /// # Ok(())
1247    /// # }
1248    /// ```
1249    pub fn get_page_annotations(&self, page_index: u32) -> ParseResult<Vec<PdfDictionary>> {
1250        let page = self.get_page(page_index)?;
1251
1252        if let Some(annots_array) = page.get_annotations() {
1253            let mut annotations = Vec::new();
1254            let mut reader = self.reader.borrow_mut();
1255
1256            for annot_ref in &annots_array.0 {
1257                if let Some(ref_nums) = annot_ref.as_reference() {
1258                    match reader.get_object(ref_nums.0, ref_nums.1) {
1259                        Ok(obj) => {
1260                            if let Some(dict) = obj.as_dict() {
1261                                annotations.push(dict.clone());
1262                            }
1263                        }
1264                        Err(_) => {
1265                            // Skip annotations that can't be loaded
1266                            continue;
1267                        }
1268                    }
1269                }
1270            }
1271
1272            Ok(annotations)
1273        } else {
1274            Ok(Vec::new())
1275        }
1276    }
1277
1278    /// Get all annotations from all pages in the document.
1279    ///
1280    /// Returns a vector of tuples containing (page_index, annotations) for each page
1281    /// that has annotations.
1282    ///
1283    /// # Returns
1284    ///
1285    /// A vector of tuples where the first element is the page index and the second
1286    /// is a vector of annotation dictionaries for that page.
1287    ///
1288    /// # Example
1289    ///
1290    /// ```rust,no_run
1291    /// # use oxidize_pdf::parser::{PdfDocument, PdfReader};
1292    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
1293    /// # let reader = PdfReader::open("document.pdf")?;
1294    /// # let document = PdfDocument::new(reader);
1295    /// let all_annotations = document.get_all_annotations()?;
1296    /// for (page_idx, annotations) in all_annotations {
1297    ///     println!("Page {} has {} annotations", page_idx, annotations.len());
1298    /// }
1299    /// # Ok(())
1300    /// # }
1301    /// ```
1302    pub fn get_all_annotations(&self) -> ParseResult<Vec<(u32, Vec<PdfDictionary>)>> {
1303        let page_count = self.page_count()?;
1304        let mut all_annotations = Vec::new();
1305
1306        for i in 0..page_count {
1307            let annotations = self.get_page_annotations(i)?;
1308            if !annotations.is_empty() {
1309                all_annotations.push((i, annotations));
1310            }
1311        }
1312
1313        Ok(all_annotations)
1314    }
1315
1316    // --- VibeCoding Facade Methods ---
1317
1318    /// Export the document to LLM-optimized Markdown format.
1319    ///
1320    /// Delegates to [`crate::ai::export_to_markdown`]. Includes YAML frontmatter
1321    /// with document metadata followed by extracted text content.
1322    #[allow(deprecated)]
1323    pub fn to_markdown(&self) -> crate::error::Result<String> {
1324        crate::ai::export_to_markdown(self)
1325    }
1326
1327    /// Export the document to element-aware Markdown format.
1328    ///
1329    /// Unlike [`to_markdown`](Self::to_markdown), this method classifies elements
1330    /// by type and maps each to its canonical Markdown representation.
1331    pub fn to_element_markdown(&self) -> ParseResult<String> {
1332        let elements = self.partition()?;
1333        let exporter = crate::pipeline::export::ElementMarkdownExporter::default();
1334        Ok(exporter.export(&elements))
1335    }
1336
1337    /// Export the document to a contextual text format for LLM consumption.
1338    ///
1339    /// Delegates to [`crate::ai::export_to_contextual`].
1340    #[allow(deprecated)]
1341    pub fn to_contextual(&self) -> crate::error::Result<String> {
1342        crate::ai::export_to_contextual(self)
1343    }
1344
1345    /// Export the document to structured JSON format.
1346    ///
1347    /// Requires the `semantic` feature. Delegates to [`crate::ai::export_to_json`].
1348    #[cfg(feature = "semantic")]
1349    #[allow(deprecated)]
1350    pub fn to_json(&self) -> crate::error::Result<String> {
1351        crate::ai::export_to_json(self)
1352    }
1353
1354    /// Extract and chunk the document into RAG-ready chunks with full metadata.
1355    ///
1356    /// Uses default [`HybridChunkConfig`](crate::pipeline::HybridChunkConfig)
1357    /// (512 tokens, `AnyInlineContent` merge policy). Returns serializable
1358    /// [`RagChunk`](crate::pipeline::RagChunk)s with page numbers, bounding boxes,
1359    /// element types, and heading context — everything a vector store needs.
1360    ///
1361    /// # Example
1362    ///
1363    /// ```rust,no_run
1364    /// use oxidize_pdf::parser::{PdfDocument, PdfReader};
1365    ///
1366    /// let doc = PdfDocument::open("document.pdf")?;
1367    /// let chunks = doc.rag_chunks()?;
1368    /// for chunk in &chunks {
1369    ///     println!("Chunk {}: pages {:?}, ~{} tokens",
1370    ///         chunk.chunk_index, chunk.page_numbers, chunk.token_estimate);
1371    /// }
1372    /// # Ok::<(), Box<dyn std::error::Error>>(())
1373    /// ```
1374    pub fn rag_chunks(&self) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1375        self.rag_chunks_with(crate::pipeline::HybridChunkConfig::default())
1376    }
1377
1378    /// Extract and chunk the document with a custom chunking configuration.
1379    ///
1380    /// Use this when the default 512-token limit is too large or too small for your
1381    /// vector store or embedding model. All other metadata (pages, bounding boxes,
1382    /// element types, heading context) is identical to [`rag_chunks()`](Self::rag_chunks).
1383    ///
1384    /// # Example
1385    ///
1386    /// ```rust,no_run
1387    /// use oxidize_pdf::parser::{PdfDocument, PdfReader};
1388    /// use oxidize_pdf::pipeline::HybridChunkConfig;
1389    ///
1390    /// let doc = PdfDocument::open("document.pdf")?;
1391    /// let config = HybridChunkConfig {
1392    ///     max_tokens: 256,
1393    ///     ..HybridChunkConfig::default()
1394    /// };
1395    /// let chunks = doc.rag_chunks_with(config)?;
1396    /// println!("Got {} chunks at 256-token limit", chunks.len());
1397    /// # Ok::<(), Box<dyn std::error::Error>>(())
1398    /// ```
1399    pub fn rag_chunks_with(
1400        &self,
1401        config: crate::pipeline::HybridChunkConfig,
1402    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1403        let elements = self.partition()?;
1404        let chunker = crate::pipeline::HybridChunker::new(config);
1405        let hybrid_chunks = chunker.chunk(&elements);
1406        Ok(self.build_rag_chunks(&hybrid_chunks, None))
1407    }
1408
1409    /// Build RAG chunks stamped with source-document metadata.
1410    ///
1411    /// Auto-fills `title`/`author`/`creation_date`/`total_pages` from the info
1412    /// dictionary (only where the caller left them `None`); the caller-supplied
1413    /// `source` provides `filename`/`doc_hash` (and may override any auto-filled
1414    /// field). `doc_hash`, when set, becomes the stable prefix of every
1415    /// `chunk_id`. Same chunking pipeline as [`rag_chunks`](Self::rag_chunks).
1416    ///
1417    /// # Example
1418    ///
1419    /// ```rust,no_run
1420    /// use oxidize_pdf::parser::PdfDocument;
1421    /// use oxidize_pdf::pipeline::DocumentSource;
1422    ///
1423    /// let doc = PdfDocument::open("document.pdf")?;
1424    /// let mut source = DocumentSource::default();
1425    /// source.filename = Some("document.pdf".to_string());
1426    /// source.doc_hash = Some("sha256-prefix".to_string());
1427    /// let chunks = doc.rag_chunks_with_source(source)?;
1428    /// # Ok::<(), Box<dyn std::error::Error>>(())
1429    /// ```
1430    pub fn rag_chunks_with_source(
1431        &self,
1432        source: crate::pipeline::DocumentSource,
1433    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1434        self.rag_chunks_with_source_and_config(
1435            source,
1436            crate::pipeline::HybridChunkConfig::default(),
1437        )
1438    }
1439
1440    /// Like [`rag_chunks_with_source`](Self::rag_chunks_with_source) but with a
1441    /// custom chunking configuration — for callers that need both
1442    /// source-document stamping and a non-default token budget.
1443    ///
1444    /// # Example
1445    ///
1446    /// ```rust,no_run
1447    /// use oxidize_pdf::parser::PdfDocument;
1448    /// use oxidize_pdf::pipeline::{DocumentSource, HybridChunkConfig};
1449    ///
1450    /// let doc = PdfDocument::open("document.pdf")?;
1451    /// let source = DocumentSource::with_file(Some("document.pdf".into()), None);
1452    /// let config = HybridChunkConfig { max_tokens: 256, ..Default::default() };
1453    /// let chunks = doc.rag_chunks_with_source_and_config(source, config)?;
1454    /// # Ok::<(), Box<dyn std::error::Error>>(())
1455    /// ```
1456    pub fn rag_chunks_with_source_and_config(
1457        &self,
1458        mut source: crate::pipeline::DocumentSource,
1459        config: crate::pipeline::HybridChunkConfig,
1460    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1461        self.autofill_source(&mut source);
1462        let elements = self.partition()?;
1463        let chunker = crate::pipeline::HybridChunker::new(config);
1464        let hybrid_chunks = chunker.chunk(&elements);
1465        Ok(self.build_rag_chunks(&hybrid_chunks, Some(source)))
1466    }
1467
1468    /// Fill `title`/`author`/`creation_date`/`total_pages` from the info
1469    /// dictionary where the caller left them `None`.
1470    fn autofill_source(&self, source: &mut crate::pipeline::DocumentSource) {
1471        if let Ok(meta) = self.metadata() {
1472            source.title = source.title.take().or(meta.title);
1473            source.author = source.author.take().or(meta.author);
1474            source.creation_date = source.creation_date.take().or(meta.creation_date);
1475            source.total_pages = source.total_pages.or(meta.page_count);
1476        }
1477        if source.total_pages.is_none() {
1478            source.total_pages = self.page_count().ok();
1479        }
1480    }
1481
1482    /// Run a custom [`AnalysisPipeline`](crate::pipeline::AnalysisPipeline):
1483    /// partition, optionally classify elements, apply the pipeline's chunking
1484    /// strategy, build linked `RagChunk`s (ids, prev/next, metadata, optional
1485    /// source) exactly as the other `rag_chunks*` entry points do, then run any
1486    /// enrichers over each chunk's `extra` bag.
1487    ///
1488    /// `AnalysisPipeline::new()` reproduces [`rag_chunks`](Self::rag_chunks).
1489    ///
1490    /// **Stability:** requires `unstable-spi`; exempt from semver until promoted.
1491    ///
1492    /// # Example
1493    ///
1494    /// ```rust,no_run
1495    /// # use oxidize_pdf::parser::PdfDocument;
1496    /// # use oxidize_pdf::pipeline::AnalysisPipeline;
1497    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
1498    /// let doc = PdfDocument::open("document.pdf")?;
1499    /// // Default pipeline == rag_chunks(); swap in a custom strategy/classifier/
1500    /// // enricher via the builder to extend it.
1501    /// let chunks = doc.rag_chunks_with_pipeline(&AnalysisPipeline::new())?;
1502    /// println!("{} chunks", chunks.len());
1503    /// # Ok(())
1504    /// # }
1505    /// ```
1506    #[cfg(feature = "unstable-spi")]
1507    pub fn rag_chunks_with_pipeline(
1508        &self,
1509        pipeline: &crate::pipeline::AnalysisPipeline,
1510    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1511        let mut source = pipeline.source.clone();
1512        if let Some(src) = source.as_mut() {
1513            self.autofill_source(src);
1514        }
1515        let mut elements = self.partition()?;
1516        if let Some(classifier) = pipeline.classifier.as_deref() {
1517            // Two passes: read labels against an immutable slice, then apply —
1518            // the classifier inspects neighbours via `ClassifyContext`, so it
1519            // cannot run while the slice is being mutated.
1520            let labels: Vec<Option<crate::pipeline::ClassLabel>> = (0..elements.len())
1521                .map(|index| {
1522                    let ctx = crate::pipeline::ClassifyContext {
1523                        elements: &elements,
1524                        index,
1525                    };
1526                    classifier.classify(&elements[index], &ctx)
1527                })
1528                .collect();
1529            for (element, label) in elements.iter_mut().zip(labels) {
1530                if let Some(label) = label {
1531                    element.metadata_mut().class_label = Some(label.0.into_owned());
1532                }
1533            }
1534        }
1535        let groups = pipeline.chunking.chunk(&elements);
1536        let hybrid: Vec<crate::pipeline::HybridChunk> = groups
1537            .into_iter()
1538            .map(|g| crate::pipeline::HybridChunk::from_group(g, pipeline.max_tokens))
1539            .collect();
1540        // `mut` is needed only for the enricher pass below (gated `semantic`);
1541        // without that feature the binding is never mutated — silence the warning.
1542        #[allow(unused_mut)]
1543        let mut chunks = self.build_rag_chunks(&hybrid, source);
1544        #[cfg(feature = "semantic")]
1545        if !pipeline.enrichers.is_empty() {
1546            // Enrich each chunk's `extra` bag. The hybrid chunk (kept alongside)
1547            // supplies the source elements; text/heading_path are snapshotted to
1548            // release the immutable borrow before mutating `metadata`.
1549            for (chunk, hc) in chunks.iter_mut().zip(hybrid.iter()) {
1550                let text = chunk.text.clone();
1551                let heading_path = chunk.metadata.heading_path.clone();
1552                let ctx = crate::pipeline::EnrichContext {
1553                    text: &text,
1554                    elements: hc.elements(),
1555                    heading_path: &heading_path,
1556                };
1557                for enricher in &pipeline.enrichers {
1558                    enricher.enrich(&ctx, &mut chunk.metadata);
1559                }
1560            }
1561        }
1562        Ok(chunks)
1563    }
1564
1565    /// Build linked [`RagChunk`]s from hybrid chunks, optionally stamping a
1566    /// [`DocumentSource`](crate::pipeline::DocumentSource), then wiring
1567    /// prev/next ids. Shared by all `rag_chunks*` entry points (DRY).
1568    fn build_rag_chunks(
1569        &self,
1570        hybrid_chunks: &[crate::pipeline::HybridChunk],
1571        source: Option<crate::pipeline::DocumentSource>,
1572    ) -> Vec<crate::pipeline::RagChunk> {
1573        let mut chunks: Vec<crate::pipeline::RagChunk> = match &source {
1574            Some(s) => hybrid_chunks
1575                .iter()
1576                .enumerate()
1577                .map(|(i, hc)| crate::pipeline::RagChunk::from_hybrid_chunk_with_source(i, hc, s))
1578                .collect(),
1579            None => hybrid_chunks
1580                .iter()
1581                .enumerate()
1582                .map(|(i, hc)| crate::pipeline::RagChunk::from_hybrid_chunk(i, hc))
1583                .collect(),
1584        };
1585        crate::pipeline::chunk_metadata::link_chunks(&mut chunks);
1586        chunks
1587    }
1588
1589    /// Extract and chunk the document using a pre-configured extraction profile.
1590    ///
1591    /// Combines [`partition_with_profile`](Self::partition_with_profile) with
1592    /// [`HybridChunker`](crate::pipeline::HybridChunker) using default chunking
1593    /// settings. Use [`rag_chunks_with`](Self::rag_chunks_with) when you need
1594    /// to tune `max_tokens` or `overlap_tokens`.
1595    ///
1596    /// # Example
1597    ///
1598    /// ```rust,no_run
1599    /// use oxidize_pdf::parser::PdfDocument;
1600    /// use oxidize_pdf::pipeline::ExtractionProfile;
1601    ///
1602    /// let doc = PdfDocument::open("document.pdf")?;
1603    /// let chunks = doc.rag_chunks_with_profile(ExtractionProfile::Rag)?;
1604    /// println!("Got {} RAG chunks", chunks.len());
1605    /// # Ok::<(), Box<dyn std::error::Error>>(())
1606    /// ```
1607    pub fn rag_chunks_with_profile(
1608        &self,
1609        profile: crate::pipeline::ExtractionProfile,
1610    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1611        let elements = self.partition_with_profile(profile)?;
1612        let chunker = crate::pipeline::HybridChunker::default();
1613        let hybrid_chunks = chunker.chunk(&elements);
1614        Ok(self.build_rag_chunks(&hybrid_chunks, None))
1615    }
1616
1617    /// Combine a pre-configured extraction profile with a custom chunking config.
1618    ///
1619    /// Use this when you need both profile-tuned partitioning (e.g. `Rag` with
1620    /// XYCut reading order) and a non-default chunk size.
1621    ///
1622    /// # Example
1623    ///
1624    /// ```rust,no_run
1625    /// use oxidize_pdf::parser::PdfDocument;
1626    /// use oxidize_pdf::pipeline::{ExtractionProfile, HybridChunkConfig};
1627    ///
1628    /// let doc = PdfDocument::open("document.pdf")?;
1629    /// let config = HybridChunkConfig { max_tokens: 256, ..Default::default() };
1630    /// let chunks = doc.rag_chunks_with_profile_config(ExtractionProfile::Rag, config)?;
1631    /// # Ok::<(), Box<dyn std::error::Error>>(())
1632    /// ```
1633    pub fn rag_chunks_with_profile_config(
1634        &self,
1635        profile: crate::pipeline::ExtractionProfile,
1636        config: crate::pipeline::HybridChunkConfig,
1637    ) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
1638        let elements = self.partition_with_profile(profile)?;
1639        let chunker = crate::pipeline::HybridChunker::new(config);
1640        let hybrid_chunks = chunker.chunk(&elements);
1641        Ok(self.build_rag_chunks(&hybrid_chunks, None))
1642    }
1643
1644    /// Extract chunks as a JSON string ready for vector store ingestion.
1645    ///
1646    /// # Feature flags
1647    ///
1648    /// Requires the `semantic` feature: `oxidize-pdf = { features = ["semantic"] }`.
1649    /// Without it this method is not compiled.
1650    #[cfg(feature = "semantic")]
1651    pub fn rag_chunks_json(&self) -> ParseResult<String> {
1652        let chunks = self.rag_chunks()?;
1653        serde_json::to_string(&chunks).map_err(|e| ParseError::SerializationError(e.to_string()))
1654    }
1655
1656    /// Split the document text into chunks of approximately `target_tokens` size.
1657    ///
1658    /// Uses a default overlap of 10% of the target token count.
1659    #[deprecated(
1660        since = "2.2.0",
1661        note = "Use rag_chunks() for structure-aware RAG chunking"
1662    )]
1663    #[allow(deprecated)]
1664    pub fn chunk(
1665        &self,
1666        target_tokens: usize,
1667    ) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
1668        let overlap = target_tokens / 10;
1669        self.chunk_with(target_tokens, overlap)
1670    }
1671
1672    /// Split the document text into chunks with explicit size and overlap control.
1673    #[deprecated(
1674        since = "2.2.0",
1675        note = "Use rag_chunks_with() for structure-aware RAG chunking"
1676    )]
1677    pub fn chunk_with(
1678        &self,
1679        target_tokens: usize,
1680        overlap: usize,
1681    ) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
1682        let chunker = crate::ai::DocumentChunker::new(target_tokens, overlap);
1683        let extracted = self.extract_text()?;
1684        let page_texts: Vec<(usize, String)> = extracted
1685            .iter()
1686            .enumerate()
1687            .map(|(i, t)| (i + 1, t.text.clone()))
1688            .collect();
1689        chunker
1690            .chunk_text_with_pages(&page_texts)
1691            .map_err(|e| crate::error::PdfError::InvalidStructure(e.to_string()))
1692    }
1693
1694    /// Partition the document into typed elements using default configuration.
1695    ///
1696    /// Extracts text with layout preservation, then classifies fragments into
1697    /// [`Element`](crate::pipeline::Element) variants (Title, Paragraph, Table, etc.).
1698    pub fn partition(&self) -> ParseResult<Vec<crate::pipeline::Element>> {
1699        self.partition_with(crate::pipeline::PartitionConfig::default())
1700    }
1701
1702    /// Partition the document into typed elements with custom configuration.
1703    pub fn partition_with(
1704        &self,
1705        config: crate::pipeline::PartitionConfig,
1706    ) -> ParseResult<Vec<crate::pipeline::Element>> {
1707        let options = crate::text::ExtractionOptions {
1708            preserve_layout: true,
1709            reconstruct_paragraphs: true,
1710            ..Default::default()
1711        };
1712        self.do_partition_pages(options, config)
1713    }
1714
1715    /// Partition the document using a pre-configured extraction profile.
1716    pub fn partition_with_profile(
1717        &self,
1718        profile: crate::pipeline::ExtractionProfile,
1719    ) -> ParseResult<Vec<crate::pipeline::Element>> {
1720        let profile_cfg = profile.config();
1721        let options = crate::text::ExtractionOptions {
1722            preserve_layout: true,
1723            reconstruct_paragraphs: true,
1724            space_threshold: profile_cfg.extraction.space_threshold,
1725            detect_columns: profile_cfg.extraction.detect_columns,
1726            ..crate::text::ExtractionOptions::default()
1727        };
1728        self.do_partition_pages(options, profile_cfg.partition)
1729    }
1730
1731    fn do_partition_pages(
1732        &self,
1733        options: crate::text::ExtractionOptions,
1734        config: crate::pipeline::PartitionConfig,
1735    ) -> ParseResult<Vec<crate::pipeline::Element>> {
1736        // Read the gating flags before `config` is moved into the partitioner,
1737        // so we avoid cloning the config just to inspect two bools.
1738        let extract_graphics = config.detect_tables && config.prefer_ruling_tables;
1739
1740        // The reconstructed `pages` (extracted with `reconstruct_paragraphs = true`)
1741        // merge per-cell fragments into paragraph-granular fragments (issue #261),
1742        // which the ruling-based table detector cannot map back to grid cells. When
1743        // a page actually has a drawn table grid we re-extract just that page with
1744        // `reconstruct_paragraphs = false` to recover cell-granular fragments for
1745        // the detector; the reconstructed fragments still drive prose
1746        // classification. Inherit every other option (notably `space_threshold`
1747        // and `detect_columns`, which profiles override) so cell text is assembled
1748        // identically to the primary pass. Built before `options` is moved into
1749        // `extract_text_with_options`.
1750        let mut raw_options = options.clone();
1751        raw_options.reconstruct_paragraphs = false;
1752
1753        let pages = self.extract_text_with_options(options)?;
1754
1755        let partitioner = crate::pipeline::Partitioner::new(config);
1756        let mut graphics_extractor = crate::graphics::extraction::GraphicsExtractor::default();
1757        // Extracting per table-bearing page (rather than a second whole-document
1758        // pass) keeps the cost proportional to pages that need it and zero for
1759        // table-free documents even with `prefer_ruling_tables` on.
1760        let mut raw_extractor = crate::text::TextExtractor::with_options(raw_options);
1761
1762        let mut all_elements = Vec::new();
1763        for (page_idx, page_text) in pages.iter().enumerate() {
1764            let page_idx_u32 = u32::try_from(page_idx).map_err(|_| ParseError::SyntaxError {
1765                position: 0,
1766                message: format!("Page index {} exceeds u32 range", page_idx),
1767            })?;
1768            let page_height = self
1769                .get_page(page_idx_u32)
1770                .map(|p| p.height())
1771                .unwrap_or(842.0);
1772            let page_graphics = if extract_graphics {
1773                graphics_extractor.extract_from_page(self, page_idx).ok()
1774            } else {
1775                None
1776            };
1777            // Re-extract cell-granular fragments only for pages with a drawn grid.
1778            let raw_page = if page_graphics
1779                .as_ref()
1780                .is_some_and(|g| g.has_table_structure())
1781            {
1782                raw_extractor.extract_from_page(self, page_idx_u32).ok()
1783            } else {
1784                None
1785            };
1786            let raw_fragments = raw_page.as_ref().map(|pt| pt.fragments.as_slice());
1787            let elements = partitioner.partition_fragments_with_graphics_raw(
1788                &page_text.fragments,
1789                raw_fragments,
1790                page_graphics.as_ref(),
1791                page_idx_u32,
1792                page_height,
1793            );
1794            all_elements.extend(elements);
1795        }
1796
1797        Ok(all_elements)
1798    }
1799
1800    /// Partition the document into typed elements and build a relationship graph.
1801    ///
1802    /// Returns a tuple of `(elements, graph)` where the graph captures parent/child
1803    /// and next/prev relationships between elements by index.
1804    ///
1805    /// # Example
1806    ///
1807    /// ```rust,no_run
1808    /// use oxidize_pdf::parser::PdfDocument;
1809    /// use oxidize_pdf::pipeline::PartitionConfig;
1810    ///
1811    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
1812    /// let doc = PdfDocument::open("document.pdf")?;
1813    /// let (elements, graph) = doc.partition_graph(PartitionConfig::default())?;
1814    ///
1815    /// for title_idx in graph.top_level_sections() {
1816    ///     println!("Section: {}", elements[title_idx].text());
1817    ///     for child_idx in graph.elements_in_section(title_idx) {
1818    ///         println!("  {}", elements[child_idx].text());
1819    ///     }
1820    /// }
1821    /// # Ok(())
1822    /// # }
1823    /// ```
1824    pub fn partition_graph(
1825        &self,
1826        config: crate::pipeline::PartitionConfig,
1827    ) -> ParseResult<(Vec<crate::pipeline::Element>, crate::pipeline::ElementGraph)> {
1828        let elements = self.partition_with(config)?;
1829        let graph = crate::pipeline::ElementGraph::build(&elements);
1830        Ok((elements, graph))
1831    }
1832}
1833
1834impl PdfDocument<File> {
1835    /// Open a PDF file by path — the simplest way to start working with a PDF.
1836    ///
1837    /// This is a convenience method that combines `PdfReader::open()` and
1838    /// `PdfDocument::new()` into a single call.
1839    ///
1840    /// # Example
1841    ///
1842    /// ```rust,no_run
1843    /// use oxidize_pdf::parser::PdfDocument;
1844    ///
1845    /// let doc = PdfDocument::open("report.pdf").unwrap();
1846    /// let text = doc.extract_text().unwrap();
1847    /// let markdown = doc.to_markdown().unwrap();
1848    /// ```
1849    pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
1850        PdfReader::open_document(path)
1851    }
1852}
1853
1854#[cfg(test)]
1855mod tests {
1856    use super::*;
1857    use crate::parser::objects::{PdfObject, PdfString};
1858    use std::io::Cursor;
1859
1860    // Helper function to create a minimal PDF in memory
1861    fn create_minimal_pdf() -> Vec<u8> {
1862        let mut pdf = Vec::new();
1863
1864        // PDF header
1865        pdf.extend_from_slice(b"%PDF-1.4\n");
1866
1867        // Catalog object
1868        pdf.extend_from_slice(b"1 0 obj\n");
1869        pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
1870        pdf.extend_from_slice(b"endobj\n");
1871
1872        // Pages object
1873        pdf.extend_from_slice(b"2 0 obj\n");
1874        pdf.extend_from_slice(b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n");
1875        pdf.extend_from_slice(b"endobj\n");
1876
1877        // Page object
1878        pdf.extend_from_slice(b"3 0 obj\n");
1879        pdf.extend_from_slice(
1880            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>\n",
1881        );
1882        pdf.extend_from_slice(b"endobj\n");
1883
1884        // Cross-reference table
1885        let xref_pos = pdf.len();
1886        pdf.extend_from_slice(b"xref\n");
1887        pdf.extend_from_slice(b"0 4\n");
1888        pdf.extend_from_slice(b"0000000000 65535 f \n");
1889        pdf.extend_from_slice(b"0000000009 00000 n \n");
1890        pdf.extend_from_slice(b"0000000058 00000 n \n");
1891        pdf.extend_from_slice(b"0000000115 00000 n \n");
1892
1893        // Trailer
1894        pdf.extend_from_slice(b"trailer\n");
1895        pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R >>\n");
1896        pdf.extend_from_slice(b"startxref\n");
1897        pdf.extend_from_slice(format!("{xref_pos}\n").as_bytes());
1898        pdf.extend_from_slice(b"%%EOF\n");
1899
1900        pdf
1901    }
1902
1903    // Helper to create a PDF with metadata
1904    fn create_pdf_with_metadata() -> Vec<u8> {
1905        let mut pdf = Vec::new();
1906
1907        // PDF header
1908        pdf.extend_from_slice(b"%PDF-1.5\n");
1909
1910        // Record positions for xref
1911        let obj1_pos = pdf.len();
1912
1913        // Catalog object
1914        pdf.extend_from_slice(b"1 0 obj\n");
1915        pdf.extend_from_slice(b"<< /Type /Catalog /Pages 2 0 R >>\n");
1916        pdf.extend_from_slice(b"endobj\n");
1917
1918        let obj2_pos = pdf.len();
1919
1920        // Pages object
1921        pdf.extend_from_slice(b"2 0 obj\n");
1922        pdf.extend_from_slice(b"<< /Type /Pages /Kids [] /Count 0 >>\n");
1923        pdf.extend_from_slice(b"endobj\n");
1924
1925        let obj3_pos = pdf.len();
1926
1927        // Info object
1928        pdf.extend_from_slice(b"3 0 obj\n");
1929        pdf.extend_from_slice(
1930            b"<< /Title (Test Document) /Author (Test Author) /Subject (Test Subject) >>\n",
1931        );
1932        pdf.extend_from_slice(b"endobj\n");
1933
1934        // Cross-reference table
1935        let xref_pos = pdf.len();
1936        pdf.extend_from_slice(b"xref\n");
1937        pdf.extend_from_slice(b"0 4\n");
1938        pdf.extend_from_slice(b"0000000000 65535 f \n");
1939        pdf.extend_from_slice(format!("{obj1_pos:010} 00000 n \n").as_bytes());
1940        pdf.extend_from_slice(format!("{obj2_pos:010} 00000 n \n").as_bytes());
1941        pdf.extend_from_slice(format!("{obj3_pos:010} 00000 n \n").as_bytes());
1942
1943        // Trailer
1944        pdf.extend_from_slice(b"trailer\n");
1945        pdf.extend_from_slice(b"<< /Size 4 /Root 1 0 R /Info 3 0 R >>\n");
1946        pdf.extend_from_slice(b"startxref\n");
1947        pdf.extend_from_slice(format!("{xref_pos}\n").as_bytes());
1948        pdf.extend_from_slice(b"%%EOF\n");
1949
1950        pdf
1951    }
1952
1953    #[test]
1954    fn test_pdf_document_new() {
1955        let pdf_data = create_minimal_pdf();
1956        let cursor = Cursor::new(pdf_data);
1957        let reader = PdfReader::new(cursor).unwrap();
1958        let document = PdfDocument::new(reader);
1959
1960        // Verify document is created with empty caches
1961        assert!(document.page_tree.borrow().is_none());
1962        assert!(document.metadata_cache.borrow().is_none());
1963    }
1964
1965    #[test]
1966    fn test_version() {
1967        let pdf_data = create_minimal_pdf();
1968        let cursor = Cursor::new(pdf_data);
1969        let reader = PdfReader::new(cursor).unwrap();
1970        let document = PdfDocument::new(reader);
1971
1972        let version = document.version().unwrap();
1973        assert_eq!(version, "1.4");
1974    }
1975
1976    #[test]
1977    fn test_page_count() {
1978        let pdf_data = create_minimal_pdf();
1979        let cursor = Cursor::new(pdf_data);
1980        let reader = PdfReader::new(cursor).unwrap();
1981        let document = PdfDocument::new(reader);
1982
1983        let count = document.page_count().unwrap();
1984        assert_eq!(count, 1);
1985    }
1986
1987    #[test]
1988    fn test_metadata() {
1989        let pdf_data = create_pdf_with_metadata();
1990        let cursor = Cursor::new(pdf_data);
1991        let reader = PdfReader::new(cursor).unwrap();
1992        let document = PdfDocument::new(reader);
1993
1994        let metadata = document.metadata().unwrap();
1995        assert_eq!(metadata.title, Some("Test Document".to_string()));
1996        assert_eq!(metadata.author, Some("Test Author".to_string()));
1997        assert_eq!(metadata.subject, Some("Test Subject".to_string()));
1998
1999        // Verify caching works
2000        let metadata2 = document.metadata().unwrap();
2001        assert_eq!(metadata.title, metadata2.title);
2002    }
2003
2004    #[test]
2005    fn test_get_page() {
2006        let pdf_data = create_minimal_pdf();
2007        let cursor = Cursor::new(pdf_data);
2008        let reader = PdfReader::new(cursor).unwrap();
2009        let document = PdfDocument::new(reader);
2010
2011        // Get first page
2012        let page = document.get_page(0).unwrap();
2013        assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
2014
2015        // Verify caching works
2016        let page2 = document.get_page(0).unwrap();
2017        assert_eq!(page.media_box, page2.media_box);
2018    }
2019
2020    #[test]
2021    fn test_get_page_out_of_bounds() {
2022        let pdf_data = create_minimal_pdf();
2023        let cursor = Cursor::new(pdf_data);
2024        let reader = PdfReader::new(cursor).unwrap();
2025        let document = PdfDocument::new(reader);
2026
2027        // Try to get page that doesn't exist
2028        let result = document.get_page(10);
2029        // With fallback lookup, this might succeed or fail gracefully
2030        if result.is_err() {
2031            assert!(result.unwrap_err().to_string().contains("Page"));
2032        } else {
2033            // If succeeds, should return a valid page
2034            let _page = result.unwrap();
2035        }
2036    }
2037
2038    #[test]
2039    fn test_resource_manager_caching() {
2040        let resources = ResourceManager::new();
2041
2042        // Test caching an object
2043        let obj_ref = (1, 0);
2044        let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
2045
2046        assert!(resources.get_cached(obj_ref).is_none());
2047
2048        resources.cache_object(obj_ref, obj.clone());
2049
2050        let cached = resources.get_cached(obj_ref).unwrap();
2051        assert_eq!(cached, obj);
2052
2053        // Test clearing cache
2054        resources.clear_cache();
2055        assert!(resources.get_cached(obj_ref).is_none());
2056    }
2057
2058    #[test]
2059    fn test_get_object() {
2060        let pdf_data = create_minimal_pdf();
2061        let cursor = Cursor::new(pdf_data);
2062        let reader = PdfReader::new(cursor).unwrap();
2063        let document = PdfDocument::new(reader);
2064
2065        // Get catalog object
2066        let catalog = document.get_object(1, 0).unwrap();
2067        if let PdfObject::Dictionary(dict) = catalog {
2068            if let Some(PdfObject::Name(name)) = dict.get("Type") {
2069                assert_eq!(name.0, "Catalog");
2070            } else {
2071                panic!("Expected /Type name");
2072            }
2073        } else {
2074            panic!("Expected dictionary object");
2075        }
2076    }
2077
2078    #[test]
2079    fn test_resolve_reference() {
2080        let pdf_data = create_minimal_pdf();
2081        let cursor = Cursor::new(pdf_data);
2082        let reader = PdfReader::new(cursor).unwrap();
2083        let document = PdfDocument::new(reader);
2084
2085        // Create a reference to the catalog
2086        let ref_obj = PdfObject::Reference(1, 0);
2087
2088        // Resolve it
2089        let resolved = document.resolve(&ref_obj).unwrap();
2090        if let PdfObject::Dictionary(dict) = resolved {
2091            if let Some(PdfObject::Name(name)) = dict.get("Type") {
2092                assert_eq!(name.0, "Catalog");
2093            } else {
2094                panic!("Expected /Type name");
2095            }
2096        } else {
2097            panic!("Expected dictionary object");
2098        }
2099    }
2100
2101    #[test]
2102    fn test_resolve_non_reference() {
2103        let pdf_data = create_minimal_pdf();
2104        let cursor = Cursor::new(pdf_data);
2105        let reader = PdfReader::new(cursor).unwrap();
2106        let document = PdfDocument::new(reader);
2107
2108        // Try to resolve a non-reference object
2109        let obj = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
2110        let resolved = document.resolve(&obj).unwrap();
2111
2112        // Should return the same object
2113        assert_eq!(resolved, obj);
2114    }
2115
2116    #[test]
2117    fn test_invalid_pdf_data() {
2118        let invalid_data = b"This is not a PDF";
2119        let cursor = Cursor::new(invalid_data.to_vec());
2120        let result = PdfReader::new(cursor);
2121
2122        assert!(result.is_err());
2123    }
2124
2125    #[test]
2126    fn test_empty_page_tree() {
2127        // Create PDF with empty page tree
2128        let pdf_data = create_pdf_with_metadata(); // This has 0 pages
2129        let cursor = Cursor::new(pdf_data);
2130        let reader = PdfReader::new(cursor).unwrap();
2131        let document = PdfDocument::new(reader);
2132
2133        let count = document.page_count().unwrap();
2134        assert_eq!(count, 0);
2135
2136        // Try to get a page from empty document
2137        let result = document.get_page(0);
2138        assert!(result.is_err());
2139    }
2140
2141    #[test]
2142    fn test_extract_text_empty_document() {
2143        let pdf_data = create_pdf_with_metadata();
2144        let cursor = Cursor::new(pdf_data);
2145        let reader = PdfReader::new(cursor).unwrap();
2146        let document = PdfDocument::new(reader);
2147
2148        let text = document.extract_text().unwrap();
2149        assert!(text.is_empty());
2150    }
2151
2152    #[test]
2153    fn test_concurrent_access() {
2154        let pdf_data = create_minimal_pdf();
2155        let cursor = Cursor::new(pdf_data);
2156        let reader = PdfReader::new(cursor).unwrap();
2157        let document = PdfDocument::new(reader);
2158
2159        // Access multiple things concurrently
2160        let version = document.version().unwrap();
2161        let count = document.page_count().unwrap();
2162        let page = document.get_page(0).unwrap();
2163
2164        assert_eq!(version, "1.4");
2165        assert_eq!(count, 1);
2166        assert_eq!(page.media_box[2], 612.0);
2167    }
2168
2169    // Additional comprehensive tests
2170    mod comprehensive_tests {
2171        use super::*;
2172
2173        #[test]
2174        fn test_resource_manager_default() {
2175            let resources = ResourceManager::default();
2176            assert!(resources.get_cached((1, 0)).is_none());
2177        }
2178
2179        #[test]
2180        fn test_resource_manager_multiple_objects() {
2181            let resources = ResourceManager::new();
2182
2183            // Cache multiple objects
2184            resources.cache_object((1, 0), PdfObject::Integer(42));
2185            resources.cache_object((2, 0), PdfObject::Boolean(true));
2186            resources.cache_object(
2187                (3, 0),
2188                PdfObject::String(PdfString("test".as_bytes().to_vec())),
2189            );
2190
2191            // Verify all are cached
2192            assert!(resources.get_cached((1, 0)).is_some());
2193            assert!(resources.get_cached((2, 0)).is_some());
2194            assert!(resources.get_cached((3, 0)).is_some());
2195
2196            // Clear and verify empty
2197            resources.clear_cache();
2198            assert!(resources.get_cached((1, 0)).is_none());
2199            assert!(resources.get_cached((2, 0)).is_none());
2200            assert!(resources.get_cached((3, 0)).is_none());
2201        }
2202
2203        #[test]
2204        fn test_resource_manager_object_overwrite() {
2205            let resources = ResourceManager::new();
2206
2207            // Cache an object
2208            resources.cache_object((1, 0), PdfObject::Integer(42));
2209            assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Integer(42)));
2210
2211            // Overwrite with different object
2212            resources.cache_object((1, 0), PdfObject::Boolean(true));
2213            assert_eq!(resources.get_cached((1, 0)), Some(PdfObject::Boolean(true)));
2214        }
2215
2216        #[test]
2217        fn test_get_object_caching() {
2218            let pdf_data = create_minimal_pdf();
2219            let cursor = Cursor::new(pdf_data);
2220            let reader = PdfReader::new(cursor).unwrap();
2221            let document = PdfDocument::new(reader);
2222
2223            // Get object first time (should cache)
2224            let obj1 = document.get_object(1, 0).unwrap();
2225
2226            // Get same object again (should use cache)
2227            let obj2 = document.get_object(1, 0).unwrap();
2228
2229            // Objects should be identical
2230            assert_eq!(obj1, obj2);
2231
2232            // Verify it's cached
2233            assert!(document.resources.get_cached((1, 0)).is_some());
2234        }
2235
2236        #[test]
2237        fn test_get_object_different_generations() {
2238            let pdf_data = create_minimal_pdf();
2239            let cursor = Cursor::new(pdf_data);
2240            let reader = PdfReader::new(cursor).unwrap();
2241            let document = PdfDocument::new(reader);
2242
2243            // Get object with generation 0
2244            let _obj1 = document.get_object(1, 0).unwrap();
2245
2246            // Try to get same object with different generation (should fail)
2247            let result = document.get_object(1, 1);
2248            assert!(result.is_err());
2249
2250            // Original should still be cached
2251            assert!(document.resources.get_cached((1, 0)).is_some());
2252        }
2253
2254        #[test]
2255        fn test_get_object_nonexistent() {
2256            let pdf_data = create_minimal_pdf();
2257            let cursor = Cursor::new(pdf_data);
2258            let reader = PdfReader::new(cursor).unwrap();
2259            let document = PdfDocument::new(reader);
2260
2261            // Try to get non-existent object
2262            let result = document.get_object(999, 0);
2263            assert!(result.is_err());
2264        }
2265
2266        #[test]
2267        fn test_resolve_nested_references() {
2268            let pdf_data = create_minimal_pdf();
2269            let cursor = Cursor::new(pdf_data);
2270            let reader = PdfReader::new(cursor).unwrap();
2271            let document = PdfDocument::new(reader);
2272
2273            // Test resolving a reference
2274            let ref_obj = PdfObject::Reference(2, 0);
2275            let resolved = document.resolve(&ref_obj).unwrap();
2276
2277            // Should resolve to the pages object
2278            if let PdfObject::Dictionary(dict) = resolved {
2279                if let Some(PdfObject::Name(name)) = dict.get("Type") {
2280                    assert_eq!(name.0, "Pages");
2281                }
2282            }
2283        }
2284
2285        #[test]
2286        fn test_resolve_various_object_types() {
2287            let pdf_data = create_minimal_pdf();
2288            let cursor = Cursor::new(pdf_data);
2289            let reader = PdfReader::new(cursor).unwrap();
2290            let document = PdfDocument::new(reader);
2291
2292            // Test resolving different object types
2293            let test_objects = vec![
2294                PdfObject::Integer(42),
2295                PdfObject::Boolean(true),
2296                PdfObject::String(PdfString("test".as_bytes().to_vec())),
2297                PdfObject::Real(3.14),
2298                PdfObject::Null,
2299            ];
2300
2301            for obj in test_objects {
2302                let resolved = document.resolve(&obj).unwrap();
2303                assert_eq!(resolved, obj);
2304            }
2305        }
2306
2307        #[test]
2308        fn test_get_page_cached() {
2309            let pdf_data = create_minimal_pdf();
2310            let cursor = Cursor::new(pdf_data);
2311            let reader = PdfReader::new(cursor).unwrap();
2312            let document = PdfDocument::new(reader);
2313
2314            // Get page first time
2315            let page1 = document.get_page(0).unwrap();
2316
2317            // Get same page again
2318            let page2 = document.get_page(0).unwrap();
2319
2320            // Should be identical
2321            assert_eq!(page1.media_box, page2.media_box);
2322            assert_eq!(page1.rotation, page2.rotation);
2323            assert_eq!(page1.obj_ref, page2.obj_ref);
2324        }
2325
2326        #[test]
2327        fn test_metadata_caching() {
2328            let pdf_data = create_pdf_with_metadata();
2329            let cursor = Cursor::new(pdf_data);
2330            let reader = PdfReader::new(cursor).unwrap();
2331            let document = PdfDocument::new(reader);
2332
2333            // Get metadata first time
2334            let meta1 = document.metadata().unwrap();
2335
2336            // Get metadata again
2337            let meta2 = document.metadata().unwrap();
2338
2339            // Should be identical
2340            assert_eq!(meta1.title, meta2.title);
2341            assert_eq!(meta1.author, meta2.author);
2342            assert_eq!(meta1.subject, meta2.subject);
2343            assert_eq!(meta1.version, meta2.version);
2344        }
2345
2346        #[test]
2347        fn test_page_tree_initialization() {
2348            let pdf_data = create_minimal_pdf();
2349            let cursor = Cursor::new(pdf_data);
2350            let reader = PdfReader::new(cursor).unwrap();
2351            let document = PdfDocument::new(reader);
2352
2353            // Initially page tree should be None
2354            assert!(document.page_tree.borrow().is_none());
2355
2356            // After getting page count, page tree should be initialized
2357            let _count = document.page_count().unwrap();
2358            // Note: page_tree is private, so we can't directly check it
2359            // But we can verify it works by getting a page
2360            let _page = document.get_page(0).unwrap();
2361        }
2362
2363        #[test]
2364        fn test_get_page_resources() {
2365            let pdf_data = create_minimal_pdf();
2366            let cursor = Cursor::new(pdf_data);
2367            let reader = PdfReader::new(cursor).unwrap();
2368            let document = PdfDocument::new(reader);
2369
2370            let page = document.get_page(0).unwrap();
2371            let resources = document.get_page_resources(&page).unwrap();
2372
2373            // The minimal PDF has empty resources
2374            assert!(resources.is_some());
2375        }
2376
2377        #[test]
2378        fn test_get_page_content_streams_empty() {
2379            let pdf_data = create_minimal_pdf();
2380            let cursor = Cursor::new(pdf_data);
2381            let reader = PdfReader::new(cursor).unwrap();
2382            let document = PdfDocument::new(reader);
2383
2384            let page = document.get_page(0).unwrap();
2385            let streams = document.get_page_content_streams(&page).unwrap();
2386
2387            // Minimal PDF has no content streams
2388            assert!(streams.is_empty());
2389        }
2390
2391        #[test]
2392        fn test_extract_text_from_page() {
2393            let pdf_data = create_minimal_pdf();
2394            let cursor = Cursor::new(pdf_data);
2395            let reader = PdfReader::new(cursor).unwrap();
2396            let document = PdfDocument::new(reader);
2397
2398            let result = document.extract_text_from_page(0);
2399            // Should succeed even with empty page
2400            assert!(result.is_ok());
2401        }
2402
2403        #[test]
2404        fn test_extract_text_from_page_out_of_bounds() {
2405            let pdf_data = create_minimal_pdf();
2406            let cursor = Cursor::new(pdf_data);
2407            let reader = PdfReader::new(cursor).unwrap();
2408            let document = PdfDocument::new(reader);
2409
2410            let result = document.extract_text_from_page(999);
2411            // With fallback lookup, this might succeed or fail gracefully
2412            if result.is_err() {
2413                assert!(result.unwrap_err().to_string().contains("Page"));
2414            } else {
2415                // If succeeds, should return empty or valid text
2416                let _text = result.unwrap();
2417            }
2418        }
2419
2420        #[test]
2421        fn test_extract_text_with_options() {
2422            let pdf_data = create_minimal_pdf();
2423            let cursor = Cursor::new(pdf_data);
2424            let reader = PdfReader::new(cursor).unwrap();
2425            let document = PdfDocument::new(reader);
2426
2427            let options = crate::text::ExtractionOptions {
2428                preserve_layout: true,
2429                space_threshold: 0.5,
2430                newline_threshold: 15.0,
2431                ..Default::default()
2432            };
2433
2434            let result = document.extract_text_with_options(options);
2435            assert!(result.is_ok());
2436        }
2437
2438        #[test]
2439        fn test_version_different_pdf_versions() {
2440            // Test with different PDF versions
2441            let versions = vec!["1.3", "1.4", "1.5", "1.6", "1.7"];
2442
2443            for version in versions {
2444                let mut pdf_data = Vec::new();
2445
2446                // PDF header
2447                pdf_data.extend_from_slice(format!("%PDF-{version}\n").as_bytes());
2448
2449                // Track positions for xref
2450                let obj1_pos = pdf_data.len();
2451
2452                // Catalog object
2453                pdf_data.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
2454
2455                let obj2_pos = pdf_data.len();
2456
2457                // Pages object
2458                pdf_data
2459                    .extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");
2460
2461                // Cross-reference table
2462                let xref_pos = pdf_data.len();
2463                pdf_data.extend_from_slice(b"xref\n");
2464                pdf_data.extend_from_slice(b"0 3\n");
2465                pdf_data.extend_from_slice(b"0000000000 65535 f \n");
2466                pdf_data.extend_from_slice(format!("{obj1_pos:010} 00000 n \n").as_bytes());
2467                pdf_data.extend_from_slice(format!("{obj2_pos:010} 00000 n \n").as_bytes());
2468
2469                // Trailer
2470                pdf_data.extend_from_slice(b"trailer\n");
2471                pdf_data.extend_from_slice(b"<< /Size 3 /Root 1 0 R >>\n");
2472                pdf_data.extend_from_slice(b"startxref\n");
2473                pdf_data.extend_from_slice(format!("{xref_pos}\n").as_bytes());
2474                pdf_data.extend_from_slice(b"%%EOF\n");
2475
2476                let cursor = Cursor::new(pdf_data);
2477                let reader = PdfReader::new(cursor).unwrap();
2478                let document = PdfDocument::new(reader);
2479
2480                let pdf_version = document.version().unwrap();
2481                assert_eq!(pdf_version, version);
2482            }
2483        }
2484
2485        #[test]
2486        fn test_page_count_zero() {
2487            let pdf_data = create_pdf_with_metadata(); // Has 0 pages
2488            let cursor = Cursor::new(pdf_data);
2489            let reader = PdfReader::new(cursor).unwrap();
2490            let document = PdfDocument::new(reader);
2491
2492            let count = document.page_count().unwrap();
2493            assert_eq!(count, 0);
2494        }
2495
2496        #[test]
2497        fn test_multiple_object_access() {
2498            let pdf_data = create_minimal_pdf();
2499            let cursor = Cursor::new(pdf_data);
2500            let reader = PdfReader::new(cursor).unwrap();
2501            let document = PdfDocument::new(reader);
2502
2503            // Access multiple objects
2504            let catalog = document.get_object(1, 0).unwrap();
2505            let pages = document.get_object(2, 0).unwrap();
2506            let page = document.get_object(3, 0).unwrap();
2507
2508            // Verify they're all different objects
2509            assert_ne!(catalog, pages);
2510            assert_ne!(pages, page);
2511            assert_ne!(catalog, page);
2512        }
2513
2514        #[test]
2515        fn test_error_handling_invalid_object_reference() {
2516            let pdf_data = create_minimal_pdf();
2517            let cursor = Cursor::new(pdf_data);
2518            let reader = PdfReader::new(cursor).unwrap();
2519            let document = PdfDocument::new(reader);
2520
2521            // Try to resolve an invalid reference
2522            let invalid_ref = PdfObject::Reference(999, 0);
2523            let result = document.resolve(&invalid_ref);
2524            assert!(result.is_err());
2525        }
2526
2527        #[test]
2528        fn test_concurrent_metadata_access() {
2529            let pdf_data = create_pdf_with_metadata();
2530            let cursor = Cursor::new(pdf_data);
2531            let reader = PdfReader::new(cursor).unwrap();
2532            let document = PdfDocument::new(reader);
2533
2534            // Access metadata and other properties concurrently
2535            let metadata = document.metadata().unwrap();
2536            let version = document.version().unwrap();
2537            let count = document.page_count().unwrap();
2538
2539            assert_eq!(metadata.title, Some("Test Document".to_string()));
2540            assert_eq!(version, "1.5");
2541            assert_eq!(count, 0);
2542        }
2543
2544        #[test]
2545        fn test_page_properties_comprehensive() {
2546            let pdf_data = create_minimal_pdf();
2547            let cursor = Cursor::new(pdf_data);
2548            let reader = PdfReader::new(cursor).unwrap();
2549            let document = PdfDocument::new(reader);
2550
2551            let page = document.get_page(0).unwrap();
2552
2553            // Test all page properties
2554            assert_eq!(page.media_box, [0.0, 0.0, 612.0, 792.0]);
2555            assert_eq!(page.crop_box, None);
2556            assert_eq!(page.rotation, 0);
2557            assert_eq!(page.obj_ref, (3, 0));
2558
2559            // Test width/height calculation
2560            assert_eq!(page.width(), 612.0);
2561            assert_eq!(page.height(), 792.0);
2562        }
2563
2564        #[test]
2565        fn test_memory_usage_efficiency() {
2566            let pdf_data = create_minimal_pdf();
2567            let cursor = Cursor::new(pdf_data);
2568            let reader = PdfReader::new(cursor).unwrap();
2569            let document = PdfDocument::new(reader);
2570
2571            // Access same page multiple times
2572            for _ in 0..10 {
2573                let _page = document.get_page(0).unwrap();
2574            }
2575
2576            // Should only have one copy in cache
2577            let page_count = document.page_count().unwrap();
2578            assert_eq!(page_count, 1);
2579        }
2580
2581        #[test]
2582        fn test_reader_borrow_safety() {
2583            let pdf_data = create_minimal_pdf();
2584            let cursor = Cursor::new(pdf_data);
2585            let reader = PdfReader::new(cursor).unwrap();
2586            let document = PdfDocument::new(reader);
2587
2588            // Multiple concurrent borrows should work
2589            let version = document.version().unwrap();
2590            let count = document.page_count().unwrap();
2591            let metadata = document.metadata().unwrap();
2592
2593            assert_eq!(version, "1.4");
2594            assert_eq!(count, 1);
2595            assert!(metadata.title.is_none());
2596        }
2597
2598        #[test]
2599        fn test_cache_consistency() {
2600            let pdf_data = create_minimal_pdf();
2601            let cursor = Cursor::new(pdf_data);
2602            let reader = PdfReader::new(cursor).unwrap();
2603            let document = PdfDocument::new(reader);
2604
2605            // Get object and verify caching
2606            let obj1 = document.get_object(1, 0).unwrap();
2607            let cached = document.resources.get_cached((1, 0)).unwrap();
2608
2609            assert_eq!(obj1, cached);
2610
2611            // Clear cache and get object again
2612            document.resources.clear_cache();
2613            let obj2 = document.get_object(1, 0).unwrap();
2614
2615            // Should be same content but loaded fresh
2616            assert_eq!(obj1, obj2);
2617        }
2618    }
2619
2620    #[test]
2621    fn test_resource_manager_new() {
2622        let resources = ResourceManager::new();
2623        assert!(resources.get_cached((1, 0)).is_none());
2624    }
2625
2626    #[test]
2627    fn test_resource_manager_cache_and_get() {
2628        let resources = ResourceManager::new();
2629
2630        // Cache an object
2631        let obj = PdfObject::Integer(42);
2632        resources.cache_object((10, 0), obj.clone());
2633
2634        // Should be retrievable
2635        let cached = resources.get_cached((10, 0));
2636        assert!(cached.is_some());
2637        assert_eq!(cached.unwrap(), obj);
2638
2639        // Non-existent object
2640        assert!(resources.get_cached((11, 0)).is_none());
2641    }
2642
2643    #[test]
2644    fn test_resource_manager_clear_cache() {
2645        let resources = ResourceManager::new();
2646
2647        // Cache multiple objects
2648        resources.cache_object((1, 0), PdfObject::Integer(1));
2649        resources.cache_object((2, 0), PdfObject::Integer(2));
2650        resources.cache_object((3, 0), PdfObject::Integer(3));
2651
2652        // Verify they're cached
2653        assert!(resources.get_cached((1, 0)).is_some());
2654        assert!(resources.get_cached((2, 0)).is_some());
2655        assert!(resources.get_cached((3, 0)).is_some());
2656
2657        // Clear cache
2658        resources.clear_cache();
2659
2660        // Should all be gone
2661        assert!(resources.get_cached((1, 0)).is_none());
2662        assert!(resources.get_cached((2, 0)).is_none());
2663        assert!(resources.get_cached((3, 0)).is_none());
2664    }
2665
2666    #[test]
2667    fn test_resource_manager_overwrite_cached() {
2668        let resources = ResourceManager::new();
2669
2670        // Cache initial object
2671        resources.cache_object((1, 0), PdfObject::Integer(42));
2672        assert_eq!(
2673            resources.get_cached((1, 0)).unwrap(),
2674            PdfObject::Integer(42)
2675        );
2676
2677        // Overwrite with new object
2678        resources.cache_object((1, 0), PdfObject::Integer(100));
2679        assert_eq!(
2680            resources.get_cached((1, 0)).unwrap(),
2681            PdfObject::Integer(100)
2682        );
2683    }
2684
2685    #[test]
2686    fn test_resource_manager_multiple_generations() {
2687        let resources = ResourceManager::new();
2688
2689        // Cache objects with different generations
2690        resources.cache_object((1, 0), PdfObject::Integer(10));
2691        resources.cache_object((1, 1), PdfObject::Integer(11));
2692        resources.cache_object((1, 2), PdfObject::Integer(12));
2693
2694        // Each should be distinct
2695        assert_eq!(
2696            resources.get_cached((1, 0)).unwrap(),
2697            PdfObject::Integer(10)
2698        );
2699        assert_eq!(
2700            resources.get_cached((1, 1)).unwrap(),
2701            PdfObject::Integer(11)
2702        );
2703        assert_eq!(
2704            resources.get_cached((1, 2)).unwrap(),
2705            PdfObject::Integer(12)
2706        );
2707    }
2708
2709    #[test]
2710    fn test_resource_manager_cache_complex_objects() {
2711        let resources = ResourceManager::new();
2712
2713        // Cache different object types
2714        resources.cache_object((1, 0), PdfObject::Boolean(true));
2715        resources.cache_object((2, 0), PdfObject::Real(3.14159));
2716        resources.cache_object(
2717            (3, 0),
2718            PdfObject::String(PdfString::new(b"Hello PDF".to_vec())),
2719        );
2720        resources.cache_object((4, 0), PdfObject::Name(PdfName::new("Type".to_string())));
2721
2722        let mut dict = PdfDictionary::new();
2723        dict.insert(
2724            "Key".to_string(),
2725            PdfObject::String(PdfString::new(b"Value".to_vec())),
2726        );
2727        resources.cache_object((5, 0), PdfObject::Dictionary(dict));
2728
2729        let array = vec![PdfObject::Integer(1), PdfObject::Integer(2)];
2730        resources.cache_object((6, 0), PdfObject::Array(PdfArray(array)));
2731
2732        // Verify all cached correctly
2733        assert_eq!(
2734            resources.get_cached((1, 0)).unwrap(),
2735            PdfObject::Boolean(true)
2736        );
2737        assert_eq!(
2738            resources.get_cached((2, 0)).unwrap(),
2739            PdfObject::Real(3.14159)
2740        );
2741        assert_eq!(
2742            resources.get_cached((3, 0)).unwrap(),
2743            PdfObject::String(PdfString::new(b"Hello PDF".to_vec()))
2744        );
2745        assert_eq!(
2746            resources.get_cached((4, 0)).unwrap(),
2747            PdfObject::Name(PdfName::new("Type".to_string()))
2748        );
2749        assert!(matches!(
2750            resources.get_cached((5, 0)).unwrap(),
2751            PdfObject::Dictionary(_)
2752        ));
2753        assert!(matches!(
2754            resources.get_cached((6, 0)).unwrap(),
2755            PdfObject::Array(_)
2756        ));
2757    }
2758
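    // The PdfDocument tests below are disabled: they were commented out because of
    // API incompatibilities with the current implementation and are kept for
    // reference only. NOTE(review): several of the methods they call (e.g.
    // `version`, `page_count`, `get_page`) are exercised by active tests in this
    // module, so confirm which members are actually missing before reviving them.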
2761
2762    /*
2763        #[test]
2764        fn test_pdf_document_new_initialization() {
2765            // Create a minimal PDF for testing
2766            let data = b"%PDF-1.4
2767    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2768    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2769    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2770    xref
2771    0 4
2772    0000000000 65535 f
2773    0000000009 00000 n
2774    0000000052 00000 n
2775    0000000101 00000 n
2776    trailer<</Size 4/Root 1 0 R>>
2777    startxref
2778    164
2779    %%EOF";
2780            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2781            let document = PdfDocument::new(reader);
2782
2783            // Document should be created successfully
2784            // Initially no page tree loaded
2785            assert!(document.page_tree.borrow().is_none());
2786            assert!(document.metadata_cache.borrow().is_none());
2787        }
2788
2789        #[test]
2790        fn test_pdf_document_version() {
2791            // Create a minimal PDF for testing
2792            let data = b"%PDF-1.4
2793    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2794    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2795    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2796    xref
2797    0 4
2798    0000000000 65535 f
2799    0000000009 00000 n
2800    0000000052 00000 n
2801    0000000101 00000 n
2802    trailer<</Size 4/Root 1 0 R>>
2803    startxref
2804    164
2805    %%EOF";
2806            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2807            let document = PdfDocument::new(reader);
2808
2809            let version = document.version().unwrap();
2810            assert!(!version.is_empty());
2811            // Most PDFs are version 1.4 to 1.7
2812            assert!(version.starts_with("1.") || version.starts_with("2."));
2813        }
2814
2815        #[test]
2816        fn test_pdf_document_page_count() {
2817            // Create a minimal PDF for testing
2818            let data = b"%PDF-1.4
2819    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2820    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2821    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2822    xref
2823    0 4
2824    0000000000 65535 f
2825    0000000009 00000 n
2826    0000000052 00000 n
2827    0000000101 00000 n
2828    trailer<</Size 4/Root 1 0 R>>
2829    startxref
2830    164
2831    %%EOF";
2832            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2833            let document = PdfDocument::new(reader);
2834
2835            let count = document.page_count().unwrap();
2836            assert!(count > 0);
2837        }
2838
2839        #[test]
2840        fn test_pdf_document_metadata() {
2841            // Create a minimal PDF for testing
2842            let data = b"%PDF-1.4
2843    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2844    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2845    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2846    xref
2847    0 4
2848    0000000000 65535 f
2849    0000000009 00000 n
2850    0000000052 00000 n
2851    0000000101 00000 n
2852    trailer<</Size 4/Root 1 0 R>>
2853    startxref
2854    164
2855    %%EOF";
2856            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2857            let document = PdfDocument::new(reader);
2858
2859            let metadata = document.metadata().unwrap();
2860            // Metadata should be cached after first access
2861            assert!(document.metadata_cache.borrow().is_some());
2862
2863            // Second call should use cache
2864            let metadata2 = document.metadata().unwrap();
2865            assert_eq!(metadata.title, metadata2.title);
2866        }
2867
2868        #[test]
2869        fn test_pdf_document_get_page() {
2870            // Create a minimal PDF for testing
2871            let data = b"%PDF-1.4
2872    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2873    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2874    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2875    xref
2876    0 4
2877    0000000000 65535 f
2878    0000000009 00000 n
2879    0000000052 00000 n
2880    0000000101 00000 n
2881    trailer<</Size 4/Root 1 0 R>>
2882    startxref
2883    164
2884    %%EOF";
2885            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2886            let document = PdfDocument::new(reader);
2887
2888            // Get first page
2889            let page = document.get_page(0).unwrap();
2890            assert!(page.width() > 0.0);
2891            assert!(page.height() > 0.0);
2892
2893            // Page tree should be loaded now
2894            assert!(document.page_tree.borrow().is_some());
2895        }
2896
2897        #[test]
2898        fn test_pdf_document_get_page_out_of_bounds() {
2899            // Create a minimal PDF for testing
2900            let data = b"%PDF-1.4
2901    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2902    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2903    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2904    xref
2905    0 4
2906    0000000000 65535 f
2907    0000000009 00000 n
2908    0000000052 00000 n
2909    0000000101 00000 n
2910    trailer<</Size 4/Root 1 0 R>>
2911    startxref
2912    164
2913    %%EOF";
2914            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2915            let document = PdfDocument::new(reader);
2916
2917            let page_count = document.page_count().unwrap();
2918
2919            // Try to get page beyond count
2920            let result = document.get_page(page_count + 10);
2921            assert!(result.is_err());
2922        }
2923
2924
2925        #[test]
2926        fn test_pdf_document_get_object() {
2927            // Create a minimal PDF for testing
2928            let data = b"%PDF-1.4
2929    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2930    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2931    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2932    xref
2933    0 4
2934    0000000000 65535 f
2935    0000000009 00000 n
2936    0000000052 00000 n
2937    0000000101 00000 n
2938    trailer<</Size 4/Root 1 0 R>>
2939    startxref
2940    164
2941    %%EOF";
2942            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2943            let document = PdfDocument::new(reader);
2944
2945            // Get an object (catalog is usually object 1 0)
2946            let obj = document.get_object(1, 0);
2947            assert!(obj.is_ok());
2948
2949            // Object should be cached
2950            assert!(document.resources.get_cached((1, 0)).is_some());
2951        }
2952
2953
2954
2955        #[test]
2956        fn test_pdf_document_extract_text_from_page() {
2957            // Create a minimal PDF for testing
2958            let data = b"%PDF-1.4
2959    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2960    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2961    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2962    xref
2963    0 4
2964    0000000000 65535 f
2965    0000000009 00000 n
2966    0000000052 00000 n
2967    0000000101 00000 n
2968    trailer<</Size 4/Root 1 0 R>>
2969    startxref
2970    164
2971    %%EOF";
2972            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2973            let document = PdfDocument::new(reader);
2974
2975            // Try to extract text from first page
2976            let result = document.extract_text_from_page(0);
2977            // Even if no text, should not error
2978            assert!(result.is_ok());
2979        }
2980
2981        #[test]
2982        fn test_pdf_document_extract_all_text() {
2983            // Create a minimal PDF for testing
2984            let data = b"%PDF-1.4
2985    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2986    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
2987    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
2988    xref
2989    0 4
2990    0000000000 65535 f
2991    0000000009 00000 n
2992    0000000052 00000 n
2993    0000000101 00000 n
2994    trailer<</Size 4/Root 1 0 R>>
2995    startxref
2996    164
2997    %%EOF";
2998            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
2999            let document = PdfDocument::new(reader);
3000
3001            let extracted = document.extract_text().unwrap();
3002            let page_count = document.page_count().unwrap();
3003
3004            // Should have text for each page
3005            assert_eq!(extracted.len(), page_count);
3006        }
3007
3008
3009        #[test]
3010        fn test_pdf_document_ensure_page_tree() {
3011            // Create a minimal PDF for testing
3012            let data = b"%PDF-1.4
3013    1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
3014    2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3015    3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]>>endobj
3016    xref
3017    0 4
3018    0000000000 65535 f
3019    0000000009 00000 n
3020    0000000052 00000 n
3021    0000000101 00000 n
3022    trailer<</Size 4/Root 1 0 R>>
3023    startxref
3024    164
3025    %%EOF";
3026            let reader = PdfReader::new(std::io::Cursor::new(data.to_vec())).unwrap();
3027            let document = PdfDocument::new(reader);
3028
3029            // Initially no page tree
3030            assert!(document.page_tree.borrow().is_none());
3031
3032            // After ensuring, should be loaded
3033            document.ensure_page_tree().unwrap();
3034            assert!(document.page_tree.borrow().is_some());
3035
3036            // Second call should not error
3037            document.ensure_page_tree().unwrap();
3038        }
3039
3040        #[test]
3041        fn test_resource_manager_concurrent_access() {
3042            let resources = ResourceManager::new();
3043
3044            // Simulate concurrent-like access pattern
3045            resources.cache_object((1, 0), PdfObject::Integer(1));
3046            let obj1 = resources.get_cached((1, 0));
3047
3048            resources.cache_object((2, 0), PdfObject::Integer(2));
3049            let obj2 = resources.get_cached((2, 0));
3050
3051            // Both should be accessible
3052            assert_eq!(obj1.unwrap(), PdfObject::Integer(1));
3053            assert_eq!(obj2.unwrap(), PdfObject::Integer(2));
3054        }
3055
3056        #[test]
3057        fn test_resource_manager_large_cache() {
3058            let resources = ResourceManager::new();
3059
3060            // Cache many objects
3061            for i in 0..1000 {
3062                resources.cache_object((i, 0), PdfObject::Integer(i as i64));
3063            }
3064
3065            // Verify random access
3066            assert_eq!(resources.get_cached((500, 0)).unwrap(), PdfObject::Integer(500));
3067            assert_eq!(resources.get_cached((999, 0)).unwrap(), PdfObject::Integer(999));
3068            assert_eq!(resources.get_cached((0, 0)).unwrap(), PdfObject::Integer(0));
3069
3070            // Clear should remove all
3071            resources.clear_cache();
3072            assert!(resources.get_cached((500, 0)).is_none());
3073        }
3074        */
3075}
oxidize_pdf/parser/document.rs

oxidize_pdf/parser/
document.rs