oxidize_pdf/parser/
document.rs

1//! PDF Document wrapper with improved architecture
2//! 
3//! This module provides a higher-level interface for PDF parsing that solves
4//! the borrow checker issues by using interior mutability and separation of concerns.
5
6use super::{ParseError, ParseResult};
7use super::reader::PdfReader;
8use super::page_tree::{PageTree, ParsedPage};
9use super::objects::{PdfObject, PdfDictionary};
10use std::io::{Read, Seek};
11use std::cell::RefCell;
12use std::rc::Rc;
13use std::collections::HashMap;
14
15/// Resource manager for caching PDF objects
16pub struct ResourceManager {
17    /// Cached objects by (obj_num, gen_num)
18    object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
19}
20
21impl ResourceManager {
22    /// Create a new resource manager
23    pub fn new() -> Self {
24        Self {
25            object_cache: RefCell::new(HashMap::new()),
26        }
27    }
28    
29    /// Get an object from cache
30    pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
31        self.object_cache.borrow().get(&obj_ref).cloned()
32    }
33    
34    /// Cache an object
35    pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
36        self.object_cache.borrow_mut().insert(obj_ref, obj);
37    }
38    
39    /// Clear the cache
40    pub fn clear_cache(&self) {
41        self.object_cache.borrow_mut().clear();
42    }
43}
44
45/// High-level PDF document interface
46pub struct PdfDocument<R: Read + Seek> {
47    /// The underlying PDF reader
48    reader: RefCell<PdfReader<R>>,
49    /// Page tree navigator
50    page_tree: RefCell<Option<PageTree>>,
51    /// Resource manager for object caching
52    resources: Rc<ResourceManager>,
53    /// Document metadata cache
54    metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
55}
56
57impl<R: Read + Seek> PdfDocument<R> {
58    /// Create a new PDF document from a reader
59    pub fn new(reader: PdfReader<R>) -> Self {
60        Self {
61            reader: RefCell::new(reader),
62            page_tree: RefCell::new(None),
63            resources: Rc::new(ResourceManager::new()),
64            metadata_cache: RefCell::new(None),
65        }
66    }
67    
68    /// Get the PDF version
69    pub fn version(&self) -> ParseResult<String> {
70        Ok(self.reader.borrow().version().to_string())
71    }
72    
73    /// Get the number of pages
74    pub fn page_count(&self) -> ParseResult<u32> {
75        self.reader.borrow_mut().page_count()
76    }
77    
78    /// Get document metadata
79    pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
80        // Check cache first
81        if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
82            return Ok(metadata.clone());
83        }
84        
85        // Load metadata
86        let metadata = self.reader.borrow_mut().metadata()?;
87        self.metadata_cache.borrow_mut().replace(metadata.clone());
88        Ok(metadata)
89    }
90    
91    /// Initialize the page tree if not already done
92    fn ensure_page_tree(&self) -> ParseResult<()> {
93        if self.page_tree.borrow().is_none() {
94            let page_count = self.page_count()?;
95            let pages_dict = self.load_pages_dict()?;
96            let page_tree = PageTree::new_with_pages_dict(page_count, pages_dict);
97            self.page_tree.borrow_mut().replace(page_tree);
98        }
99        Ok(())
100    }
101    
102    /// Load the pages dictionary
103    fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
104        let mut reader = self.reader.borrow_mut();
105        let pages = reader.pages()?;
106        Ok(pages.clone())
107    }
108    
109    /// Get a page by index (0-based)
110    pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
111        self.ensure_page_tree()?;
112        
113        // First check if page is already loaded
114        if let Some(page_tree) = self.page_tree.borrow().as_ref() {
115            if let Some(page) = page_tree.get_cached_page(index) {
116                return Ok(page.clone());
117            }
118        }
119        
120        // Load the page
121        let page = self.load_page_at_index(index)?;
122        
123        // Cache it
124        if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
125            page_tree.cache_page(index, page.clone());
126        }
127        
128        Ok(page)
129    }
130    
131    /// Load a specific page by index
132    fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
133        // Get the pages root
134        let pages_dict = self.load_pages_dict()?;
135        
136        // Navigate to the specific page
137        let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
138        
139        Ok(page_info)
140    }
141    
142    /// Find a page in the page tree
143    fn find_page_in_tree(
144        &self,
145        node: &PdfDictionary,
146        target_index: u32,
147        current_index: u32,
148        inherited: Option<&PdfDictionary>,
149    ) -> ParseResult<ParsedPage> {
150        let node_type = node.get_type()
151            .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
152        
153        match node_type {
154            "Pages" => {
155                // This is a page tree node
156                let kids = node.get("Kids")
157                    .and_then(|obj| obj.as_array())
158                    .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
159                
160                // Merge inherited attributes
161                let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
162                
163                // Inheritable attributes
164                for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
165                    if let Some(value) = node.get(key) {
166                        if !merged_inherited.contains_key(key) {
167                            merged_inherited.insert(key.to_string(), value.clone());
168                        }
169                    }
170                }
171                
172                // Find which kid contains our target page
173                let mut current_idx = current_index;
174                for kid_ref in &kids.0 {
175                    let kid_ref = kid_ref.as_reference()
176                        .ok_or_else(|| ParseError::SyntaxError {
177                            position: 0,
178                            message: "Kids array must contain references".to_string(),
179                        })?;
180                    
181                    // Get the kid object
182                    let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
183                    let kid_dict = kid_obj.as_dict()
184                        .ok_or_else(|| ParseError::SyntaxError {
185                            position: 0,
186                            message: "Page tree node must be a dictionary".to_string(),
187                        })?;
188                    
189                    let kid_type = kid_dict.get_type()
190                        .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
191                    
192                    let count = if kid_type == "Pages" {
193                        kid_dict.get("Count")
194                            .and_then(|obj| obj.as_integer())
195                            .ok_or_else(|| ParseError::MissingKey("Count".to_string()))? as u32
196                    } else {
197                        1
198                    };
199                    
200                    if target_index < current_idx + count {
201                        // Found the right subtree/page
202                        if kid_type == "Page" {
203                            // This is the page we want
204                            return self.create_parsed_page(kid_ref, kid_dict, Some(&merged_inherited));
205                        } else {
206                            // Recurse into this subtree
207                            return self.find_page_in_tree(
208                                kid_dict,
209                                target_index,
210                                current_idx,
211                                Some(&merged_inherited),
212                            );
213                        }
214                    }
215                    
216                    current_idx += count;
217                }
218                
219                Err(ParseError::SyntaxError {
220                    position: 0,
221                    message: "Page not found in tree".to_string(),
222                })
223            }
224            "Page" => {
225                // This is a page object
226                if target_index != current_index {
227                    return Err(ParseError::SyntaxError {
228                        position: 0,
229                        message: "Page index mismatch".to_string(),
230                    });
231                }
232                
233                // We need the reference, but we don't have it here
234                // This case shouldn't happen if we're navigating properly
235                Err(ParseError::SyntaxError {
236                    position: 0,
237                    message: "Direct page object without reference".to_string(),
238                })
239            }
240            _ => Err(ParseError::SyntaxError {
241                position: 0,
242                message: format!("Invalid page tree node type: {}", node_type),
243            }),
244        }
245    }
246    
247    /// Create a ParsedPage from a page dictionary
248    fn create_parsed_page(
249        &self,
250        obj_ref: (u32, u16),
251        page_dict: &PdfDictionary,
252        inherited: Option<&PdfDictionary>,
253    ) -> ParseResult<ParsedPage> {
254        // Extract page attributes
255        let media_box = self.get_rectangle(page_dict, inherited, "MediaBox")?
256            .ok_or_else(|| ParseError::MissingKey("MediaBox".to_string()))?;
257        
258        let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
259        
260        let rotation = self.get_integer(page_dict, inherited, "Rotate")?
261            .unwrap_or(0) as i32;
262        
263        // Get inherited resources
264        let inherited_resources = if let Some(inherited) = inherited {
265            inherited.get("Resources").and_then(|r| r.as_dict()).cloned()
266        } else {
267            None
268        };
269        
270        Ok(ParsedPage {
271            obj_ref,
272            dict: page_dict.clone(),
273            inherited_resources,
274            media_box,
275            crop_box,
276            rotation,
277        })
278    }
279    
280    /// Get a rectangle value
281    fn get_rectangle(
282        &self,
283        node: &PdfDictionary,
284        inherited: Option<&PdfDictionary>,
285        key: &str,
286    ) -> ParseResult<Option<[f64; 4]>> {
287        let array = node.get(key)
288            .or_else(|| inherited.and_then(|i| i.get(key)));
289        
290        if let Some(array) = array.and_then(|obj| obj.as_array()) {
291            if array.len() != 4 {
292                return Err(ParseError::SyntaxError {
293                    position: 0,
294                    message: format!("{} must have 4 elements", key),
295                });
296            }
297            
298            let rect = [
299                array.get(0).unwrap().as_real().unwrap_or(0.0),
300                array.get(1).unwrap().as_real().unwrap_or(0.0),
301                array.get(2).unwrap().as_real().unwrap_or(0.0),
302                array.get(3).unwrap().as_real().unwrap_or(0.0),
303            ];
304            
305            Ok(Some(rect))
306        } else {
307            Ok(None)
308        }
309    }
310    
311    /// Get an integer value
312    fn get_integer(
313        &self,
314        node: &PdfDictionary,
315        inherited: Option<&PdfDictionary>,
316        key: &str,
317    ) -> ParseResult<Option<i64>> {
318        let value = node.get(key)
319            .or_else(|| inherited.and_then(|i| i.get(key)));
320        
321        Ok(value.and_then(|obj| obj.as_integer()))
322    }
323    
324    /// Get an object by reference
325    pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
326        // Check resource cache first
327        if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
328            return Ok(obj);
329        }
330        
331        // Load from reader
332        let obj = {
333            let mut reader = self.reader.borrow_mut();
334            reader.get_object(obj_num, gen_num)?.clone()
335        };
336        
337        // Cache it
338        self.resources.cache_object((obj_num, gen_num), obj.clone());
339        
340        Ok(obj)
341    }
342    
343    /// Resolve a reference
344    pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
345        match obj {
346            PdfObject::Reference(obj_num, gen_num) => {
347                self.get_object(*obj_num, *gen_num)
348            }
349            _ => Ok(obj.clone()),
350        }
351    }
352    
353    /// Get content streams for a page
354    pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
355        let mut streams = Vec::new();
356        
357        if let Some(contents) = page.dict.get("Contents") {
358            let resolved_contents = self.resolve(contents)?;
359            
360            match &resolved_contents {
361                PdfObject::Stream(stream) => {
362                    streams.push(stream.decode()?);
363                }
364                PdfObject::Array(array) => {
365                    for item in &array.0 {
366                        let resolved = self.resolve(item)?;
367                        if let PdfObject::Stream(stream) = resolved {
368                            streams.push(stream.decode()?);
369                        }
370                    }
371                }
372                _ => return Err(ParseError::SyntaxError {
373                    position: 0,
374                    message: "Contents must be a stream or array of streams".to_string(),
375                }),
376            }
377        }
378        
379        Ok(streams)
380    }
381    
382    /// Extract text from all pages
383    pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
384        let extractor = crate::text::TextExtractor::new();
385        extractor.extract_from_document(self)
386    }
387    
388    /// Extract text from a specific page
389    pub fn extract_text_from_page(&self, page_index: u32) -> ParseResult<crate::text::ExtractedText> {
390        let extractor = crate::text::TextExtractor::new();
391        extractor.extract_from_page(self, page_index)
392    }
393    
394    /// Extract text with custom options
395    pub fn extract_text_with_options(&self, options: crate::text::ExtractionOptions) -> ParseResult<Vec<crate::text::ExtractedText>> {
396        let extractor = crate::text::TextExtractor::with_options(options);
397        extractor.extract_from_document(self)
398    }
399}