oxidize_pdf/parser/
reader.rs

1//! High-level PDF Reader API
2//!
3//! Provides a simple interface for reading PDF files
4
5use super::header::PdfHeader;
6use super::object_stream::ObjectStream;
7use super::objects::{PdfDictionary, PdfObject};
8use super::trailer::PdfTrailer;
9use super::xref::XRefTable;
10use super::{ParseError, ParseResult};
11use std::collections::HashMap;
12use std::fs::File;
13use std::io::{BufReader, Read, Seek};
14use std::path::Path;
15
/// High-level PDF reader
///
/// Bundles the buffered input with the header, cross-reference table and
/// trailer parsed by `PdfReader::new`, plus caches for objects that have
/// already been loaded through `get_object`.
pub struct PdfReader<R: Read + Seek> {
    /// Buffered handle over the underlying PDF data
    reader: BufReader<R>,
    /// Parsed PDF header (source of the value returned by `version`)
    header: PdfHeader,
    /// Cross-reference table used to locate objects by object number
    xref: XRefTable,
    /// Trailer built from the xref table's trailer dictionary; supplies the
    /// root (catalog) and info references
    trailer: PdfTrailer,
    /// Cache of loaded objects, keyed by (object number, generation number)
    object_cache: HashMap<(u32, u16), PdfObject>,
    /// Cache of parsed object streams, keyed by the stream's object number
    object_stream_cache: HashMap<u32, ObjectStream>,
    /// Page tree navigator; `None` until `ensure_page_tree` creates it
    page_tree: Option<super::page_tree::PageTree>,
}
29
30impl PdfReader<File> {
31    /// Open a PDF file from a path
32    pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
33        let file = File::open(path)?;
34        Self::new(file)
35    }
36
37    /// Open a PDF file as a PdfDocument
38    pub fn open_document<P: AsRef<Path>>(
39        path: P,
40    ) -> ParseResult<super::document::PdfDocument<File>> {
41        let reader = Self::open(path)?;
42        Ok(reader.into_document())
43    }
44}
45
46impl<R: Read + Seek> PdfReader<R> {
47    /// Create a new PDF reader from a reader
48    pub fn new(reader: R) -> ParseResult<Self> {
49        let mut buf_reader = BufReader::new(reader);
50
51        // Parse header
52        let header = PdfHeader::parse(&mut buf_reader)?;
53        // Parse xref table
54        let xref = XRefTable::parse(&mut buf_reader)?;
55
56        // Get trailer
57        let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
58
59        let xref_offset = 0; // TODO: Get actual offset
60        let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
61
62        // Validate trailer
63        trailer.validate()?;
64
65        Ok(Self {
66            reader: buf_reader,
67            header,
68            xref,
69            trailer,
70            object_cache: HashMap::new(),
71            object_stream_cache: HashMap::new(),
72            page_tree: None,
73        })
74    }
75
76    /// Get the PDF version
77    pub fn version(&self) -> &super::header::PdfVersion {
78        &self.header.version
79    }
80
81    /// Get the document catalog
82    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
83        let (obj_num, gen_num) = self.trailer.root()?;
84        let catalog = self.get_object(obj_num, gen_num)?;
85
86        catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
87            position: 0,
88            message: "Catalog is not a dictionary".to_string(),
89        })
90    }
91
92    /// Get the document info dictionary
93    pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
94        match self.trailer.info() {
95            Some((obj_num, gen_num)) => {
96                let info = self.get_object(obj_num, gen_num)?;
97                Ok(info.as_dict())
98            }
99            None => Ok(None),
100        }
101    }
102
103    /// Get an object by reference
104    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
105        let key = (obj_num, gen_num);
106
107        // Check cache first
108        if self.object_cache.contains_key(&key) {
109            return Ok(&self.object_cache[&key]);
110        }
111
112        // Check if this is a compressed object
113        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
114            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
115                // This is a compressed object - need to extract from object stream
116                return self.get_compressed_object(
117                    obj_num,
118                    gen_num,
119                    stream_obj_num,
120                    index_in_stream,
121                );
122            }
123        }
124
125        // Get xref entry
126        let entry = self
127            .xref
128            .get_entry(obj_num)
129            .ok_or(ParseError::InvalidReference(obj_num, gen_num))?;
130
131        if !entry.in_use {
132            // Free object
133            self.object_cache.insert(key, PdfObject::Null);
134            return Ok(&self.object_cache[&key]);
135        }
136
137        if entry.generation != gen_num {
138            return Err(ParseError::InvalidReference(obj_num, gen_num));
139        }
140
141        // Seek to object position
142        self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;
143
144        // Parse object header (obj_num gen_num obj)
145        let mut lexer = super::lexer::Lexer::new(&mut self.reader);
146
147        // Read object number
148        let token = lexer.next_token()?;
149        let read_obj_num = match token {
150            super::lexer::Token::Integer(n) => n as u32,
151            _ => {
152                return Err(ParseError::SyntaxError {
153                    position: entry.offset as usize,
154                    message: "Expected object number".to_string(),
155                })
156            }
157        };
158
159        if read_obj_num != obj_num {
160            return Err(ParseError::SyntaxError {
161                position: entry.offset as usize,
162                message: format!(
163                    "Object number mismatch: expected {obj_num}, found {read_obj_num}"
164                ),
165            });
166        }
167
168        // Read generation number
169        let token = lexer.next_token()?;
170        let read_gen_num = match token {
171            super::lexer::Token::Integer(n) => n as u16,
172            _ => {
173                return Err(ParseError::SyntaxError {
174                    position: entry.offset as usize,
175                    message: "Expected generation number".to_string(),
176                })
177            }
178        };
179
180        if read_gen_num != gen_num {
181            return Err(ParseError::SyntaxError {
182                position: entry.offset as usize,
183                message: format!(
184                    "Generation number mismatch: expected {gen_num}, found {read_gen_num}"
185                ),
186            });
187        }
188
189        // Read 'obj' keyword
190        let token = lexer.next_token()?;
191        match token {
192            super::lexer::Token::Obj => {}
193            _ => {
194                return Err(ParseError::SyntaxError {
195                    position: entry.offset as usize,
196                    message: "Expected 'obj' keyword".to_string(),
197                })
198            }
199        };
200
201        // Parse the actual object
202        let obj = PdfObject::parse(&mut lexer)?;
203
204        // Read 'endobj' keyword
205        let token = lexer.next_token()?;
206        match token {
207            super::lexer::Token::EndObj => {}
208            _ => {
209                return Err(ParseError::SyntaxError {
210                    position: entry.offset as usize,
211                    message: "Expected 'endobj' keyword".to_string(),
212                })
213            }
214        };
215
216        // Cache the object
217        self.object_cache.insert(key, obj);
218        Ok(&self.object_cache[&key])
219    }
220
221    /// Resolve a reference to get the actual object
222    pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
223        match obj {
224            PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
225            _ => Ok(obj),
226        }
227    }
228
229    /// Get a compressed object from an object stream
230    fn get_compressed_object(
231        &mut self,
232        obj_num: u32,
233        gen_num: u16,
234        stream_obj_num: u32,
235        _index_in_stream: u32,
236    ) -> ParseResult<&PdfObject> {
237        let key = (obj_num, gen_num);
238
239        // Load the object stream if not cached
240        if !self.object_stream_cache.contains_key(&stream_obj_num) {
241            // Get the stream object
242            let stream_obj = self.get_object(stream_obj_num, 0)?;
243
244            if let Some(stream) = stream_obj.as_stream() {
245                // Parse the object stream
246                let obj_stream = ObjectStream::parse(stream.clone())?;
247                self.object_stream_cache.insert(stream_obj_num, obj_stream);
248            } else {
249                return Err(ParseError::SyntaxError {
250                    position: 0,
251                    message: format!("Object {stream_obj_num} is not a stream"),
252                });
253            }
254        }
255
256        // Get the object from the stream
257        let obj_stream = &self.object_stream_cache[&stream_obj_num];
258        let obj = obj_stream
259            .get_object(obj_num)
260            .ok_or_else(|| ParseError::SyntaxError {
261                position: 0,
262                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
263            })?;
264
265        // Cache the object
266        self.object_cache.insert(key, obj.clone());
267        Ok(&self.object_cache[&key])
268    }
269
270    /// Get the page tree root
271    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
272        // Get the pages reference from catalog first
273        let (pages_obj_num, pages_gen_num) = {
274            let catalog = self.catalog()?;
275            let pages_ref = catalog
276                .get("Pages")
277                .ok_or_else(|| ParseError::MissingKey("Pages".to_string()))?;
278
279            match pages_ref {
280                PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
281                _ => {
282                    return Err(ParseError::SyntaxError {
283                        position: 0,
284                        message: "Pages must be a reference".to_string(),
285                    })
286                }
287            }
288        };
289
290        // Now we can get the pages object without holding a reference to catalog
291        let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
292        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
293            position: 0,
294            message: "Pages is not a dictionary".to_string(),
295        })
296    }
297
298    /// Get the number of pages
299    pub fn page_count(&mut self) -> ParseResult<u32> {
300        let pages = self.pages()?;
301        pages
302            .get("Count")
303            .and_then(|obj| obj.as_integer())
304            .map(|count| count as u32)
305            .ok_or_else(|| ParseError::MissingKey("Count".to_string()))
306    }
307
308    /// Get metadata from the document
309    pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
310        let mut metadata = DocumentMetadata::default();
311
312        if let Some(info_dict) = self.info()? {
313            if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
314                metadata.title = title.as_str().ok().map(|s| s.to_string());
315            }
316            if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
317                metadata.author = author.as_str().ok().map(|s| s.to_string());
318            }
319            if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
320                metadata.subject = subject.as_str().ok().map(|s| s.to_string());
321            }
322            if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
323                metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
324            }
325            if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
326                metadata.creator = creator.as_str().ok().map(|s| s.to_string());
327            }
328            if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
329                metadata.producer = producer.as_str().ok().map(|s| s.to_string());
330            }
331        }
332
333        metadata.version = self.version().to_string();
334        metadata.page_count = self.page_count().ok();
335
336        Ok(metadata)
337    }
338
339    /// Initialize the page tree navigator if not already done
340    fn ensure_page_tree(&mut self) -> ParseResult<()> {
341        if self.page_tree.is_none() {
342            let page_count = self.page_count()?;
343            self.page_tree = Some(super::page_tree::PageTree::new(page_count));
344        }
345        Ok(())
346    }
347
348    /// Get a specific page by index (0-based)
349    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
350        self.ensure_page_tree()?;
351
352        // TODO: Fix borrow checker issues with page_tree
353        // The page_tree needs mutable access to both itself and the reader
354        // This requires a redesign of the architecture
355        Err(ParseError::SyntaxError {
356            position: 0,
357            message: "get_page not implemented due to borrow checker constraints".to_string(),
358        })
359    }
360
361    /// Get all pages
362    pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
363        let page_count = self.page_count()?;
364        let mut pages = Vec::with_capacity(page_count as usize);
365
366        for i in 0..page_count {
367            let page = self.get_page(i)?.clone();
368            pages.push(page);
369        }
370
371        Ok(pages)
372    }
373
374    /// Convert this reader into a PdfDocument for easier page access
375    pub fn into_document(self) -> super::document::PdfDocument<R> {
376        super::document::PdfDocument::new(self)
377    }
378}
379
/// Document metadata
///
/// Every optional field is `None` when the value is not available.
/// The `Default` value has all optional fields unset and an empty `version`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct DocumentMetadata {
    /// Document title
    pub title: Option<String>,
    /// Document author
    pub author: Option<String>,
    /// Document subject
    pub subject: Option<String>,
    /// Keywords associated with the document
    pub keywords: Option<String>,
    /// Creator entry of the document
    pub creator: Option<String>,
    /// Producer entry of the document
    pub producer: Option<String>,
    /// Creation date, kept as a string
    pub creation_date: Option<String>,
    /// Modification date, kept as a string
    pub modification_date: Option<String>,
    /// PDF version rendered as a string
    pub version: String,
    /// Number of pages, when it could be determined
    pub page_count: Option<u32>,
}
394
#[cfg(test)]
mod tests {

    /// Placeholder test: it only defines a small PDF fixture and never
    /// constructs a reader from it.
    #[test]
    fn test_reader_construction() {
        // This is a minimal valid PDF for testing
        // NOTE(review): `startxref` below says 116, but counting bytes with LF
        // line endings puts the `xref` keyword at offset 110 — verify the
        // fixture before using it in a real assertion.
        let _pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [] /Count 0 >>
endobj
xref
0 3
0000000000 65535 f 
0000000009 00000 n 
0000000058 00000 n 
trailer
<< /Size 3 /Root 1 0 R >>
startxref
116
%%EOF";

        // The reader is not exercised here yet.
        // NOTE(review): `std::io::Cursor` over a byte slice already implements
        // `Read + Seek`, so the fixture could be fed to the reader through a
        // `Cursor` — TODO turn this into a real test once the fixture is verified.
    }
}