use super::header::PdfHeader;
use super::object_stream::ObjectStream;
use super::objects::{PdfDictionary, PdfObject};
use super::trailer::PdfTrailer;
use super::xref::XRefTable;
use super::{ParseError, ParseResult};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read, Seek};
use std::path::Path;

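/// Low-level reader for a PDF file: parses the header, cross-reference table,
/// and trailer up front, then resolves indirect objects on demand, caching both
/// plain objects and the object streams that hold compressed objects.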
pub struct PdfReader<R: Read + Seek> {
    reader: BufReader<R>,
    header: PdfHeader,
    xref: XRefTable,
    trailer: PdfTrailer,
    object_cache: HashMap<(u32, u16), PdfObject>,
    object_stream_cache: HashMap<u32, ObjectStream>,
    page_tree: Option<super::page_tree::PageTree>,
}

impl PdfReader<File> {
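    /// Opens a PDF file from disk and builds a reader over a buffered `File`.
    ///
    /// Illustrative usage (the path is a placeholder; not compiled as a doc-test):
    ///
    /// ```ignore
    /// let mut reader = PdfReader::open("example.pdf")?;
    /// println!("PDF version: {}", reader.version());
    /// ```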
    pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
        let file = File::open(path)?;
        Self::new(file)
    }

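    /// Convenience wrapper: opens the file and immediately converts the reader
    /// into a higher-level `PdfDocument`.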
    pub fn open_document<P: AsRef<Path>>(
        path: P,
    ) -> ParseResult<super::document::PdfDocument<File>> {
        let reader = Self::open(path)?;
        Ok(reader.into_document())
    }
}

impl<R: Read + Seek> PdfReader<R> {
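    /// Builds a reader over any `Read + Seek` source (a `File`, an in-memory
    /// `Cursor`, etc.), parsing the header, xref table, and trailer eagerly and
    /// validating the trailer before returning.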
    pub fn new(reader: R) -> ParseResult<Self> {
        let mut buf_reader = BufReader::new(reader);

        let header = PdfHeader::parse(&mut buf_reader)?;
        let xref = XRefTable::parse(&mut buf_reader)?;

        let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();

        // The actual startxref offset is not threaded through here; 0 is used
        // as a stand-in when reconstructing the trailer.
        let xref_offset = 0;
        let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;

        trailer.validate()?;

        Ok(Self {
            reader: buf_reader,
            header,
            xref,
            trailer,
            object_cache: HashMap::new(),
            object_stream_cache: HashMap::new(),
            page_tree: None,
        })
    }

    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }

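    /// Returns the document catalog (the dictionary referenced by the trailer's
    /// `/Root` entry), resolving and caching it if necessary.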
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = self.trailer.root()?;
        let catalog = self.get_object(obj_num, gen_num)?;

        catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Catalog is not a dictionary".to_string(),
        })
    }

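    /// Returns the document information dictionary (`/Info`), or `None` if the
    /// trailer does not reference one.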
    pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
        match self.trailer.info() {
            Some((obj_num, gen_num)) => {
                let info = self.get_object(obj_num, gen_num)?;
                Ok(info.as_dict())
            }
            None => Ok(None),
        }
    }

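    /// Resolves an indirect object by object and generation number.
    ///
    /// The object cache is consulted first; objects stored in object streams are
    /// delegated to the compressed-object path. Otherwise the reader seeks to the
    /// xref offset and parses `obj_num gen_num obj ... endobj`, verifying that
    /// the numbers found on disk match the requested ones.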
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        }

        let entry = self
            .xref
            .get_entry(obj_num)
            .ok_or(ParseError::InvalidReference(obj_num, gen_num))?;

        if !entry.in_use {
            self.object_cache.insert(key, PdfObject::Null);
            return Ok(&self.object_cache[&key]);
        }

        if entry.generation != gen_num {
            return Err(ParseError::InvalidReference(obj_num, gen_num));
        }

        self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;

        let mut lexer = super::lexer::Lexer::new(&mut self.reader);

        let token = lexer.next_token()?;
        let read_obj_num = match token {
            super::lexer::Token::Integer(n) => n as u32,
            _ => {
                return Err(ParseError::SyntaxError {
                    position: entry.offset as usize,
                    message: "Expected object number".to_string(),
                })
            }
        };

        if read_obj_num != obj_num {
            return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: format!(
                    "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                ),
            });
        }

        let token = lexer.next_token()?;
        let read_gen_num = match token {
            super::lexer::Token::Integer(n) => n as u16,
            _ => {
                return Err(ParseError::SyntaxError {
                    position: entry.offset as usize,
                    message: "Expected generation number".to_string(),
                })
            }
        };

        if read_gen_num != gen_num {
            return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: format!(
                    "Generation number mismatch: expected {gen_num}, found {read_gen_num}"
                ),
            });
        }

        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::Obj => {}
            _ => {
                return Err(ParseError::SyntaxError {
                    position: entry.offset as usize,
                    message: "Expected 'obj' keyword".to_string(),
                })
            }
        };

        let obj = PdfObject::parse(&mut lexer)?;

        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                return Err(ParseError::SyntaxError {
                    position: entry.offset as usize,
                    message: "Expected 'endobj' keyword".to_string(),
                })
            }
        };

        self.object_cache.insert(key, obj);
        Ok(&self.object_cache[&key])
    }

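    /// Follows a `Reference` to its target object; non-reference objects are
    /// returned unchanged.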
    pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
        match obj {
            PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
            _ => Ok(obj),
        }
    }

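    /// Loads an object that lives inside an object stream (`/Type /ObjStm`),
    /// parsing and caching the containing stream on first access.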
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone())?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        self.object_cache.insert(key, obj.clone());
        Ok(&self.object_cache[&key])
    }

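    /// Returns the root `/Pages` node of the page tree, following the reference
    /// found in the catalog.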
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;
            let pages_ref = catalog
                .get("Pages")
                .ok_or_else(|| ParseError::MissingKey("Pages".to_string()))?;

            match pages_ref {
                PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                _ => {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages must be a reference".to_string(),
                    })
                }
            }
        };

        let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages is not a dictionary".to_string(),
        })
    }

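    /// Returns the total number of pages as reported by the page tree root's
    /// `/Count` entry.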
    pub fn page_count(&mut self) -> ParseResult<u32> {
        let pages = self.pages()?;
        pages
            .get("Count")
            .and_then(|obj| obj.as_integer())
            .map(|count| count as u32)
            .ok_or_else(|| ParseError::MissingKey("Count".to_string()))
    }

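    /// Collects document metadata from the `/Info` dictionary (title, author,
    /// subject, keywords, creator, producer) together with the PDF version and
    /// page count.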
    pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
        let mut metadata = DocumentMetadata::default();

        if let Some(info_dict) = self.info()? {
            if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
                metadata.title = title.as_str().ok().map(|s| s.to_string());
            }
            if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
                metadata.author = author.as_str().ok().map(|s| s.to_string());
            }
            if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
                metadata.subject = subject.as_str().ok().map(|s| s.to_string());
            }
            if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
                metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
            }
            if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
                metadata.creator = creator.as_str().ok().map(|s| s.to_string());
            }
            if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
                metadata.producer = producer.as_str().ok().map(|s| s.to_string());
            }
        }

        metadata.version = self.version().to_string();
        metadata.page_count = self.page_count().ok();

        Ok(metadata)
    }

    fn ensure_page_tree(&mut self) -> ParseResult<()> {
        if self.page_tree.is_none() {
            let page_count = self.page_count()?;
            self.page_tree = Some(super::page_tree::PageTree::new(page_count));
        }
        Ok(())
    }

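    /// Returns a single parsed page by zero-based index.
    ///
    /// Currently a stub: the page tree is initialized, but the lookup itself is
    /// not implemented yet (see the error message below).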
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints".to_string(),
        })
    }

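    /// Collects every page in document order by calling `get_page` for each
    /// index, so it currently inherits that method's unimplemented status.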
    pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
        let page_count = self.page_count()?;
        let mut pages = Vec::with_capacity(page_count as usize);

        for i in 0..page_count {
            let page = self.get_page(i)?.clone();
            pages.push(page);
        }

        Ok(pages)
    }

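    /// Consumes the reader and wraps it in the higher-level `PdfDocument` API.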
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
}

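/// Metadata extracted from a PDF document. All `/Info` fields are optional; the
/// date fields are declared but not yet populated by `metadata()`.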
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    pub title: Option<String>,
    pub author: Option<String>,
    pub subject: Option<String>,
    pub keywords: Option<String>,
    pub creator: Option<String>,
    pub producer: Option<String>,
    pub creation_date: Option<String>,
    pub modification_date: Option<String>,
    pub version: String,
    pub page_count: Option<u32>,
}

#[cfg(test)]
mod tests {

    #[test]
    fn test_reader_construction() {
        // Fixture kept for reference only; it is not parsed here, and the
        // hand-written xref offsets are not guaranteed to match the bytes above.
        let _pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [] /Count 0 >>
endobj
xref
0 3
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
trailer
<< /Size 3 /Root 1 0 R >>
startxref
116
%%EOF";
    }
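
    // A minimal extra check (a sketch, assuming the parser reports failure via
    // `ParseResult` rather than panicking on malformed input): `PdfReader::new`
    // accepts any `Read + Seek` source, so an in-memory `Cursor` works too.
    #[test]
    fn test_reader_rejects_non_pdf_input() {
        let not_a_pdf = b"this is not a PDF";
        let result = super::PdfReader::new(std::io::Cursor::new(&not_a_pdf[..]));
        assert!(result.is_err());
    }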
}