1use super::header::PdfHeader;
6use super::object_stream::ObjectStream;
7use super::objects::{PdfDictionary, PdfObject};
8use super::trailer::PdfTrailer;
9use super::xref::XRefTable;
10use super::{ParseError, ParseResult};
11use std::collections::HashMap;
12use std::fs::File;
13use std::io::{BufReader, Read, Seek};
14use std::path::Path;
15
/// Reader that parses a PDF's header, cross-reference data and trailer up front and then
/// loads indirect objects on demand, caching each one after its first lookup.
pub struct PdfReader<R: Read + Seek> {
    /// Buffered handle on the underlying PDF byte source; object lookups seek within it.
    reader: BufReader<R>,
    /// Header parsed from the start of the input (exposes the PDF version).
    header: PdfHeader,
    /// Cross-reference data used to locate objects by object number.
    xref: XRefTable,
    /// Trailer built from the xref's trailer dictionary; supplies the root and info references.
    trailer: PdfTrailer,
    /// Objects already loaded, keyed by `(object number, generation number)`.
    object_cache: HashMap<(u32, u16), PdfObject>,
    /// Parsed object streams, keyed by the object number of the stream that contains them.
    object_stream_cache: HashMap<u32, ObjectStream>,
    /// Page tree, `None` until it is first needed and then created from the page count.
    page_tree: Option<super::page_tree::PageTree>,
}
29
30impl PdfReader<File> {
31 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
33 let file = File::open(path)?;
34 Self::new(file)
35 }
36
37 pub fn open_document<P: AsRef<Path>>(
39 path: P,
40 ) -> ParseResult<super::document::PdfDocument<File>> {
41 let reader = Self::open(path)?;
42 Ok(reader.into_document())
43 }
44}
45
46impl<R: Read + Seek> PdfReader<R> {
47 pub fn new(reader: R) -> ParseResult<Self> {
49 let mut buf_reader = BufReader::new(reader);
50
51 let header = PdfHeader::parse(&mut buf_reader)?;
53 let xref = XRefTable::parse(&mut buf_reader)?;
55
56 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
58
59 let xref_offset = xref.xref_offset();
60 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
61
62 trailer.validate()?;
64
65 Ok(Self {
66 reader: buf_reader,
67 header,
68 xref,
69 trailer,
70 object_cache: HashMap::new(),
71 object_stream_cache: HashMap::new(),
72 page_tree: None,
73 })
74 }
75
76 pub fn version(&self) -> &super::header::PdfVersion {
78 &self.header.version
79 }
80
81 pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
83 let (obj_num, gen_num) = self.trailer.root()?;
84 let catalog = self.get_object(obj_num, gen_num)?;
85
86 catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
87 position: 0,
88 message: "Catalog is not a dictionary".to_string(),
89 })
90 }
91
92 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
94 match self.trailer.info() {
95 Some((obj_num, gen_num)) => {
96 let info = self.get_object(obj_num, gen_num)?;
97 Ok(info.as_dict())
98 }
99 None => Ok(None),
100 }
101 }
102
103 pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
105 let key = (obj_num, gen_num);
106
107 if self.object_cache.contains_key(&key) {
109 return Ok(&self.object_cache[&key]);
110 }
111
112 if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
114 if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
115 return self.get_compressed_object(
117 obj_num,
118 gen_num,
119 stream_obj_num,
120 index_in_stream,
121 );
122 }
123 }
124
125 let entry = self
127 .xref
128 .get_entry(obj_num)
129 .ok_or(ParseError::InvalidReference(obj_num, gen_num))?;
130
131 if !entry.in_use {
132 self.object_cache.insert(key, PdfObject::Null);
134 return Ok(&self.object_cache[&key]);
135 }
136
137 if entry.generation != gen_num {
138 return Err(ParseError::InvalidReference(obj_num, gen_num));
139 }
140
141 self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;
143
144 let mut lexer = super::lexer::Lexer::new(&mut self.reader);
146
147 let token = lexer.next_token()?;
149 let read_obj_num = match token {
150 super::lexer::Token::Integer(n) => n as u32,
151 _ => {
152 return Err(ParseError::SyntaxError {
153 position: entry.offset as usize,
154 message: "Expected object number".to_string(),
155 })
156 }
157 };
158
159 if read_obj_num != obj_num {
160 return Err(ParseError::SyntaxError {
161 position: entry.offset as usize,
162 message: format!(
163 "Object number mismatch: expected {obj_num}, found {read_obj_num}"
164 ),
165 });
166 }
167
168 let token = lexer.next_token()?;
170 let read_gen_num = match token {
171 super::lexer::Token::Integer(n) => n as u16,
172 _ => {
173 return Err(ParseError::SyntaxError {
174 position: entry.offset as usize,
175 message: "Expected generation number".to_string(),
176 })
177 }
178 };
179
180 if read_gen_num != gen_num {
181 return Err(ParseError::SyntaxError {
182 position: entry.offset as usize,
183 message: format!(
184 "Generation number mismatch: expected {gen_num}, found {read_gen_num}"
185 ),
186 });
187 }
188
189 let token = lexer.next_token()?;
191 match token {
192 super::lexer::Token::Obj => {}
193 _ => {
194 return Err(ParseError::SyntaxError {
195 position: entry.offset as usize,
196 message: "Expected 'obj' keyword".to_string(),
197 })
198 }
199 };
200
201 let obj = PdfObject::parse(&mut lexer)?;
203
204 let token = lexer.next_token()?;
206 match token {
207 super::lexer::Token::EndObj => {}
208 _ => {
209 return Err(ParseError::SyntaxError {
210 position: entry.offset as usize,
211 message: "Expected 'endobj' keyword".to_string(),
212 })
213 }
214 };
215
216 self.object_cache.insert(key, obj);
218 Ok(&self.object_cache[&key])
219 }
220
221 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
223 match obj {
224 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
225 _ => Ok(obj),
226 }
227 }
228
229 fn get_compressed_object(
231 &mut self,
232 obj_num: u32,
233 gen_num: u16,
234 stream_obj_num: u32,
235 _index_in_stream: u32,
236 ) -> ParseResult<&PdfObject> {
237 let key = (obj_num, gen_num);
238
239 if !self.object_stream_cache.contains_key(&stream_obj_num) {
241 let stream_obj = self.get_object(stream_obj_num, 0)?;
243
244 if let Some(stream) = stream_obj.as_stream() {
245 let obj_stream = ObjectStream::parse(stream.clone())?;
247 self.object_stream_cache.insert(stream_obj_num, obj_stream);
248 } else {
249 return Err(ParseError::SyntaxError {
250 position: 0,
251 message: format!("Object {stream_obj_num} is not a stream"),
252 });
253 }
254 }
255
256 let obj_stream = &self.object_stream_cache[&stream_obj_num];
258 let obj = obj_stream
259 .get_object(obj_num)
260 .ok_or_else(|| ParseError::SyntaxError {
261 position: 0,
262 message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
263 })?;
264
265 self.object_cache.insert(key, obj.clone());
267 Ok(&self.object_cache[&key])
268 }
269
270 pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
272 let (pages_obj_num, pages_gen_num) = {
274 let catalog = self.catalog()?;
275 let pages_ref = catalog
276 .get("Pages")
277 .ok_or_else(|| ParseError::MissingKey("Pages".to_string()))?;
278
279 match pages_ref {
280 PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
281 _ => {
282 return Err(ParseError::SyntaxError {
283 position: 0,
284 message: "Pages must be a reference".to_string(),
285 })
286 }
287 }
288 };
289
290 let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
292 pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
293 position: 0,
294 message: "Pages is not a dictionary".to_string(),
295 })
296 }
297
298 pub fn page_count(&mut self) -> ParseResult<u32> {
300 let pages = self.pages()?;
301 pages
302 .get("Count")
303 .and_then(|obj| obj.as_integer())
304 .map(|count| count as u32)
305 .ok_or_else(|| ParseError::MissingKey("Count".to_string()))
306 }
307
308 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
310 let mut metadata = DocumentMetadata::default();
311
312 if let Some(info_dict) = self.info()? {
313 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
314 metadata.title = title.as_str().ok().map(|s| s.to_string());
315 }
316 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
317 metadata.author = author.as_str().ok().map(|s| s.to_string());
318 }
319 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
320 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
321 }
322 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
323 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
324 }
325 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
326 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
327 }
328 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
329 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
330 }
331 }
332
333 metadata.version = self.version().to_string();
334 metadata.page_count = self.page_count().ok();
335
336 Ok(metadata)
337 }
338
339 fn ensure_page_tree(&mut self) -> ParseResult<()> {
341 if self.page_tree.is_none() {
342 let page_count = self.page_count()?;
343 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
344 }
345 Ok(())
346 }
347
348 pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
354 self.ensure_page_tree()?;
355
356 Err(ParseError::SyntaxError {
360 position: 0,
361 message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
362 })
363 }
364
365 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
367 let page_count = self.page_count()?;
368 let mut pages = Vec::with_capacity(page_count as usize);
369
370 for i in 0..page_count {
371 let page = self.get_page(i)?.clone();
372 pages.push(page);
373 }
374
375 Ok(pages)
376 }
377
378 pub fn into_document(self) -> super::document::PdfDocument<R> {
380 super::document::PdfDocument::new(self)
381 }
382}
383
/// Summary of a document's descriptive metadata.
///
/// Every field is optional except `version`; `Default` yields an empty version string and
/// `None` everywhere else.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct DocumentMetadata {
    /// Document title.
    pub title: Option<String>,
    /// Document author.
    pub author: Option<String>,
    /// Document subject.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// Name of the application that created the original document.
    pub creator: Option<String>,
    /// Name of the application that produced the PDF.
    pub producer: Option<String>,
    /// Creation date as a raw string (e.g. `D:20240101`).
    pub creation_date: Option<String>,
    /// Last-modification date as a raw string (e.g. `D:20240102`).
    pub modification_date: Option<String>,
    /// PDF version rendered as text (e.g. `1.4`).
    pub version: String,
    /// Number of pages, when it could be determined.
    pub page_count: Option<u32>,
}
398
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::objects::{PdfName, PdfString};
    use crate::parser::test_helpers::*;
    use std::io::Cursor;

    /// Builds a reader over in-memory PDF bytes, panicking when they are not accepted.
    fn reader_from<T: AsRef<[u8]>>(data: T) -> PdfReader<Cursor<T>> {
        PdfReader::new(Cursor::new(data)).unwrap()
    }

    /// Returns `true` when `PdfReader::new` refuses the given bytes.
    fn is_rejected<T: AsRef<[u8]>>(data: T) -> bool {
        PdfReader::new(Cursor::new(data)).is_err()
    }

    #[test]
    fn test_reader_construction() {
        let result = PdfReader::new(Cursor::new(create_minimal_pdf()));
        assert!(result.is_ok());
    }

    #[test]
    fn test_reader_version() {
        let reader = reader_from(create_minimal_pdf());
        assert_eq!(reader.version().major, 1);
        assert_eq!(reader.version().minor, 4);
    }

    #[test]
    fn test_reader_different_versions() {
        let versions = ["1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0"];

        for version in versions {
            let reader = reader_from(create_pdf_with_version(version));

            let parts: Vec<&str> = version.split('.').collect();
            assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
            assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
        }
    }

    #[test]
    fn test_reader_catalog() {
        let mut reader = reader_from(create_minimal_pdf());

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    #[test]
    fn test_reader_info_none() {
        let mut reader = reader_from(create_minimal_pdf());

        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    #[test]
    fn test_reader_info_present() {
        let mut reader = reader_from(create_pdf_with_info());

        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }

    #[test]
    fn test_reader_get_object() {
        let mut reader = reader_from(create_minimal_pdf());

        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());

        let catalog = obj.unwrap();
        assert!(catalog.as_dict().is_some());
    }

    #[test]
    fn test_reader_get_invalid_object() {
        let mut reader = reader_from(create_minimal_pdf());

        let obj = reader.get_object(999, 0);
        assert!(obj.is_err());
    }

    #[test]
    fn test_reader_get_free_object() {
        let mut reader = reader_from(create_minimal_pdf());

        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }

    #[test]
    fn test_reader_resolve_reference() {
        let mut reader = reader_from(create_minimal_pdf());

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    #[test]
    fn test_reader_resolve_non_reference() {
        let mut reader = reader_from(create_minimal_pdf());

        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    #[test]
    fn test_reader_cache_behavior() {
        let mut reader = reader_from(create_minimal_pdf());
        assert!(reader.object_cache.is_empty());

        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        // The first lookup must have populated the cache under the requested key.
        assert!(reader.object_cache.contains_key(&(1, 0)));
        let cached_objects = reader.object_cache.len();

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());

        // The second lookup is served from the cache and adds nothing to it.
        assert_eq!(reader.object_cache.len(), cached_objects);
    }

    #[test]
    fn test_reader_wrong_generation() {
        let mut reader = reader_from(create_minimal_pdf());

        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }

    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        assert!(is_rejected(invalid_data.to_vec()));
    }

    #[test]
    fn test_reader_corrupt_xref() {
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF".to_vec();

        assert!(is_rejected(corrupt_pdf));
    }

    #[test]
    fn test_reader_missing_trailer() {
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF".to_vec();

        assert!(is_rejected(pdf_no_trailer));
    }

    #[test]
    fn test_reader_empty_pdf() {
        assert!(is_rejected(Vec::new()));
    }

    #[test]
    fn test_reader_page_count() {
        let mut reader = reader_from(create_minimal_pdf());

        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }

    #[test]
    fn test_reader_into_document() {
        let reader = reader_from(create_minimal_pdf());

        let document = reader.into_document();
        let page_count = document.page_count();
        assert!(page_count.is_ok());
    }

    #[test]
    fn test_reader_pages_dict() {
        let mut reader = reader_from(create_minimal_pdf());

        let pages = reader.pages();
        assert!(pages.is_ok());
        let pages_dict = pages.unwrap();
        assert_eq!(
            pages_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Pages".to_string())))
        );
    }

    #[test]
    fn test_reader_pdf_with_binary_data() {
        let result = PdfReader::new(Cursor::new(create_pdf_with_binary_marker()));
        assert!(result.is_ok());
    }

    #[test]
    fn test_reader_metadata() {
        let mut reader = reader_from(create_pdf_with_info());

        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_metadata_empty() {
        let mut reader = reader_from(create_minimal_pdf());

        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    #[test]
    fn test_reader_object_number_mismatch() {
        // NOTE(review): despite its name, this exercises a generation mismatch and an
        // unknown object number; no fixture here has an xref offset that points at an
        // object carrying a different number.
        let mut reader = reader_from(create_minimal_pdf());

        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }

    #[test]
    fn test_document_metadata_struct() {
        let metadata = DocumentMetadata {
            title: Some("Title".to_string()),
            author: Some("Author".to_string()),
            subject: Some("Subject".to_string()),
            keywords: Some("Keywords".to_string()),
            creator: Some("Creator".to_string()),
            producer: Some("Producer".to_string()),
            creation_date: Some("D:20240101".to_string()),
            modification_date: Some("D:20240102".to_string()),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    #[test]
    fn test_document_metadata_default() {
        let metadata = DocumentMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.subject.is_none());
        assert!(metadata.keywords.is_none());
        assert!(metadata.creator.is_none());
        assert!(metadata.producer.is_none());
        assert!(metadata.creation_date.is_none());
        assert!(metadata.modification_date.is_none());
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }

    #[test]
    fn test_document_metadata_clone() {
        let metadata = DocumentMetadata {
            title: Some("Test".to_string()),
            version: "1.4".to_string(),
            ..Default::default()
        };

        let cloned = metadata.clone();
        assert_eq!(cloned.title, Some("Test".to_string()));
        assert_eq!(cloned.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_trailer_validation_error() {
        // The trailer below has no /Root entry, so construction must fail.
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF".to_vec();

        assert!(is_rejected(bad_pdf));
    }
}