1use super::header::PdfHeader;
6use super::object_stream::ObjectStream;
7use super::objects::{PdfDictionary, PdfObject};
8use super::trailer::PdfTrailer;
9use super::xref::XRefTable;
10use super::{ParseError, ParseResult};
11use std::collections::HashMap;
12use std::fs::File;
13use std::io::{BufReader, Read, Seek};
14use std::path::Path;
15
16pub struct PdfReader<R: Read + Seek> {
18 reader: BufReader<R>,
19 header: PdfHeader,
20 xref: XRefTable,
21 trailer: PdfTrailer,
22 object_cache: HashMap<(u32, u16), PdfObject>,
24 object_stream_cache: HashMap<u32, ObjectStream>,
26 page_tree: Option<super::page_tree::PageTree>,
28}
29
30impl PdfReader<File> {
31 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
33 let file = File::open(path)?;
34 Self::new(file)
35 }
36
37 pub fn open_document<P: AsRef<Path>>(
39 path: P,
40 ) -> ParseResult<super::document::PdfDocument<File>> {
41 let reader = Self::open(path)?;
42 Ok(reader.into_document())
43 }
44}
45
46impl<R: Read + Seek> PdfReader<R> {
47 pub fn new(reader: R) -> ParseResult<Self> {
49 let mut buf_reader = BufReader::new(reader);
50
51 let header = PdfHeader::parse(&mut buf_reader)?;
53 let xref = XRefTable::parse(&mut buf_reader)?;
55
56 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
58
59 let xref_offset = xref.xref_offset();
60 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
61
62 trailer.validate()?;
64
65 Ok(Self {
66 reader: buf_reader,
67 header,
68 xref,
69 trailer,
70 object_cache: HashMap::new(),
71 object_stream_cache: HashMap::new(),
72 page_tree: None,
73 })
74 }
75
76 pub fn version(&self) -> &super::header::PdfVersion {
78 &self.header.version
79 }
80
81 pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
83 let (obj_num, gen_num) = self.trailer.root()?;
84 let catalog = self.get_object(obj_num, gen_num)?;
85
86 catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
87 position: 0,
88 message: "Catalog is not a dictionary".to_string(),
89 })
90 }
91
92 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
94 match self.trailer.info() {
95 Some((obj_num, gen_num)) => {
96 let info = self.get_object(obj_num, gen_num)?;
97 Ok(info.as_dict())
98 }
99 None => Ok(None),
100 }
101 }
102
103 pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
105 let key = (obj_num, gen_num);
106
107 if self.object_cache.contains_key(&key) {
109 return Ok(&self.object_cache[&key]);
110 }
111
112 if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
114 if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
115 return self.get_compressed_object(
117 obj_num,
118 gen_num,
119 stream_obj_num,
120 index_in_stream,
121 );
122 }
123 }
124
125 let entry = self
127 .xref
128 .get_entry(obj_num)
129 .ok_or(ParseError::InvalidReference(obj_num, gen_num))?;
130
131 if !entry.in_use {
132 self.object_cache.insert(key, PdfObject::Null);
134 return Ok(&self.object_cache[&key]);
135 }
136
137 if entry.generation != gen_num {
138 return Err(ParseError::InvalidReference(obj_num, gen_num));
139 }
140
141 self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;
143
144 let mut lexer = super::lexer::Lexer::new(&mut self.reader);
146
147 let token = lexer.next_token()?;
149 let read_obj_num = match token {
150 super::lexer::Token::Integer(n) => n as u32,
151 _ => {
152 return Err(ParseError::SyntaxError {
153 position: entry.offset as usize,
154 message: "Expected object number".to_string(),
155 })
156 }
157 };
158
159 if read_obj_num != obj_num {
160 return Err(ParseError::SyntaxError {
161 position: entry.offset as usize,
162 message: format!(
163 "Object number mismatch: expected {obj_num}, found {read_obj_num}"
164 ),
165 });
166 }
167
168 let token = lexer.next_token()?;
170 let read_gen_num = match token {
171 super::lexer::Token::Integer(n) => n as u16,
172 _ => {
173 return Err(ParseError::SyntaxError {
174 position: entry.offset as usize,
175 message: "Expected generation number".to_string(),
176 })
177 }
178 };
179
180 if read_gen_num != gen_num {
181 return Err(ParseError::SyntaxError {
182 position: entry.offset as usize,
183 message: format!(
184 "Generation number mismatch: expected {gen_num}, found {read_gen_num}"
185 ),
186 });
187 }
188
189 let token = lexer.next_token()?;
191 match token {
192 super::lexer::Token::Obj => {}
193 _ => {
194 return Err(ParseError::SyntaxError {
195 position: entry.offset as usize,
196 message: "Expected 'obj' keyword".to_string(),
197 })
198 }
199 };
200
201 let obj = PdfObject::parse(&mut lexer)?;
203
204 let token = lexer.next_token()?;
206 match token {
207 super::lexer::Token::EndObj => {}
208 _ => {
209 return Err(ParseError::SyntaxError {
210 position: entry.offset as usize,
211 message: "Expected 'endobj' keyword".to_string(),
212 })
213 }
214 };
215
216 self.object_cache.insert(key, obj);
218 Ok(&self.object_cache[&key])
219 }
220
221 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
223 match obj {
224 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
225 _ => Ok(obj),
226 }
227 }
228
229 fn get_compressed_object(
231 &mut self,
232 obj_num: u32,
233 gen_num: u16,
234 stream_obj_num: u32,
235 _index_in_stream: u32,
236 ) -> ParseResult<&PdfObject> {
237 let key = (obj_num, gen_num);
238
239 if !self.object_stream_cache.contains_key(&stream_obj_num) {
241 let stream_obj = self.get_object(stream_obj_num, 0)?;
243
244 if let Some(stream) = stream_obj.as_stream() {
245 let obj_stream = ObjectStream::parse(stream.clone())?;
247 self.object_stream_cache.insert(stream_obj_num, obj_stream);
248 } else {
249 return Err(ParseError::SyntaxError {
250 position: 0,
251 message: format!("Object {stream_obj_num} is not a stream"),
252 });
253 }
254 }
255
256 let obj_stream = &self.object_stream_cache[&stream_obj_num];
258 let obj = obj_stream
259 .get_object(obj_num)
260 .ok_or_else(|| ParseError::SyntaxError {
261 position: 0,
262 message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
263 })?;
264
265 self.object_cache.insert(key, obj.clone());
267 Ok(&self.object_cache[&key])
268 }
269
270 pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
272 let (pages_obj_num, pages_gen_num) = {
274 let catalog = self.catalog()?;
275 let pages_ref = catalog
276 .get("Pages")
277 .ok_or_else(|| ParseError::MissingKey("Pages".to_string()))?;
278
279 match pages_ref {
280 PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
281 _ => {
282 return Err(ParseError::SyntaxError {
283 position: 0,
284 message: "Pages must be a reference".to_string(),
285 })
286 }
287 }
288 };
289
290 let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
292 pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
293 position: 0,
294 message: "Pages is not a dictionary".to_string(),
295 })
296 }
297
298 pub fn page_count(&mut self) -> ParseResult<u32> {
300 let pages = self.pages()?;
301 pages
302 .get("Count")
303 .and_then(|obj| obj.as_integer())
304 .map(|count| count as u32)
305 .ok_or_else(|| ParseError::MissingKey("Count".to_string()))
306 }
307
308 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
310 let mut metadata = DocumentMetadata::default();
311
312 if let Some(info_dict) = self.info()? {
313 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
314 metadata.title = title.as_str().ok().map(|s| s.to_string());
315 }
316 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
317 metadata.author = author.as_str().ok().map(|s| s.to_string());
318 }
319 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
320 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
321 }
322 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
323 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
324 }
325 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
326 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
327 }
328 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
329 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
330 }
331 }
332
333 metadata.version = self.version().to_string();
334 metadata.page_count = self.page_count().ok();
335
336 Ok(metadata)
337 }
338
339 fn ensure_page_tree(&mut self) -> ParseResult<()> {
341 if self.page_tree.is_none() {
342 let page_count = self.page_count()?;
343 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
344 }
345 Ok(())
346 }
347
348 pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
354 self.ensure_page_tree()?;
355
356 Err(ParseError::SyntaxError {
360 position: 0,
361 message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
362 })
363 }
364
365 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
367 let page_count = self.page_count()?;
368 let mut pages = Vec::with_capacity(page_count as usize);
369
370 for i in 0..page_count {
371 let page = self.get_page(i)?.clone();
372 pages.push(page);
373 }
374
375 Ok(pages)
376 }
377
378 pub fn into_document(self) -> super::document::PdfDocument<R> {
380 super::document::PdfDocument::new(self)
381 }
382}
383
/// Summary information about a PDF document.
///
/// Every optional field is `None` when the corresponding information is not
/// available; the default value has all of them unset and an empty `version`.
#[derive(Clone, Debug, Default)]
pub struct DocumentMetadata {
    /// Document title.
    pub title: Option<String>,
    /// Name of the document's author.
    pub author: Option<String>,
    /// Subject of the document.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// Creator recorded for the document.
    pub creator: Option<String>,
    /// Producer recorded for the document.
    pub producer: Option<String>,
    /// Creation date as a string, if known.
    pub creation_date: Option<String>,
    /// Last modification date as a string, if known.
    pub modification_date: Option<String>,
    /// PDF version rendered as text (for example `"1.4"`).
    pub version: String,
    /// Number of pages, if it could be determined.
    pub page_count: Option<u32>,
}
398
#[cfg(test)]
mod tests {

    use super::*;
    use crate::parser::objects::{PdfName, PdfString};
    use crate::parser::test_helpers::*;
    use std::io::Cursor;

    /// Tries to build a reader over an in-memory buffer.
    fn try_reader<T: AsRef<[u8]>>(bytes: T) -> ParseResult<PdfReader<Cursor<T>>> {
        PdfReader::new(Cursor::new(bytes))
    }

    /// Builds a reader over an in-memory buffer, panicking on parse failure.
    fn reader_over<T: AsRef<[u8]>>(bytes: T) -> PdfReader<Cursor<T>> {
        try_reader(bytes).unwrap()
    }

    /// Builds a name object holding `value`.
    fn name(value: &str) -> PdfObject {
        PdfObject::Name(PdfName(value.to_string()))
    }

    /// Builds a string object holding the bytes of `value`.
    fn string(value: &str) -> PdfObject {
        PdfObject::String(PdfString(value.to_string().into_bytes()))
    }

    // --- construction and version -------------------------------------------

    #[test]
    fn test_reader_construction() {
        assert!(try_reader(create_minimal_pdf()).is_ok());
    }

    #[test]
    fn test_reader_version() {
        let reader = reader_over(create_minimal_pdf());
        let version = reader.version();
        assert_eq!(version.major, 1);
        assert_eq!(version.minor, 4);
    }

    #[test]
    fn test_reader_different_versions() {
        let versions = vec![
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
        ];

        for version in versions {
            let reader = reader_over(create_pdf_with_version(version));

            let mut parts = version.split('.');
            let major: u8 = parts.next().unwrap().parse().unwrap();
            let minor: u8 = parts.next().unwrap().parse().unwrap();
            assert_eq!(reader.version().major, major);
            assert_eq!(reader.version().minor, minor);
        }
    }

    // --- catalog and info ---------------------------------------------------

    #[test]
    fn test_reader_catalog() {
        let mut reader = reader_over(create_minimal_pdf());

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let expected_type = name("Catalog");
        assert_eq!(catalog.unwrap().get("Type"), Some(&expected_type));
    }

    #[test]
    fn test_reader_info_none() {
        let mut reader = reader_over(create_minimal_pdf());
        assert!(reader.info().unwrap().is_none());
    }

    #[test]
    fn test_reader_info_present() {
        let mut reader = reader_over(create_pdf_with_info());

        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        let expected = [("Title", "Test PDF"), ("Author", "Test Author")];
        for (key, value) in expected.iter() {
            assert_eq!(info_dict.get(key), Some(&string(value)));
        }
    }

    // --- object lookup ------------------------------------------------------

    #[test]
    fn test_reader_get_object() {
        let mut reader = reader_over(create_minimal_pdf());

        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());
        assert!(obj.unwrap().as_dict().is_some());
    }

    #[test]
    fn test_reader_get_invalid_object() {
        let mut reader = reader_over(create_minimal_pdf());
        assert!(reader.get_object(999, 0).is_err());
    }

    #[test]
    fn test_reader_get_free_object() {
        let mut reader = reader_over(create_minimal_pdf());

        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }

    #[test]
    fn test_reader_resolve_reference() {
        let mut reader = reader_over(create_minimal_pdf());

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    #[test]
    fn test_reader_resolve_non_reference() {
        let mut reader = reader_over(create_minimal_pdf());

        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    #[test]
    fn test_reader_cache_behavior() {
        let mut reader = reader_over(create_minimal_pdf());

        // First call loads the object, second call is served from the cache.
        for _ in 0..2 {
            let obj = reader.get_object(1, 0).unwrap();
            assert!(obj.as_dict().is_some());
        }
    }

    #[test]
    fn test_reader_wrong_generation() {
        let mut reader = reader_over(create_minimal_pdf());
        assert!(reader.get_object(1, 99).is_err());
    }

    // --- malformed input ----------------------------------------------------

    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        assert!(try_reader(invalid_data.to_vec()).is_err());
    }

    #[test]
    fn test_reader_corrupt_xref() {
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
            .to_vec();

        assert!(try_reader(corrupt_pdf).is_err());
    }

    #[test]
    fn test_reader_missing_trailer() {
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
            .to_vec();

        assert!(try_reader(pdf_no_trailer).is_err());
    }

    #[test]
    fn test_reader_empty_pdf() {
        assert!(try_reader(Vec::new()).is_err());
    }

    // --- pages and document -------------------------------------------------

    #[test]
    fn test_reader_page_count() {
        let mut reader = reader_over(create_minimal_pdf());

        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }

    #[test]
    fn test_reader_into_document() {
        let document = reader_over(create_minimal_pdf()).into_document();
        assert!(document.page_count().is_ok());
    }

    #[test]
    fn test_reader_pages_dict() {
        let mut reader = reader_over(create_minimal_pdf());

        let pages = reader.pages();
        assert!(pages.is_ok());

        let expected_type = name("Pages");
        assert_eq!(pages.unwrap().get("Type"), Some(&expected_type));
    }

    #[test]
    fn test_reader_pdf_with_binary_data() {
        assert!(try_reader(create_pdf_with_binary_marker()).is_ok());
    }

    // --- metadata -----------------------------------------------------------

    #[test]
    fn test_reader_metadata() {
        let mut reader = reader_over(create_pdf_with_info());

        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_metadata_empty() {
        let mut reader = reader_over(create_minimal_pdf());

        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }

    // NOTE(review): despite its name this test only covers a wrong generation
    // and an unknown object number, not a mismatching object header.
    #[test]
    fn test_reader_object_number_mismatch() {
        let mut reader = reader_over(create_minimal_pdf());

        assert!(reader.get_object(1, 99).is_err());
        assert!(reader.get_object(999, 0).is_err());
    }

    #[test]
    fn test_document_metadata_struct() {
        let text = |value: &str| Some(value.to_string());
        let metadata = DocumentMetadata {
            title: text("Title"),
            author: text("Author"),
            subject: text("Subject"),
            keywords: text("Keywords"),
            creator: text("Creator"),
            producer: text("Producer"),
            creation_date: text("D:20240101"),
            modification_date: text("D:20240102"),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    #[test]
    fn test_document_metadata_default() {
        let metadata = DocumentMetadata::default();

        let text_fields = [
            &metadata.title,
            &metadata.author,
            &metadata.subject,
            &metadata.keywords,
            &metadata.creator,
            &metadata.producer,
            &metadata.creation_date,
            &metadata.modification_date,
        ];
        for field in text_fields.iter() {
            assert!(field.is_none());
        }
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }

    #[test]
    fn test_document_metadata_clone() {
        let metadata = DocumentMetadata {
            title: Some("Test".to_string()),
            version: "1.4".to_string(),
            ..Default::default()
        };

        let cloned = metadata.clone();
        assert_eq!(cloned.title, Some("Test".to_string()));
        assert_eq!(cloned.version, "1.4".to_string());
    }

    #[test]
    fn test_reader_trailer_validation_error() {
        // The trailer below has no /Root entry.
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
            .to_vec();

        assert!(try_reader(bad_pdf).is_err());
    }
}