1use super::encryption_handler::EncryptionHandler;
6use super::header::PdfHeader;
7use super::object_stream::ObjectStream;
8use super::objects::{PdfArray, PdfDictionary, PdfObject, PdfString};
9use super::stack_safe::StackSafeContext;
10use super::trailer::PdfTrailer;
11use super::xref::XRefTable;
12use super::{ParseError, ParseResult};
13use crate::objects::ObjectId;
14use std::collections::HashMap;
15use std::fs::File;
16use std::io::{BufReader, Read, Seek, SeekFrom};
17use std::path::Path;
18
/// Returns the byte offset of the first occurrence of `needle` in
/// `haystack`, or `None` when it does not occur.
///
/// An empty needle matches at offset 0, mirroring `str::find("")`.
/// (Previously an empty needle panicked, because `slice::windows`
/// requires a non-zero window size.)
fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
25
/// Reports whether `data`, after skipping any leading PDF whitespace
/// (space, tab, LF, CR), begins with the keyword `stream`.
fn is_immediate_stream_start(data: &[u8]) -> bool {
    // Index of the first non-whitespace byte; `data.len()` when the
    // slice is empty or all whitespace (yielding an empty tail below).
    let first_non_ws = data
        .iter()
        .position(|b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
        .unwrap_or(data.len());

    data[first_non_ws..].starts_with(b"stream")
}
38
/// Stateful reader over a seekable PDF byte source.
///
/// Owns the parsed header, cross-reference table and trailer, plus caches
/// of decoded objects and object streams. When the document is encrypted,
/// `encryption_handler` decrypts strings and streams on demand.
pub struct PdfReader<R: Read + Seek> {
    reader: BufReader<R>,
    header: PdfHeader,
    xref: XRefTable,
    trailer: PdfTrailer,
    // Decoded objects keyed by (object number, generation).
    object_cache: HashMap<(u32, u16), PdfObject>,
    // Parsed /ObjStm object streams keyed by the stream's object number.
    object_stream_cache: HashMap<u32, ObjectStream>,
    page_tree: Option<super::page_tree::PageTree>,
    // Depth guard against unbounded recursion while parsing nested objects.
    parse_context: StackSafeContext,
    options: super::ParseOptions,
    encryption_handler: Option<EncryptionHandler>,
    // Object numbers currently being loaded/reconstructed; used to break
    // circular references. NOTE(review): a Mutex behind `&mut self` APIs
    // looks redundant — presumably kept for interior mutability through
    // shared borrows elsewhere; confirm before simplifying.
    objects_being_reconstructed: std::sync::Mutex<std::collections::HashSet<u32>>,
    // Upper bound on simultaneously in-flight object loads (cycle/abuse cap).
    max_reconstruction_depth: u32,
}
62
impl<R: Read + Seek> PdfReader<R> {
    /// Returns the parse options this reader was constructed with.
    pub fn options(&self) -> &super::ParseOptions {
        &self.options
    }

    /// True when the trailer declared encryption (a handler was created).
    pub fn is_encrypted(&self) -> bool {
        self.encryption_handler.is_some()
    }

    /// True when the document is unencrypted, or encrypted and already
    /// unlocked with a valid password.
    pub fn is_unlocked(&self) -> bool {
        match &self.encryption_handler {
            Some(handler) => handler.is_unlocked(),
            None => true, }
    }

    /// Mutable access to the encryption handler, if any.
    pub fn encryption_handler_mut(&mut self) -> Option<&mut EncryptionHandler> {
        self.encryption_handler.as_mut()
    }

    /// Shared access to the encryption handler, if any.
    pub fn encryption_handler(&self) -> Option<&EncryptionHandler> {
        self.encryption_handler.as_ref()
    }

    /// Attempts to unlock with `password`, trying it as the user password
    /// first and as the owner password second. Returns `Ok(true)` on
    /// success; an unencrypted document is trivially `Ok(true)`.
    /// Handler errors are folded into a failed attempt (`unwrap_or(false)`),
    /// not propagated.
    pub fn unlock_with_password(&mut self, password: &str) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => {
                if handler.unlock_with_user_password(password).unwrap_or(false) {
                    Ok(true)
                } else {
                    Ok(handler
                        .unlock_with_owner_password(password)
                        .unwrap_or(false))
                }
            }
            None => Ok(true), }
    }

    /// Tries the empty user password (many encrypted PDFs use one).
    /// Returns `Ok(true)` for unencrypted documents.
    pub fn try_empty_password(&mut self) -> ParseResult<bool> {
        match &mut self.encryption_handler {
            Some(handler) => Ok(handler.try_empty_password().unwrap_or(false)),
            None => Ok(true), }
    }

    /// Unlocks the document with `password`, or returns
    /// `ParseError::WrongPassword`. No-op when the document is not
    /// encrypted or is already unlocked.
    pub fn unlock(&mut self, password: &str) -> ParseResult<()> {
        if !self.is_encrypted() {
            return Ok(());
        }

        if self.is_unlocked() {
            return Ok(());
        }

        let success = self.unlock_with_password(password)?;

        if success {
            Ok(())
        } else {
            Err(ParseError::WrongPassword)
        }
    }

    /// Fails with `ParseError::PdfLocked` when the document is encrypted
    /// and no valid password has been supplied yet.
    fn ensure_unlocked(&self) -> ParseResult<()> {
        if self.is_encrypted() && !self.is_unlocked() {
            return Err(ParseError::PdfLocked);
        }
        Ok(())
    }

    /// Recursively decrypts `obj` (strings, stream data, and both inside
    /// dictionaries/arrays) using the object's own id for key derivation.
    /// Returns the object unchanged when the file is not encrypted or the
    /// handler is still locked.
    fn decrypt_object_if_needed(
        &self,
        obj: PdfObject,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<PdfObject> {
        let handler = match &self.encryption_handler {
            Some(h) if h.is_unlocked() => h,
            _ => return Ok(obj), };

        let obj_id = ObjectId::new(obj_num, gen_num);

        match obj {
            PdfObject::String(ref s) => {
                let decrypted_bytes = handler.decrypt_string(s.as_bytes(), &obj_id)?;
                Ok(PdfObject::String(PdfString::new(decrypted_bytes)))
            }
            PdfObject::Stream(ref stream) => {
                // Skip decryption only when the stream explicitly names the
                // Identity filter. NOTE(review): per the PDF spec /StmF
                // normally lives in the /Encrypt dictionary, not on each
                // stream — confirm this per-stream check is intentional.
                let should_decrypt = stream
                    .dict
                    .get("StmF")
                    .and_then(|o| o.as_name())
                    .map(|n| n.0.as_str() != "Identity")
                    .unwrap_or(true); if should_decrypt {
                    let decrypted_data = handler.decrypt_stream(&stream.data, &obj_id)?;

                    let mut new_stream = stream.clone();
                    new_stream.data = decrypted_data;
                    Ok(PdfObject::Stream(new_stream))
                } else {
                    Ok(obj) }
            }
            PdfObject::Dictionary(ref dict) => {
                // Rebuild the dictionary with each value decrypted under the
                // same containing-object id.
                let mut new_dict = PdfDictionary::new();
                for (key, value) in dict.0.iter() {
                    let decrypted_value =
                        self.decrypt_object_if_needed(value.clone(), obj_num, gen_num)?;
                    new_dict.insert(key.0.clone(), decrypted_value);
                }
                Ok(PdfObject::Dictionary(new_dict))
            }
            PdfObject::Array(ref arr) => {
                let mut new_arr = Vec::new();
                for elem in arr.0.iter() {
                    let decrypted_elem =
                        self.decrypt_object_if_needed(elem.clone(), obj_num, gen_num)?;
                    new_arr.push(decrypted_elem);
                }
                Ok(PdfObject::Array(PdfArray(new_arr)))
            }
            // Numbers, booleans, names, null, references: nothing to decrypt.
            _ => Ok(obj),
        }
    }
}
247
248impl PdfReader<File> {
249 pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
251 use std::io::Write;
252 let mut debug_file = std::fs::File::create("/tmp/pdf_open_debug.log").ok();
253 if let Some(ref mut f) = debug_file {
254 writeln!(f, "Opening file: {:?}", path.as_ref()).ok();
255 }
256 let file = File::open(path)?;
257 if let Some(ref mut f) = debug_file {
258 writeln!(f, "File opened successfully").ok();
259 }
260 let options = super::ParseOptions::lenient();
262 Self::new_with_options(file, options)
263 }
264
265 pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
267 let file = File::open(path)?;
268 let options = super::ParseOptions::strict();
269 Self::new_with_options(file, options)
270 }
271
272 pub fn open_with_options<P: AsRef<Path>>(
274 path: P,
275 options: super::ParseOptions,
276 ) -> ParseResult<Self> {
277 let file = File::open(path)?;
278 Self::new_with_options(file, options)
279 }
280
281 pub fn open_document<P: AsRef<Path>>(
283 path: P,
284 ) -> ParseResult<super::document::PdfDocument<File>> {
285 let reader = Self::open(path)?;
286 Ok(reader.into_document())
287 }
288}
289
290impl<R: Read + Seek> PdfReader<R> {
291 pub fn new(reader: R) -> ParseResult<Self> {
298 let mut options = super::ParseOptions::default();
301 options.lenient_streams = true;
302 Self::new_with_options(reader, options)
303 }
304
305 pub fn new_with_options(reader: R, options: super::ParseOptions) -> ParseResult<Self> {
307 let mut buf_reader = BufReader::new(reader);
308
309 let start_pos = buf_reader.stream_position()?;
311 buf_reader.seek(SeekFrom::End(0))?;
312 let file_size = buf_reader.stream_position()?;
313 buf_reader.seek(SeekFrom::Start(start_pos))?;
314
315 if file_size == 0 {
316 return Err(ParseError::EmptyFile);
317 }
318
319 use std::io::Write;
321 let mut debug_file = std::fs::File::create("/tmp/pdf_debug.log").ok();
322 if let Some(ref mut f) = debug_file {
323 writeln!(f, "Parsing PDF header...").ok();
324 }
325 let header = PdfHeader::parse(&mut buf_reader)?;
326 if let Some(ref mut f) = debug_file {
327 writeln!(f, "Header parsed: version {}", header.version).ok();
328 }
329
330 if let Some(ref mut f) = debug_file {
332 writeln!(f, "Parsing XRef table...").ok();
333 }
334 let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
335 if let Some(ref mut f) = debug_file {
336 writeln!(f, "XRef table parsed with {} entries", xref.len()).ok();
337 }
338
339 let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
341
342 let xref_offset = xref.xref_offset();
343 let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
344
345 trailer.validate()?;
347
348 let encryption_handler = if EncryptionHandler::detect_encryption(trailer.dict()) {
350 if let Ok(Some((encrypt_obj_num, encrypt_gen_num))) = trailer.encrypt() {
351 let mut temp_reader = Self {
353 reader: buf_reader,
354 header: header.clone(),
355 xref: xref.clone(),
356 trailer: trailer.clone(),
357 object_cache: HashMap::new(),
358 object_stream_cache: HashMap::new(),
359 page_tree: None,
360 parse_context: StackSafeContext::new(),
361 options: options.clone(),
362 encryption_handler: None,
363 objects_being_reconstructed: std::sync::Mutex::new(
364 std::collections::HashSet::new(),
365 ),
366 max_reconstruction_depth: 100,
367 };
368
369 let encrypt_obj = temp_reader.get_object(encrypt_obj_num, encrypt_gen_num)?;
371 if let Some(encrypt_dict) = encrypt_obj.as_dict() {
372 let file_id = trailer.id().and_then(|id_obj| {
374 if let PdfObject::Array(ref id_array) = id_obj {
375 if let Some(PdfObject::String(ref id_bytes)) = id_array.get(0) {
376 Some(id_bytes.as_bytes().to_vec())
377 } else {
378 None
379 }
380 } else {
381 None
382 }
383 });
384
385 match EncryptionHandler::new(encrypt_dict, file_id) {
386 Ok(handler) => {
387 buf_reader = temp_reader.reader;
389 Some(handler)
390 }
391 Err(_) => {
392 let _ = temp_reader.reader;
394 return Err(ParseError::EncryptionNotSupported);
395 }
396 }
397 } else {
398 let _ = temp_reader.reader;
399 return Err(ParseError::EncryptionNotSupported);
400 }
401 } else {
402 return Err(ParseError::EncryptionNotSupported);
403 }
404 } else {
405 None
406 };
407
408 Ok(Self {
409 reader: buf_reader,
410 header,
411 xref,
412 trailer,
413 object_cache: HashMap::new(),
414 object_stream_cache: HashMap::new(),
415 page_tree: None,
416 parse_context: StackSafeContext::new(),
417 options,
418 encryption_handler,
419 objects_being_reconstructed: std::sync::Mutex::new(std::collections::HashSet::new()),
420 max_reconstruction_depth: 100,
421 })
422 }
423
    /// Returns the PDF version declared in the file header.
    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }
428
    /// Returns the document catalog (the trailer's /Root dictionary).
    ///
    /// Applies several layers of recovery for damaged files: scanning for a
    /// real /Type/Catalog object when /Root points elsewhere, trailer
    /// fallbacks when /Root is missing, and manual byte-level
    /// reconstruction of the catalog object as a last resort.
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = match self.trailer.root() {
            Ok(root) => {
                // Sanity-check that /Root actually points at a Catalog;
                // any failure along the way falls back to `root` as-is.
                if let Ok(obj) = self.get_object(root.0, root.1) {
                    if let Some(dict) = obj.as_dict() {
                        if let Some(type_obj) = dict.get("Type") {
                            if let Some(type_name) = type_obj.as_name() {
                                if type_name.0 != "Catalog" {
                                    tracing::warn!("Trailer /Root points to /Type/{} (not Catalog), scanning for real catalog", type_name.0);
                                    if let Ok(catalog_ref) = self.find_catalog_object() {
                                        catalog_ref
                                    } else {
                                        root }
                                } else {
                                    root }
                            } else {
                                root }
                        } else {
                            root }
                    } else {
                        root }
                } else {
                    root }
            }
            Err(_) => {
                #[cfg(debug_assertions)]
                tracing::warn!("Trailer missing Root entry, attempting recovery");

                if let Some(root) = self.trailer.find_root_fallback() {
                    root
                } else {
                    if let Ok(catalog_ref) = self.find_catalog_object() {
                        catalog_ref
                    } else {
                        return Err(ParseError::MissingKey("Root".to_string()));
                    }
                }
            }
        };

        let key = (obj_num, gen_num);
        // Probe once to decide whether the object parses as a dictionary;
        // the borrow ends before the re-fetch below.
        let needs_reconstruction = {
            match self.get_object(obj_num, gen_num) {
                Ok(catalog) => {
                    if catalog.as_dict().is_some() {
                        false
                    } else {
                        true
                    }
                }
                Err(_) => {
                    true
                }
            }
        };

        if !needs_reconstruction {
            let catalog = self.get_object(obj_num, gen_num)?;
            return catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Catalog object {} {} is not a dictionary", obj_num, gen_num),
            });
        }

        // Last resort: scrape the object's raw bytes and rebuild a
        // dictionary, then cache it and register a synthetic xref entry so
        // later lookups succeed.
        match self.extract_object_manually(obj_num) {
            Ok(dict) => {
                let obj = PdfObject::Dictionary(dict);
                self.object_cache.insert(key, obj);

                use crate::parser::xref::XRefEntry;
                // NOTE(review): offset 0 is a placeholder, not a real file
                // offset — confirm nothing downstream seeks to it.
                let xref_entry = XRefEntry {
                    offset: 0, generation: gen_num,
                    in_use: true,
                };
                self.xref.add_entry(obj_num, xref_entry);

                if let Some(PdfObject::Dictionary(ref dict)) = self.object_cache.get(&key) {
                    return Ok(dict);
                }
            }
            Err(_e) => {}
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!(
                "Catalog object {} could not be parsed or reconstructed as a dictionary",
                obj_num
            ),
        })
    }
548
549 pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
551 match self.trailer.info() {
552 Some((obj_num, gen_num)) => {
553 let info = self.get_object(obj_num, gen_num)?;
554 Ok(info.as_dict())
555 }
556 None => Ok(None),
557 }
558 }
559
    /// Fetches (and caches) the object `obj_num gen_num R`.
    ///
    /// Circular references are broken by returning `PdfObject::Null`, and a
    /// depth cap (`max_reconstruction_depth`) bounds the number of objects
    /// that may be in-flight at once.
    ///
    /// # Errors
    /// Fails when the document is still locked, the mutex is poisoned, the
    /// depth cap is exceeded, or the object cannot be loaded from disk.
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        self.ensure_unlocked()?;

        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Cycle check: an object already being loaded further up the call
        // chain is cached as Null to break the recursion.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during circular reference check".to_string(),
                    })?;
            if being_loaded.contains(&obj_num) {
                drop(being_loaded);
                if self.options.collect_warnings {}
                self.object_cache.insert(key, PdfObject::Null);
                return Ok(&self.object_cache[&key]);
            }
        }

        // Depth check: the size of the in-flight set doubles as the
        // current load depth.
        {
            let being_loaded =
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned during depth limit check".to_string(),
                    })?;
            let depth = being_loaded.len() as u32;
            if depth >= self.max_reconstruction_depth {
                drop(being_loaded);
                if self.options.collect_warnings {}
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!(
                        "Maximum object loading depth ({}) exceeded",
                        self.max_reconstruction_depth
                    ),
                });
            }
        }

        // Mark in-flight before touching the disk.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being loaded".to_string(),
            })?
            .insert(obj_num);

        // Always unmark afterwards, on both success and failure paths.
        match self.load_object_from_disk(obj_num, gen_num) {
            Ok(_) => {
                self.objects_being_reconstructed
                    .lock()
                    .map_err(|_| ParseError::SyntaxError {
                        position: 0,
                        message: "Mutex poisoned while unmarking object after successful load"
                            .to_string(),
                    })?
                    .remove(&obj_num);
                Ok(&self.object_cache[&key])
            }
            Err(e) => {
                // Best-effort unmark; a poisoned mutex here is ignored so
                // the original load error is reported.
                if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                    guard.remove(&obj_num);
                }
                Err(e)
            }
        }
    }
646
    /// Loads an object from the file: via its object stream for compressed
    /// entries, otherwise by seeking to its xref offset and lexing
    /// `N G obj … endobj`. Decrypts and caches the result.
    ///
    /// Lenient options downgrade header mismatches to warnings; parse
    /// failures on known-reconstructible objects trigger a manual
    /// byte-level reconstruction attempt before erroring out.
    fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        // Compressed objects live inside an /ObjStm stream.
        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(
                    obj_num,
                    gen_num,
                    stream_obj_num,
                    index_in_stream,
                );
            }
        } else {
        }

        // Resolve the file offset from the classic xref table.
        let (current_offset, _generation) = {
            let entry = self.xref.get_entry(obj_num);

            match entry {
                Some(entry) => {
                    // Free (deleted) entries resolve to Null.
                    if !entry.in_use {
                        self.object_cache.insert(key, PdfObject::Null);
                        return Ok(&self.object_cache[&key]);
                    }

                    if entry.generation != gen_num {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!("Object {} generation mismatch - expected {}, found {}, using available",
                                    obj_num, gen_num, entry.generation);
                            }
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }

                    (entry.offset, entry.generation)
                }
                None => {
                    // Not in the xref at all: try reconstruction for known
                    // object numbers, else Null (lenient) or error (strict).
                    if self.is_reconstructible_object(obj_num) {
                        return self.attempt_manual_object_reconstruction(obj_num, gen_num, 0);
                    } else {
                        if self.options.lenient_syntax {
                            if self.options.collect_warnings {
                                tracing::warn!(
                                    "Object {} {} R not found in XRef, returning null object",
                                    obj_num,
                                    gen_num
                                );
                            }
                            self.object_cache.insert(key, PdfObject::Null);
                            return Ok(&self.object_cache[&key]);
                        } else {
                            return Err(ParseError::InvalidReference(obj_num, gen_num));
                        }
                    }
                }
            }
        };

        self.reader.seek(std::io::SeekFrom::Start(current_offset))?;

        let mut lexer =
            super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());

        // Consume the "N G obj" header; lenient mode substitutes expected
        // values when the tokens don't match.
        {
            let token = lexer.next_token()?;
            let read_obj_num = match token {
                super::lexer::Token::Integer(n) => n as u32,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::debug!(
                                "Warning: Using expected object number {obj_num} instead of parsed token: {:?}",
                                token
                            );
                        }
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected object number".to_string(),
                        });
                    }
                }
            };

            if read_obj_num != obj_num && !self.options.lenient_syntax {
                return Err(ParseError::SyntaxError {
                    position: current_offset as usize,
                    message: format!(
                        "Object number mismatch: expected {obj_num}, found {read_obj_num}"
                    ),
                });
            }

            let token = lexer.next_token()?;
            let _read_gen_num = match token {
                super::lexer::Token::Integer(n) => n as u16,
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!(
                                "Using generation 0 instead of parsed token for object {obj_num}"
                            );
                        }
                        0
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected generation number".to_string(),
                        });
                    }
                }
            };

            let token = lexer.next_token()?;
            match token {
                super::lexer::Token::Obj => {}
                _ => {
                    if self.options.lenient_syntax {
                        if self.options.collect_warnings {
                            tracing::warn!("Expected 'obj' keyword for object {obj_num} {gen_num}, continuing anyway");
                        }
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: current_offset as usize,
                            message: "Expected 'obj' keyword".to_string(),
                        });
                    }
                }
            }
        }

        // Guard against runaway recursion while parsing the object body.
        self.parse_context.enter()?;

        let obj = match PdfObject::parse_with_options(&mut lexer, &self.options) {
            Ok(obj) => {
                self.parse_context.exit();
                if obj_num == 102 && self.options.collect_warnings {}
                obj
            }
            Err(e) => {
                self.parse_context.exit();

                // Fall back to manual reconstruction for recoverable
                // syntax/token failures on known object numbers.
                if self.is_reconstructible_object(obj_num)
                    && self.can_attempt_manual_reconstruction(&e)
                {
                    match self.attempt_manual_object_reconstruction(
                        obj_num,
                        gen_num,
                        current_offset,
                    ) {
                        Ok(reconstructed_obj) => {
                            return Ok(reconstructed_obj);
                        }
                        Err(_reconstruction_error) => {}
                    }
                }

                return Err(e);
            }
        };

        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {}
            _ => {
                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Expected 'endobj' keyword after object {obj_num} {gen_num}, continuing anyway");
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: current_offset as usize,
                        message: "Expected 'endobj' keyword".to_string(),
                    });
                }
            }
        };

        let decrypted_obj = self.decrypt_object_if_needed(obj, obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);

        Ok(&self.object_cache[&key])
    }
866
867 pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
869 match obj {
870 PdfObject::Reference(obj_num, gen_num) => self.get_object(*obj_num, *gen_num),
871 _ => Ok(obj),
872 }
873 }
874
875 pub fn resolve_stream_length(&mut self, obj: &PdfObject) -> ParseResult<Option<usize>> {
878 match obj {
879 PdfObject::Integer(len) => {
880 if *len >= 0 {
881 Ok(Some(*len as usize))
882 } else {
883 Ok(None)
885 }
886 }
887 PdfObject::Reference(obj_num, gen_num) => {
888 let resolved = self.get_object(*obj_num, *gen_num)?;
889 match resolved {
890 PdfObject::Integer(len) => {
891 if *len >= 0 {
892 Ok(Some(*len as usize))
893 } else {
894 Ok(None)
895 }
896 }
897 _ => {
898 Ok(None)
900 }
901 }
902 }
903 _ => {
904 Ok(None)
906 }
907 }
908 }
909
    /// Loads an object stored inside an object stream (/ObjStm).
    ///
    /// Parses and caches the containing stream on first use, then looks the
    /// target object up by number, decrypts it, and caches it.
    /// NOTE(review): `_index_in_stream` from the xref entry is unused —
    /// lookup is by object number only; confirm that is sufficient.
    fn get_compressed_object(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        stream_obj_num: u32,
        _index_in_stream: u32,
    ) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        // Parse the container stream once and memoize it.
        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone(), &self.options)?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {stream_obj_num} is not a stream"),
                });
            }
        }

        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream
            .get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {obj_num} not found in object stream {stream_obj_num}"),
            })?;

        let decrypted_obj = self.decrypt_object_if_needed(obj.clone(), obj_num, gen_num)?;

        self.object_cache.insert(key, decrypted_obj);
        Ok(&self.object_cache[&key])
    }
953
    /// Returns the root /Pages dictionary of the page tree.
    ///
    /// Recovery cascade for damaged files: a missing catalog /Pages entry
    /// triggers a scan for page objects (building a synthetic tree) or, in
    /// lenient mode, a brute-force search for any /Type/Pages object; a
    /// double-indirect reference is followed one extra hop; an invalid
    /// target falls back to the same brute-force search.
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;

            if let Some(pages_ref) = catalog.get("Pages") {
                match pages_ref {
                    PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages must be a reference".to_string(),
                        })
                    }
                }
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Catalog missing Pages entry, attempting recovery");

                // Prefer reconstructing a synthetic tree from found pages.
                if let Ok(page_refs) = self.find_page_objects() {
                    if !page_refs.is_empty() {
                        return self.create_synthetic_pages_dict(&page_refs);
                    }
                }

                if self.options.lenient_syntax {
                    if self.options.collect_warnings {
                        tracing::warn!("Missing Pages in catalog, searching for page tree");
                    }
                    // Brute-force scan of all object numbers for /Type/Pages.
                    let mut found_pages = None;
                    for i in 1..self.xref.len() as u32 {
                        if let Ok(obj) = self.get_object(i, 0) {
                            if let Some(dict) = obj.as_dict() {
                                if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
                                    if obj_type.0 == "Pages" {
                                        found_pages = Some((i, 0));
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if let Some((obj_num, gen_num)) = found_pages {
                        (obj_num, gen_num)
                    } else {
                        return Err(ParseError::MissingKey("Pages".to_string()));
                    }
                } else {
                    return Err(ParseError::MissingKey("Pages".to_string()));
                }
            }
        };

        // Some files chain references: /Pages -> ref -> ref -> dict.
        // Follow exactly one extra hop if the target is itself a reference.
        let needs_double_resolve = {
            let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
            pages_obj.as_reference()
        };

        let (final_obj_num, final_gen_num) =
            if let Some((ref_obj_num, ref_gen_num)) = needs_double_resolve {
                (ref_obj_num, ref_gen_num)
            } else {
                (pages_obj_num, pages_gen_num)
            };

        let actual_pages_num = {
            let is_valid_dict = {
                let pages_obj = self.get_object(final_obj_num, final_gen_num)?;
                pages_obj.as_dict().is_some()
            };

            if is_valid_dict {
                final_obj_num
            } else {
                #[cfg(debug_assertions)]
                tracing::warn!("Pages reference invalid, searching for valid Pages object");

                if self.options.lenient_syntax {
                    let xref_len = self.xref.len() as u32;
                    let mut found_pages_num = None;

                    for i in 1..xref_len {
                        let is_pages = {
                            if let Ok(obj) = self.get_object(i, 0) {
                                if let Some(dict) = obj.as_dict() {
                                    if let Some(obj_type) =
                                        dict.get("Type").and_then(|t| t.as_name())
                                    {
                                        obj_type.0 == "Pages"
                                    } else {
                                        false
                                    }
                                } else {
                                    false
                                }
                            } else {
                                false
                            }
                        };

                        if is_pages {
                            found_pages_num = Some(i);
                            break;
                        }
                    }

                    if let Some(obj_num) = found_pages_num {
                        #[cfg(debug_assertions)]
                        tracing::debug!("Found valid Pages object at {} 0 R", obj_num);
                        obj_num
                    } else {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Pages is not a dictionary and no valid Pages object found"
                                .to_string(),
                        });
                    }
                } else {
                    return Err(ParseError::SyntaxError {
                        position: 0,
                        message: "Pages is not a dictionary".to_string(),
                    });
                }
            }
        };

        // Final fetch — generation 0 is assumed for recovered objects.
        let pages_obj = self.get_object(actual_pages_num, 0)?;
        pages_obj.as_dict().ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: "Pages object is not a dictionary".to_string(),
        })
    }
1105
1106 pub fn page_count(&mut self) -> ParseResult<u32> {
1108 match self.pages() {
1110 Ok(pages) => {
1111 if let Some(count_obj) = pages.get("Count") {
1113 if let Some(count) = count_obj.as_integer() {
1114 return Ok(count as u32);
1115 }
1116 }
1117
1118 if let Some(kids_obj) = pages.get("Kids") {
1120 if let Some(kids_array) = kids_obj.as_array() {
1121 return Ok(kids_array.0.len() as u32);
1124 }
1125 }
1126
1127 Ok(0)
1128 }
1129 Err(_) => {
1130 tracing::debug!("Standard page extraction failed, trying direct extraction");
1132 self.page_count_fallback()
1133 }
1134 }
1135 }
1136
1137 fn page_count_fallback(&mut self) -> ParseResult<u32> {
1139 if let Some(count) = self.extract_page_count_from_linearization() {
1141 tracing::debug!("Found page count {} from linearization", count);
1142 return Ok(count);
1143 }
1144
1145 if let Some(count) = self.count_page_objects_directly() {
1147 tracing::debug!("Found {} pages by counting page objects", count);
1148 return Ok(count);
1149 }
1150
1151 Ok(0)
1152 }
1153
    /// Reads the page count from a linearization dictionary's /N entry.
    ///
    /// NOTE(review): this assumes the linearization dictionary is object
    /// 100 — a file-specific assumption, not something the PDF spec
    /// guarantees (it only promises the dictionary appears first in the
    /// file). Confirm which corpus this targets.
    fn extract_page_count_from_linearization(&mut self) -> Option<u32> {
        match self.get_object(100, 0) {
            Ok(obj) => {
                tracing::debug!("Found object 100: {:?}", obj);
                if let Some(dict) = obj.as_dict() {
                    tracing::debug!("Object 100 is a dictionary with {} keys", dict.0.len());
                    if let Some(n_obj) = dict.get("N") {
                        tracing::debug!("Found /N field: {:?}", n_obj);
                        if let Some(count) = n_obj.as_integer() {
                            tracing::debug!("Extracted page count from linearization: {}", count);
                            return Some(count as u32);
                        }
                    } else {
                        tracing::debug!("No /N field found in object 100");
                        for (key, value) in &dict.0 {
                            tracing::debug!(" {:?}: {:?}", key, value);
                        }
                    }
                } else {
                    tracing::debug!("Object 100 is not a dictionary: {:?}", obj);
                }
            }
            Err(e) => {
                // Structured parse failed — fall back to scraping raw bytes.
                tracing::debug!("Failed to get object 100: {:?}", e);
                tracing::debug!("Attempting direct content extraction...");
                return self.extract_n_value_from_raw_object_100();
            }
        }

        None
    }
1189
    /// Scrapes the first digits after a literal "/N " in the raw bytes of
    /// object 100 (up to 1 KiB from its xref offset).
    ///
    /// Last-ditch fallback when object 100 cannot be parsed structurally.
    /// NOTE(review): only matches "/N " with a trailing space — "/N5" or
    /// "/N\n5" would be missed; confirm against the targeted files.
    fn extract_n_value_from_raw_object_100(&mut self) -> Option<u32> {
        if let Some(entry) = self.xref.get_entry(100) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return None;
            }

            let mut buffer = vec![0u8; 1024];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                if bytes_read == 0 {
                    return None;
                }

                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw content around object 100:\n{}", content);

                if let Some(n_pos) = content.find("/N ") {
                    let after_n = &content[n_pos + 3..];
                    tracing::debug!(
                        "Content after /N: {}",
                        &after_n[..std::cmp::min(50, after_n.len())]
                    );

                    // Collect the first contiguous digit run after "/N ".
                    let mut num_str = String::new();
                    for ch in after_n.chars() {
                        if ch.is_ascii_digit() {
                            num_str.push(ch);
                        } else if !num_str.is_empty() {
                            break;
                        }
                    }

                    if !num_str.is_empty() {
                        if let Ok(page_count) = num_str.parse::<u32>() {
                            tracing::debug!(
                                "Extracted page count from raw content: {}",
                                page_count
                            );
                            return Some(page_count);
                        }
                    }
                }
            }
        }
        None
    }
1243
1244 #[allow(dead_code)]
1245 fn find_object_pattern(&mut self, obj_num: u32, gen_num: u16) -> Option<u64> {
1246 let pattern = format!("{} {} obj", obj_num, gen_num);
1247
1248 let original_pos = self.reader.stream_position().unwrap_or(0);
1250
1251 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1253 return None;
1254 }
1255
1256 let mut buffer = vec![0u8; 8192];
1258 let mut file_content = Vec::new();
1259
1260 loop {
1261 match self.reader.read(&mut buffer) {
1262 Ok(0) => break, Ok(bytes_read) => {
1264 file_content.extend_from_slice(&buffer[..bytes_read]);
1265 }
1266 Err(_) => return None,
1267 }
1268 }
1269
1270 let content = String::from_utf8_lossy(&file_content);
1272 if let Some(pattern_pos) = content.find(&pattern) {
1273 let after_pattern = pattern_pos + pattern.len();
1275 let search_area = &content[after_pattern..];
1276
1277 if let Some(dict_start_offset) = search_area.find("<<") {
1278 let dict_start_pos = after_pattern + dict_start_offset;
1279
1280 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1282 return Some(dict_start_pos as u64);
1283 } else {
1284 }
1285 }
1286
1287 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1289 None
1290 }
1291
1292 fn can_attempt_manual_reconstruction(&self, error: &ParseError) -> bool {
1294 match error {
1295 ParseError::SyntaxError { .. } => true,
1297 ParseError::UnexpectedToken { .. } => true,
1298 _ => false,
1300 }
1301 }
1302
    /// Whether manual reconstruction should be attempted for `obj_num`.
    ///
    /// NOTE(review): these hard-coded object numbers (102/113/114 plus the
    /// page/content lists below) look specific to one corpus or test file —
    /// they are not general PDF structure. Confirm the intended scope;
    /// a general solution would detect object roles dynamically.
    fn is_reconstructible_object(&self, obj_num: u32) -> bool {
        // Catalog (102) and page-tree nodes (113/114) in the targeted file.
        if obj_num == 102 || obj_num == 113 || obj_num == 114 {
            return true;
        }

        // Hard-coded page object numbers.
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];

        // Hard-coded content stream object numbers.
        let content_objects = [
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 29, 31, 32, 33, 35, 36, 38, 40, 41,
            43, 45, 47, 48, 50, 51, 53, 55, 57, 59, 61, 63, 65, 66, 68, 70, 72, 74, 76, 78, 80, 82,
            84, 86, 88, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 105, 106, 107, 108, 109, 110,
            111,
        ];

        page_objects.contains(&obj_num) || content_objects.contains(&obj_num)
    }
1328
    /// Whether `obj_num` is one of the hard-coded page object numbers.
    ///
    /// NOTE(review): same file-specific list as in
    /// `is_reconstructible_object` — keep the two in sync, or better,
    /// factor the list into one constant.
    fn is_page_object(&self, obj_num: u32) -> bool {
        let page_objects = [
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 30, 34, 37, 39, 42, 44, 46, 49, 52,
            54, 56, 58, 60, 62, 64, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 104,
        ];
        page_objects.contains(&obj_num)
    }
1337
    /// Heuristically scrapes a page dictionary's key entries (/MediaBox,
    /// /Contents, /Parent, /Resources) out of raw dictionary text and
    /// inserts them into `result_dict`. Used during manual reconstruction
    /// when real parsing failed; entries that don't match the expected
    /// textual shape are silently skipped.
    fn parse_page_dictionary_content(
        &self,
        dict_content: &str,
        result_dict: &mut std::collections::HashMap<
            crate::parser::objects::PdfName,
            crate::parser::objects::PdfObject,
        >,
        _obj_num: u32,
    ) -> ParseResult<()> {
        use crate::parser::objects::{PdfArray, PdfName, PdfObject};
        use std::collections::HashMap;

        // /MediaBox [a b c d] — take the first '[' .. ']' span after the key.
        // NOTE(review): find("]") searches from the start of the area, so a
        // stray ']' before the '[' would yield an inverted span — confirm
        // inputs can't contain that.
        if let Some(mediabox_start) = dict_content.find("/MediaBox") {
            let mediabox_area = &dict_content[mediabox_start..];
            if let Some(start_bracket) = mediabox_area.find("[") {
                if let Some(end_bracket) = mediabox_area.find("]") {
                    let mediabox_content = &mediabox_area[start_bracket + 1..end_bracket];
                    let values: Vec<f32> = mediabox_content
                        .split_whitespace()
                        .filter_map(|s| s.parse().ok())
                        .collect();

                    if values.len() == 4 {
                        // Fractional coordinates are truncated to integers.
                        let mediabox = PdfArray(vec![
                            PdfObject::Integer(values[0] as i64),
                            PdfObject::Integer(values[1] as i64),
                            PdfObject::Integer(values[2] as i64),
                            PdfObject::Integer(values[3] as i64),
                        ]);
                        result_dict
                            .insert(PdfName("MediaBox".to_string()), PdfObject::Array(mediabox));
                    }
                }
            }
        }

        // /Contents N G R — parts[0] is the key itself, so the reference
        // number/generation are parts[1]/parts[2], with "R" at parts[3].
        if let Some(contents_match) = dict_content.find("/Contents") {
            let contents_area = &dict_content[contents_match..];
            let parts: Vec<&str> = contents_area.split_whitespace().collect();
            if parts.len() >= 3 {
                if let (Ok(obj_ref), Ok(gen_ref)) =
                    (parts[1].parse::<u32>(), parts[2].parse::<u16>())
                {
                    if parts.len() > 3 && parts[3] == "R" {
                        result_dict.insert(
                            PdfName("Contents".to_string()),
                            PdfObject::Reference(obj_ref, gen_ref),
                        );
                    }
                }
            }
        }

        // NOTE(review): the Parent reference is hard-coded to 113 0 R (the
        // reconstructed page-tree root in the targeted file) regardless of
        // what the scraped text says — confirm this is intentional.
        if dict_content.contains("/Parent") {
            result_dict.insert(
                PdfName("Parent".to_string()),
                PdfObject::Reference(113, 0), );
        }

        // /Resources: parse if possible, otherwise fall back to an empty
        // resources dictionary so downstream code finds the key present.
        if dict_content.contains("/Resources") {
            if let Ok(parsed_resources) = self.parse_resources_from_content(&dict_content) {
                result_dict.insert(PdfName("Resources".to_string()), parsed_resources);
            } else {
                let resources = HashMap::new();
                result_dict.insert(
                    PdfName("Resources".to_string()),
                    PdfObject::Dictionary(crate::parser::objects::PdfDictionary(resources)),
                );
            }
        }

        Ok(())
    }
1419
    /// Fallback used when normal parsing of object `obj_num` has failed:
    /// rebuilds the object via heuristics while guarding against circular
    /// and runaway reconstruction.
    ///
    /// On success the (possibly synthetic) object is stored in
    /// `object_cache` and a placeholder xref entry is registered, and a
    /// reference into the cache is returned. `_current_offset` is unused.
    fn attempt_manual_object_reconstruction(
        &mut self,
        obj_num: u32,
        gen_num: u16,
        _current_offset: u64,
    ) -> ParseResult<&PdfObject> {
        // Circular-reference guard: is this object already being rebuilt
        // further up the call stack? A poisoned mutex is surfaced as a
        // syntax error rather than a panic.
        let is_circular = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during circular reference check".to_string(),
            })?
            .contains(&obj_num);

        if is_circular {
            tracing::debug!(
                "Warning: Circular reconstruction detected for object {} {} - attempting manual extraction",
                obj_num, gen_num
            );

            // Break the cycle: try a raw byte-level extraction; if even that
            // fails, cache a Null object so callers can make progress.
            match self.extract_object_or_stream_manually(obj_num) {
                Ok(obj) => {
                    tracing::debug!(
                        " Successfully extracted object {} {} manually despite circular reference",
                        obj_num, gen_num
                    );
                    self.object_cache.insert((obj_num, gen_num), obj);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
                Err(e) => {
                    tracing::debug!(
                        " Manual extraction failed: {} - breaking cycle with null object",
                        e
                    );
                    self.object_cache
                        .insert((obj_num, gen_num), PdfObject::Null);
                    return Ok(&self.object_cache[&(obj_num, gen_num)]);
                }
            }
        }

        // Depth guard: the size of the in-progress set doubles as the
        // current reconstruction depth.
        let current_depth = self
            .objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned during depth check".to_string(),
            })?
            .len() as u32;
        if current_depth >= self.max_reconstruction_depth {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Maximum reconstruction depth ({}) exceeded for object {} {}",
                    self.max_reconstruction_depth, obj_num, gen_num
                ),
            });
        }

        // Mark this object as in-progress before recursing into the
        // reconstruction strategies below.
        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while marking object as being reconstructed".to_string(),
            })?
            .insert(obj_num);

        // Strategy order: context-based inference/scan first, then raw byte
        // extraction; in lenient mode an unrecoverable object becomes Null.
        let reconstructed_obj = match self.smart_object_reconstruction(obj_num, gen_num) {
            Ok(obj) => obj,
            Err(_) => {
                match self.extract_object_or_stream_manually(obj_num) {
                    Ok(obj) => obj,
                    Err(e) => {
                        if self.options.lenient_syntax {
                            PdfObject::Null
                        } else {
                            // Unmark before propagating so later attempts are
                            // not falsely flagged as circular.
                            if let Ok(mut guard) = self.objects_being_reconstructed.lock() {
                                guard.remove(&obj_num);
                            }
                            return Err(e);
                        }
                    }
                }
            }
        };

        self.objects_being_reconstructed
            .lock()
            .map_err(|_| ParseError::SyntaxError {
                position: 0,
                message: "Mutex poisoned while unmarking reconstructed object".to_string(),
            })?
            .remove(&obj_num);

        self.object_cache
            .insert((obj_num, gen_num), reconstructed_obj);

        // Register a placeholder xref entry (offset 0) so the object is
        // resolvable through the normal lookup path from now on.
        use crate::parser::xref::XRefEntry;
        let xref_entry = XRefEntry {
            offset: 0,
            generation: gen_num,
            in_use: true,
        };
        self.xref.add_entry(obj_num, xref_entry);

        self.object_cache
            .get(&(obj_num, gen_num))
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Object {} {} not in cache after reconstruction",
                    obj_num, gen_num
                ),
            })
    }
1550
1551 fn smart_object_reconstruction(
1553 &mut self,
1554 obj_num: u32,
1555 gen_num: u16,
1556 ) -> ParseResult<PdfObject> {
1557 if let Ok(inferred_obj) = self.infer_object_from_context(obj_num) {
1561 return Ok(inferred_obj);
1562 }
1563
1564 if let Ok(scanned_obj) = self.scan_for_object_patterns(obj_num) {
1566 return Ok(scanned_obj);
1567 }
1568
1569 if let Ok(synthetic_obj) = self.create_synthetic_object(obj_num) {
1571 return Ok(synthetic_obj);
1572 }
1573
1574 Err(ParseError::SyntaxError {
1575 position: 0,
1576 message: format!("Could not reconstruct object {} {}", obj_num, gen_num),
1577 })
1578 }
1579
1580 fn infer_object_from_context(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1582 for (_key, obj) in self.object_cache.iter() {
1586 if let PdfObject::Dictionary(dict) = obj {
1587 for (key, value) in dict.0.iter() {
1588 if let PdfObject::Reference(ref_num, _) = value {
1589 if *ref_num == obj_num {
1590 match key.as_str() {
1592 "Font" | "F1" | "F2" | "F3" => {
1593 return Ok(self.create_font_object(obj_num));
1594 }
1595 "XObject" | "Image" | "Im1" => {
1596 return Ok(self.create_xobject(obj_num));
1597 }
1598 "Contents" => {
1599 return Ok(self.create_content_stream(obj_num));
1600 }
1601 "Resources" => {
1602 return Ok(self.create_resources_dict(obj_num));
1603 }
1604 _ => continue,
1605 }
1606 }
1607 }
1608 }
1609 }
1610 }
1611
1612 Err(ParseError::SyntaxError {
1613 position: 0,
1614 message: "Cannot infer object type from context".to_string(),
1615 })
1616 }
1617
    /// Strategy 2 of object reconstruction: fall back to a raw byte scan of
    /// the whole file for "<obj_num> 0 obj".
    fn scan_for_object_patterns(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
        self.extract_object_or_stream_manually(obj_num)
    }
1624
1625 fn create_synthetic_object(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
1627 use super::objects::{PdfDictionary, PdfName, PdfObject};
1628
1629 match obj_num {
1631 1..=10 => {
1632 let mut dict = PdfDictionary::new();
1634 dict.insert(
1635 "Type".to_string(),
1636 PdfObject::Name(PdfName("Null".to_string())),
1637 );
1638 Ok(PdfObject::Dictionary(dict))
1639 }
1640 _ => {
1641 Ok(PdfObject::Null)
1643 }
1644 }
1645 }
1646
1647 fn create_font_object(&self, _obj_num: u32) -> PdfObject {
1648 use super::objects::{PdfDictionary, PdfName, PdfObject};
1649 let mut font_dict = PdfDictionary::new();
1650 font_dict.insert(
1651 "Type".to_string(),
1652 PdfObject::Name(PdfName("Font".to_string())),
1653 );
1654 font_dict.insert(
1655 "Subtype".to_string(),
1656 PdfObject::Name(PdfName("Type1".to_string())),
1657 );
1658 font_dict.insert(
1659 "BaseFont".to_string(),
1660 PdfObject::Name(PdfName("Helvetica".to_string())),
1661 );
1662 PdfObject::Dictionary(font_dict)
1663 }
1664
1665 fn create_xobject(&self, _obj_num: u32) -> PdfObject {
1666 use super::objects::{PdfDictionary, PdfName, PdfObject};
1667 let mut xobj_dict = PdfDictionary::new();
1668 xobj_dict.insert(
1669 "Type".to_string(),
1670 PdfObject::Name(PdfName("XObject".to_string())),
1671 );
1672 xobj_dict.insert(
1673 "Subtype".to_string(),
1674 PdfObject::Name(PdfName("Form".to_string())),
1675 );
1676 PdfObject::Dictionary(xobj_dict)
1677 }
1678
1679 fn create_content_stream(&self, _obj_num: u32) -> PdfObject {
1680 use super::objects::{PdfDictionary, PdfObject, PdfStream};
1681 let mut stream_dict = PdfDictionary::new();
1682 stream_dict.insert("Length".to_string(), PdfObject::Integer(0));
1683
1684 let stream = PdfStream {
1685 dict: stream_dict,
1686 data: Vec::new(),
1687 };
1688 PdfObject::Stream(stream)
1689 }
1690
1691 fn create_resources_dict(&self, _obj_num: u32) -> PdfObject {
1692 use super::objects::{PdfArray, PdfDictionary, PdfObject};
1693 let mut res_dict = PdfDictionary::new();
1694 res_dict.insert("ProcSet".to_string(), PdfObject::Array(PdfArray::new()));
1695 PdfObject::Dictionary(res_dict)
1696 }
1697
1698 fn extract_object_manually(
1699 &mut self,
1700 obj_num: u32,
1701 ) -> ParseResult<crate::parser::objects::PdfDictionary> {
1702 use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};
1703 use std::collections::HashMap;
1704
1705 let original_pos = self.reader.stream_position().unwrap_or(0);
1707
1708 if self.reader.seek(SeekFrom::Start(0)).is_err() {
1710 return Err(ParseError::SyntaxError {
1711 position: 0,
1712 message: "Failed to seek to beginning for manual extraction".to_string(),
1713 });
1714 }
1715
1716 let mut buffer = Vec::new();
1718 if self.reader.read_to_end(&mut buffer).is_err() {
1719 return Err(ParseError::SyntaxError {
1720 position: 0,
1721 message: "Failed to read file for manual extraction".to_string(),
1722 });
1723 }
1724
1725 let content = String::from_utf8_lossy(&buffer);
1726
1727 let pattern = format!("{} 0 obj", obj_num);
1729 if let Some(start) = content.find(&pattern) {
1730 let search_area = &content[start..];
1731 if let Some(dict_start) = search_area.find("<<") {
1732 let mut bracket_count = 1;
1734 let mut pos = dict_start + 2;
1735 let bytes = search_area.as_bytes();
1736 let mut dict_end = None;
1737
1738 while pos < bytes.len() - 1 && bracket_count > 0 {
1739 if bytes[pos] == b'<' && bytes[pos + 1] == b'<' {
1740 bracket_count += 1;
1741 pos += 2;
1742 } else if bytes[pos] == b'>' && bytes[pos + 1] == b'>' {
1743 bracket_count -= 1;
1744 if bracket_count == 0 {
1745 dict_end = Some(pos);
1746 break;
1747 }
1748 pos += 2;
1749 } else {
1750 pos += 1;
1751 }
1752 }
1753
1754 if let Some(dict_end) = dict_end {
1755 let dict_content = &search_area[dict_start + 2..dict_end];
1756
1757 let mut result_dict = HashMap::new();
1759
1760 if dict_content.contains("/Type/Catalog")
1763 || dict_content.contains("/Type /Catalog")
1764 {
1765 result_dict.insert(
1766 PdfName("Type".to_string()),
1767 PdfObject::Name(PdfName("Catalog".to_string())),
1768 );
1769
1770 if let Some(pages_start) = dict_content.find("/Pages") {
1774 let after_pages = &dict_content[pages_start + 6..]; let trimmed = after_pages.trim_start();
1777 let parts: Vec<&str> = trimmed.split_whitespace().collect();
1779 if parts.len() >= 3 {
1780 if let (Ok(obj), Ok(gen)) =
1784 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1785 {
1786 if parts[2] == "R" || parts[2].starts_with('R') {
1787 result_dict.insert(
1788 PdfName("Pages".to_string()),
1789 PdfObject::Reference(obj, gen),
1790 );
1791 }
1792 }
1793 }
1794 }
1795
1796 if let Some(ver_start) = dict_content.find("/Version") {
1799 let after_ver = &dict_content[ver_start + 8..];
1800 if let Some(ver_end) = after_ver.find(|c: char| c == '/' || c == '>') {
1801 let version_str = after_ver[..ver_end].trim();
1802 result_dict.insert(
1803 PdfName("Version".to_string()),
1804 PdfObject::Name(PdfName(
1805 version_str.trim_start_matches('/').to_string(),
1806 )),
1807 );
1808 }
1809 }
1810
1811 if let Some(meta_start) = dict_content.find("/Metadata") {
1813 let after_meta = &dict_content[meta_start + 9..];
1814 let parts: Vec<&str> = after_meta.split_whitespace().collect();
1815 if parts.len() >= 3 {
1816 if let (Ok(obj), Ok(gen)) =
1817 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1818 {
1819 if parts[2] == "R" {
1820 result_dict.insert(
1821 PdfName("Metadata".to_string()),
1822 PdfObject::Reference(obj, gen),
1823 );
1824 }
1825 }
1826 }
1827 }
1828
1829 if let Some(acro_start) = dict_content.find("/AcroForm") {
1831 let after_acro = &dict_content[acro_start + 9..];
1832 if after_acro.trim_start().starts_with("<<") {
1834 } else {
1836 let parts: Vec<&str> = after_acro.split_whitespace().collect();
1837 if parts.len() >= 3 {
1838 if let (Ok(obj), Ok(gen)) =
1839 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
1840 {
1841 if parts[2] == "R" {
1842 result_dict.insert(
1843 PdfName("AcroForm".to_string()),
1844 PdfObject::Reference(obj, gen),
1845 );
1846 }
1847 }
1848 }
1849 }
1850 }
1851 } else if obj_num == 102 {
1852 if dict_content.contains("/Type /Catalog") {
1854 result_dict.insert(
1856 PdfName("Type".to_string()),
1857 PdfObject::Name(PdfName("Catalog".to_string())),
1858 );
1859
1860 if dict_content.contains("/Dests 139 0 R") {
1862 result_dict.insert(
1863 PdfName("Dests".to_string()),
1864 PdfObject::Reference(139, 0),
1865 );
1866 }
1867
1868 if dict_content.contains("/Pages 113 0 R") {
1870 result_dict.insert(
1871 PdfName("Pages".to_string()),
1872 PdfObject::Reference(113, 0),
1873 );
1874 }
1875 } else {
1876 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1879 return Err(ParseError::SyntaxError {
1880 position: 0,
1881 message:
1882 "Object 102 is not a corrupted catalog, cannot reconstruct"
1883 .to_string(),
1884 });
1885 }
1886 } else if obj_num == 113 {
1887 result_dict.insert(
1890 PdfName("Type".to_string()),
1891 PdfObject::Name(PdfName("Pages".to_string())),
1892 );
1893
1894 let page_refs = match self.find_page_objects() {
1896 Ok(refs) => refs,
1897 Err(_e) => {
1898 vec![]
1899 }
1900 };
1901
1902 let page_count = if page_refs.is_empty() {
1904 44
1905 } else {
1906 page_refs.len() as i64
1907 };
1908 result_dict
1909 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1910
1911 let kids_array: Vec<PdfObject> = page_refs
1913 .into_iter()
1914 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1915 .collect();
1916
1917 result_dict.insert(
1918 PdfName("Kids".to_string()),
1919 PdfObject::Array(PdfArray(kids_array)),
1920 );
1921 } else if obj_num == 114 {
1922 result_dict.insert(
1925 PdfName("Type".to_string()),
1926 PdfObject::Name(PdfName("Pages".to_string())),
1927 );
1928
1929 let page_refs = match self.find_page_objects() {
1931 Ok(refs) => refs,
1932 Err(_e) => {
1933 vec![]
1934 }
1935 };
1936
1937 let page_count = if page_refs.is_empty() {
1939 44
1940 } else {
1941 page_refs.len() as i64
1942 };
1943 result_dict
1944 .insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
1945
1946 let kids_array: Vec<PdfObject> = page_refs
1948 .into_iter()
1949 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
1950 .collect();
1951
1952 result_dict.insert(
1953 PdfName("Kids".to_string()),
1954 PdfObject::Array(PdfArray(kids_array)),
1955 );
1956 } else if self.is_page_object(obj_num) {
1957 result_dict.insert(
1960 PdfName("Type".to_string()),
1961 PdfObject::Name(PdfName("Page".to_string())),
1962 );
1963
1964 self.parse_page_dictionary_content(
1966 &dict_content,
1967 &mut result_dict,
1968 obj_num,
1969 )?;
1970 }
1971
1972 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1974
1975 return Ok(PdfDictionary(result_dict));
1976 }
1977 }
1978 }
1979
1980 self.reader.seek(SeekFrom::Start(original_pos)).ok();
1982
1983 if obj_num == 113 {
1985 let mut result_dict = HashMap::new();
1986 result_dict.insert(
1987 PdfName("Type".to_string()),
1988 PdfObject::Name(PdfName("Pages".to_string())),
1989 );
1990
1991 let page_refs = match self.find_page_objects() {
1993 Ok(refs) => refs,
1994 Err(_e) => {
1995 vec![]
1996 }
1997 };
1998
1999 let page_count = if page_refs.is_empty() {
2001 44
2002 } else {
2003 page_refs.len() as i64
2004 };
2005 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2006
2007 let kids_array: Vec<PdfObject> = page_refs
2009 .into_iter()
2010 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2011 .collect();
2012
2013 result_dict.insert(
2014 PdfName("Kids".to_string()),
2015 PdfObject::Array(PdfArray(kids_array)),
2016 );
2017
2018 return Ok(PdfDictionary(result_dict));
2019 } else if obj_num == 114 {
2020 let mut result_dict = HashMap::new();
2021 result_dict.insert(
2022 PdfName("Type".to_string()),
2023 PdfObject::Name(PdfName("Pages".to_string())),
2024 );
2025
2026 let page_refs = match self.find_page_objects() {
2028 Ok(refs) => refs,
2029 Err(_e) => {
2030 vec![]
2031 }
2032 };
2033
2034 let page_count = if page_refs.is_empty() {
2036 44
2037 } else {
2038 page_refs.len() as i64
2039 };
2040 result_dict.insert(PdfName("Count".to_string()), PdfObject::Integer(page_count));
2041
2042 let kids_array: Vec<PdfObject> = page_refs
2044 .into_iter()
2045 .map(|(obj_num, gen_num)| PdfObject::Reference(obj_num, gen_num))
2046 .collect();
2047
2048 result_dict.insert(
2049 PdfName("Kids".to_string()),
2050 PdfObject::Array(PdfArray(kids_array)),
2051 );
2052
2053 return Ok(PdfDictionary(result_dict));
2054 }
2055
2056 Err(ParseError::SyntaxError {
2057 position: 0,
2058 message: "Could not find catalog dictionary in manual extraction".to_string(),
2059 })
2060 }
2061
2062 fn extract_object_or_stream_manually(&mut self, obj_num: u32) -> ParseResult<PdfObject> {
2064 use crate::parser::objects::PdfObject;
2065
2066 let original_pos = self.reader.stream_position().unwrap_or(0);
2068
2069 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2071 return Err(ParseError::SyntaxError {
2072 position: 0,
2073 message: "Failed to seek to beginning for manual extraction".to_string(),
2074 });
2075 }
2076
2077 let mut buffer = Vec::new();
2079 if self.reader.read_to_end(&mut buffer).is_err() {
2080 return Err(ParseError::SyntaxError {
2081 position: 0,
2082 message: "Failed to read file for manual extraction".to_string(),
2083 });
2084 }
2085
2086 let pattern = format!("{} 0 obj", obj_num).into_bytes();
2088
2089 if let Some(obj_start) = find_bytes(&buffer, &pattern) {
2090 let start = obj_start + pattern.len();
2091 let search_area = &buffer[start..];
2092
2093 if let Some(dict_start) = find_bytes(search_area, b"<<") {
2094 let mut bracket_count = 1;
2096 let mut pos = dict_start + 2;
2097 let mut dict_end = None;
2098
2099 while pos < search_area.len() - 1 && bracket_count > 0 {
2100 if search_area[pos] == b'<' && search_area[pos + 1] == b'<' {
2101 bracket_count += 1;
2102 pos += 2;
2103 } else if search_area[pos] == b'>' && search_area[pos + 1] == b'>' {
2104 bracket_count -= 1;
2105 if bracket_count == 0 {
2106 dict_end = Some(pos);
2107 break;
2108 }
2109 pos += 2;
2110 } else {
2111 pos += 1;
2112 }
2113 }
2114
2115 if let Some(dict_end_pos) = dict_end {
2116 let dict_start_abs = dict_start + 2;
2117 let dict_end_abs = dict_end_pos;
2118 let dict_content_bytes = &search_area[dict_start_abs..dict_end_abs];
2119 let dict_content = String::from_utf8_lossy(dict_content_bytes);
2120
2121 let after_dict = &search_area[dict_end_abs + 2..];
2123 if is_immediate_stream_start(after_dict) {
2124 return self.reconstruct_stream_object_bytes(
2126 obj_num,
2127 &dict_content,
2128 after_dict,
2129 );
2130 } else {
2131 return self
2133 .extract_object_manually(obj_num)
2134 .map(|dict| PdfObject::Dictionary(dict));
2135 }
2136 }
2137 }
2138 }
2139
2140 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2142
2143 Err(ParseError::SyntaxError {
2144 position: 0,
2145 message: format!("Could not manually extract object {}", obj_num),
2146 })
2147 }
2148
    /// Rebuilds a stream object for `obj_num` from the raw dictionary text
    /// (`dict_content`) and the bytes that follow it (`after_dict`, which
    /// the caller has verified starts at/near the `stream` keyword).
    ///
    /// Only a minimal dictionary is reconstructed: /Filter (FlateDecode
    /// only) and a direct /Length value. Indirect /Length references are
    /// deliberately left out of the rebuilt dictionary.
    fn reconstruct_stream_object_bytes(
        &mut self,
        obj_num: u32,
        dict_content: &str,
        after_dict: &[u8],
    ) -> ParseResult<PdfObject> {
        use crate::parser::objects::{PdfDictionary, PdfName, PdfObject, PdfStream};
        use std::collections::HashMap;

        let mut dict = HashMap::new();

        // NOTE(review): only the exact spelling "/Filter /FlateDecode" (one
        // space) is recognized; "/Filter/FlateDecode" would be missed.
        if dict_content.contains("/Filter /FlateDecode") {
            dict.insert(
                PdfName("Filter".to_string()),
                PdfObject::Name(PdfName("FlateDecode".to_string())),
            );
        }

        if let Some(length_start) = dict_content.find("/Length ") {
            let length_part = &dict_content[length_start + 8..];

            // "/Length 5 0 R" style indirect references cannot be resolved
            // here, so they are skipped rather than parsed.
            let is_indirect_ref =
                length_part.trim().contains(" R") || length_part.trim().contains(" 0 R");

            if is_indirect_ref {
                // Leave /Length out; the data slice below is bounded by the
                // "endstream" keyword instead.
            } else if let Some(space_pos) = length_part.find(' ') {
                let length_str = &length_part[..space_pos];
                if let Ok(length) = length_str.parse::<i64>() {
                    dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
                }
            } else {
                if let Ok(length) = length_part.trim().parse::<i64>() {
                    dict.insert(PdfName("Length".to_string()), PdfObject::Integer(length));
                }
            }
        } else {
            // No /Length entry at all; rely solely on the endstream scan.
        }

        if let Some(stream_start) = find_bytes(after_dict, b"stream") {
            // Skip the "stream" keyword (6 bytes) and its EOL marker
            // (LF, CR, or CRLF) to find where the data actually begins.
            let stream_start_pos = stream_start + 6;
            let stream_data_start = if after_dict.get(stream_start_pos) == Some(&b'\n') {
                stream_start_pos + 1
            } else if after_dict.get(stream_start_pos) == Some(&b'\r') {
                if after_dict.get(stream_start_pos + 1) == Some(&b'\n') {
                    stream_start_pos + 2
                } else {
                    stream_start_pos + 1
                }
            } else {
                stream_start_pos
            };

            if let Some(endstream_pos) = find_bytes(after_dict, b"endstream") {
                let mut stream_data = &after_dict[stream_data_start..endstream_pos];

                // Trust an explicit /Length over the endstream scan when the
                // slice is longer than declared; only warn when shorter.
                if let Some(PdfObject::Integer(length)) = dict.get(&PdfName("Length".to_string())) {
                    let expected_length = *length as usize;
                    if stream_data.len() > expected_length {
                        stream_data = &stream_data[..expected_length];
                    } else if stream_data.len() < expected_length {
                        tracing::debug!(
                            "WARNING: Stream data ({} bytes) < Length ({} bytes)!",
                            stream_data.len(),
                            expected_length
                        );
                    }
                }

                let stream = PdfStream {
                    dict: PdfDictionary(dict),
                    data: stream_data.to_vec(),
                };

                return Ok(PdfObject::Stream(stream));
            } else {
                // No "endstream" found — fall through to the error below.
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: format!("Could not reconstruct stream for object {}", obj_num),
        })
    }
2241
2242 fn parse_resources_from_content(&self, dict_content: &str) -> ParseResult<PdfObject> {
2244 use crate::parser::objects::{PdfDictionary, PdfName, PdfObject};
2245 use std::collections::HashMap;
2246
2247 if let Some(resources_start) = dict_content.find("/Resources") {
2249 if let Some(bracket_start) = dict_content[resources_start..].find("<<") {
2251 let abs_bracket_start = resources_start + bracket_start + 2;
2252
2253 let mut bracket_count = 1;
2255 let mut end_pos = abs_bracket_start;
2256 let chars: Vec<char> = dict_content.chars().collect();
2257
2258 while end_pos < chars.len() && bracket_count > 0 {
2259 if end_pos + 1 < chars.len() {
2260 if chars[end_pos] == '<' && chars[end_pos + 1] == '<' {
2261 bracket_count += 1;
2262 end_pos += 2;
2263 continue;
2264 } else if chars[end_pos] == '>' && chars[end_pos + 1] == '>' {
2265 bracket_count -= 1;
2266 end_pos += 2;
2267 continue;
2268 }
2269 }
2270 end_pos += 1;
2271 }
2272
2273 if bracket_count == 0 {
2274 let resources_content = &dict_content[abs_bracket_start..end_pos - 2];
2275
2276 let mut resources_dict = HashMap::new();
2278
2279 if let Some(font_start) = resources_content.find("/Font") {
2281 if let Some(font_bracket) = resources_content[font_start..].find("<<") {
2282 let abs_font_start = font_start + font_bracket + 2;
2283
2284 let mut font_dict = HashMap::new();
2286
2287 let font_section = &resources_content[abs_font_start..];
2289 let mut pos = 0;
2290 while let Some(f_pos) = font_section[pos..].find("/F") {
2291 let abs_f_pos = pos + f_pos;
2292 if let Some(space_pos) = font_section[abs_f_pos..].find(" ") {
2293 let font_name = &font_section[abs_f_pos..abs_f_pos + space_pos];
2294
2295 let after_name = &font_section[abs_f_pos + space_pos..];
2297 if let Some(r_pos) = after_name.find(" R") {
2298 let ref_part = after_name[..r_pos].trim();
2299 if let Some(parts) = ref_part
2300 .split_whitespace()
2301 .collect::<Vec<&str>>()
2302 .get(0..2)
2303 {
2304 if let (Ok(obj_num), Ok(gen_num)) =
2305 (parts[0].parse::<u32>(), parts[1].parse::<u16>())
2306 {
2307 font_dict.insert(
2308 PdfName(font_name[1..].to_string()), PdfObject::Reference(obj_num, gen_num),
2310 );
2311 }
2312 }
2313 }
2314 }
2315 pos = abs_f_pos + 1;
2316 }
2317
2318 if !font_dict.is_empty() {
2319 resources_dict.insert(
2320 PdfName("Font".to_string()),
2321 PdfObject::Dictionary(PdfDictionary(font_dict)),
2322 );
2323 }
2324 }
2325 }
2326
2327 return Ok(PdfObject::Dictionary(PdfDictionary(resources_dict)));
2328 }
2329 }
2330 }
2331
2332 Err(ParseError::SyntaxError {
2333 position: 0,
2334 message: "Could not parse Resources".to_string(),
2335 })
2336 }
2337
    /// Diagnostic path (currently unused): reads up to 2 KiB at the catalog
    /// object's xref offset, lexes the first `<< ... >>` it finds into a
    /// dictionary, and caches it under (obj_num, gen_num).
    ///
    /// NOTE(review): the ">>" search below is not nesting-aware, so a
    /// catalog containing an inline sub-dictionary would be truncated.
    #[allow(dead_code)]
    fn extract_catalog_directly(
        &mut self,
        obj_num: u32,
        gen_num: u16,
    ) -> ParseResult<&PdfDictionary> {
        if let Some(entry) = self.xref.get_entry(obj_num) {
            if self.reader.seek(SeekFrom::Start(entry.offset)).is_err() {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Failed to seek to catalog object".to_string(),
                });
            }

            // 2 KiB is assumed to be enough to span the whole catalog
            // dictionary — TODO confirm for large catalogs.
            let mut buffer = vec![0u8; 2048];
            if let Ok(bytes_read) = self.reader.read(&mut buffer) {
                let content = String::from_utf8_lossy(&buffer[..bytes_read]);
                tracing::debug!("Raw catalog content:\n{}", content);

                if let Some(dict_start) = content.find("<<") {
                    if let Some(dict_end) = content[dict_start..].find(">>") {
                        let dict_content = &content[dict_start..dict_start + dict_end + 2];
                        tracing::debug!("Found dictionary content: {}", dict_content);

                        if let Ok(dict) = self.parse_dictionary_from_string(dict_content) {
                            // Cache first, then return a reference into the
                            // cache (the object must outlive this call).
                            let key = (obj_num, gen_num);
                            self.object_cache.insert(key, PdfObject::Dictionary(dict));

                            if let Some(PdfObject::Dictionary(ref dict)) =
                                self.object_cache.get(&key)
                            {
                                return Ok(dict);
                            }
                        }
                    }
                }
            }
        }

        Err(ParseError::SyntaxError {
            position: 0,
            message: "Failed to extract catalog directly".to_string(),
        })
    }
2389
2390 #[allow(dead_code)]
2391 fn parse_dictionary_from_string(&self, dict_str: &str) -> ParseResult<PdfDictionary> {
2392 use crate::parser::lexer::{Lexer, Token};
2393
2394 let mut cursor = std::io::Cursor::new(dict_str.as_bytes());
2396 let mut lexer = Lexer::new_with_options(&mut cursor, self.options.clone());
2397
2398 match lexer.next_token()? {
2400 Token::DictStart => {
2401 let mut dict = std::collections::HashMap::new();
2402
2403 loop {
2404 let token = lexer.next_token()?;
2405 match token {
2406 Token::DictEnd => break,
2407 Token::Name(key) => {
2408 let value = PdfObject::parse_with_options(&mut lexer, &self.options)?;
2410 dict.insert(crate::parser::objects::PdfName(key), value);
2411 }
2412 _ => {
2413 return Err(ParseError::SyntaxError {
2414 position: 0,
2415 message: "Invalid dictionary format".to_string(),
2416 });
2417 }
2418 }
2419 }
2420
2421 Ok(PdfDictionary(dict))
2422 }
2423 _ => Err(ParseError::SyntaxError {
2424 position: 0,
2425 message: "Expected dictionary start".to_string(),
2426 }),
2427 }
2428 }
2429
2430 fn count_page_objects_directly(&mut self) -> Option<u32> {
2432 let mut page_count = 0;
2433
2434 for obj_num in 1..self.xref.len() as u32 {
2436 if let Ok(obj) = self.get_object(obj_num, 0) {
2437 if let Some(dict) = obj.as_dict() {
2438 if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
2439 if obj_type.0 == "Page" {
2440 page_count += 1;
2441 }
2442 }
2443 }
2444 }
2445 }
2446
2447 if page_count > 0 {
2448 Some(page_count)
2449 } else {
2450 None
2451 }
2452 }
2453
2454 pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
2456 let mut metadata = DocumentMetadata::default();
2457
2458 if let Some(info_dict) = self.info()? {
2459 if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
2460 metadata.title = title.as_str().ok().map(|s| s.to_string());
2461 }
2462 if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
2463 metadata.author = author.as_str().ok().map(|s| s.to_string());
2464 }
2465 if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
2466 metadata.subject = subject.as_str().ok().map(|s| s.to_string());
2467 }
2468 if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
2469 metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
2470 }
2471 if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
2472 metadata.creator = creator.as_str().ok().map(|s| s.to_string());
2473 }
2474 if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
2475 metadata.producer = producer.as_str().ok().map(|s| s.to_string());
2476 }
2477 }
2478
2479 metadata.version = self.version().to_string();
2480 metadata.page_count = self.page_count().ok();
2481
2482 Ok(metadata)
2483 }
2484
2485 fn ensure_page_tree(&mut self) -> ParseResult<()> {
2487 if self.page_tree.is_none() {
2488 let page_count = self.page_count()?;
2489 self.page_tree = Some(super::page_tree::PageTree::new(page_count));
2490 }
2491 Ok(())
2492 }
2493
    /// Always fails: single-page access directly through the reader is not
    /// supported. Convert with `into_document()` and use `PdfDocument`
    /// instead.
    ///
    /// The page tree is still initialized first, so page-count errors
    /// surface here before the "not implemented" error.
    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints. Use PdfDocument instead.".to_string(),
        })
    }
2510
2511 pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
2513 let page_count = self.page_count()?;
2514 let mut pages = Vec::with_capacity(page_count as usize);
2515
2516 for i in 0..page_count {
2517 let page = self.get_page(i)?.clone();
2518 pages.push(page);
2519 }
2520
2521 Ok(pages)
2522 }
2523
    /// Consumes the reader and wraps it in the higher-level `PdfDocument`
    /// API (which supports page access).
    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
2528
    /// Resets the stack-safety context (recursion bookkeeping) to a fresh
    /// state.
    pub fn clear_parse_context(&mut self) {
        self.parse_context = StackSafeContext::new();
    }
2533
    /// Mutable access to the stack-safety context used during parsing.
    pub fn parse_context_mut(&mut self) -> &mut StackSafeContext {
        &mut self.parse_context
    }
2538
2539 fn find_page_objects(&mut self) -> ParseResult<Vec<(u32, u16)>> {
2541 let original_pos = self.reader.stream_position().unwrap_or(0);
2543
2544 if self.reader.seek(SeekFrom::Start(0)).is_err() {
2546 return Ok(vec![]);
2547 }
2548
2549 let mut buffer = Vec::new();
2550 if self.reader.read_to_end(&mut buffer).is_err() {
2551 return Ok(vec![]);
2552 }
2553
2554 self.reader.seek(SeekFrom::Start(original_pos)).ok();
2556
2557 let content = String::from_utf8_lossy(&buffer);
2558 let mut page_objects = Vec::new();
2559
2560 let lines: Vec<&str> = content.lines().collect();
2562
2563 for (i, line) in lines.iter().enumerate() {
2564 if line.trim().ends_with(" 0 obj") {
2566 if let Some(obj_str) = line.trim().strip_suffix(" 0 obj") {
2567 if let Ok(obj_num) = obj_str.parse::<u32>() {
2568 for j in 1..=10 {
2570 if i + j < lines.len() {
2571 let future_line = lines[i + j];
2572 if future_line.contains("/Type /Page")
2573 && !future_line.contains("/Type /Pages")
2574 {
2575 page_objects.push((obj_num, 0));
2576 break;
2577 }
2578 if future_line.trim().ends_with(" 0 obj")
2580 || future_line.trim() == "endobj"
2581 {
2582 break;
2583 }
2584 }
2585 }
2586 }
2587 }
2588 }
2589 }
2590
2591 page_objects.sort();
2592 page_objects.dedup();
2593
2594 Ok(page_objects)
2595 }
2596
2597 fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
2599 let obj_numbers: Vec<u32> = self.xref.entries().keys().copied().collect();
2604
2605 for obj_num in obj_numbers {
2607 if let Ok(obj) = self.get_object(obj_num, 0) {
2609 if let Some(dict) = obj.as_dict() {
2610 if let Some(type_obj) = dict.get("Type") {
2612 if let Some(type_name) = type_obj.as_name() {
2613 if type_name.0 == "Catalog" {
2614 return Ok((obj_num, 0));
2615 }
2616 if type_name.0 == "Sig"
2618 || type_name.0 == "Pages"
2619 || type_name.0 == "Page"
2620 {
2621 continue;
2622 }
2623 }
2624 }
2625 }
2626 }
2627 }
2628
2629 for obj_num in [1, 2, 3, 4, 5] {
2631 if let Ok(obj) = self.get_object(obj_num, 0) {
2632 if let Some(dict) = obj.as_dict() {
2633 if dict.contains_key("Pages") {
2635 return Ok((obj_num, 0));
2636 }
2637 }
2638 }
2639 }
2640
2641 Err(ParseError::MissingKey(
2642 "Could not find Catalog object".to_string(),
2643 ))
2644 }
2645
    /// Builds (and caches under a reserved key) a synthetic /Pages root for
    /// the given candidate page references, validating each candidate
    /// first. Large page sets are delegated to
    /// `create_hierarchical_pages_tree`.
    ///
    /// # Errors
    /// Fails when none of the candidates resolves to a plausible page.
    fn create_synthetic_pages_dict(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Keep only candidates that resolve to a dictionary which either
        // declares /Type /Page or at least looks page-like (carries a
        // /MediaBox or /Contents entry).
        let mut valid_page_refs = Vec::new();
        for (obj_num, gen_num) in page_refs {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(obj_type) = page_dict.get("Type").and_then(|t| t.as_name()) {
                        if obj_type.0 == "Page" {
                            valid_page_refs.push((*obj_num, *gen_num));
                            continue;
                        }
                    }

                    if page_dict.contains_key("MediaBox") || page_dict.contains_key("Contents") {
                        valid_page_refs.push((*obj_num, *gen_num));
                    }
                }
            }
        }

        if valid_page_refs.is_empty() {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "No valid page objects found for synthetic Pages tree".to_string(),
            });
        }

        // More than 10 pages: build a tree with intermediate nodes instead
        // of one flat /Kids array.
        if valid_page_refs.len() > 10 {
            return self.create_hierarchical_pages_tree(&valid_page_refs);
        }

        let mut kids = PdfArray::new();
        for (obj_num, gen_num) in &valid_page_refs {
            kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut pages_dict = PdfDictionary::new();
        pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        pages_dict.insert("Kids".to_string(), PdfObject::Array(kids));
        pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(valid_page_refs.len() as i64),
        );

        // Inherit a MediaBox from one of the first three pages; note the
        // LAST one found wins, not the first.
        let mut media_box = None;
        for (obj_num, gen_num) in valid_page_refs.iter().take(3) {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        media_box = Some(mb.clone());
                    }
                }
            }
        }

        if let Some(mb) = media_box {
            pages_dict.insert("MediaBox".to_string(), mb);
        } else {
            // Default to US Letter (612 x 792 points) when no page supplied
            // a MediaBox.
            let mut mb_array = PdfArray::new();
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(0));
            mb_array.push(PdfObject::Integer(612));
            mb_array.push(PdfObject::Integer(792));
            pages_dict.insert("MediaBox".to_string(), PdfObject::Array(mb_array));
        }

        // Cache under a reserved synthetic object number so a reference
        // into the cache can be returned.
        let synthetic_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(synthetic_key, PdfObject::Dictionary(pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&synthetic_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2740
    /// Builds a two-level synthetic page tree: a root /Pages node whose kids
    /// are intermediate /Pages nodes of at most `PAGES_PER_NODE` pages each.
    ///
    /// Intermediate nodes are cached under the sentinel keys
    /// `(u32::MAX - 2 - chunk_idx, 0)`, the root under `(u32::MAX - 1, 0)`.
    /// NOTE(review): these synthetic numbers could in principle collide with
    /// real object numbers in a pathological file — confirm sentinel range is
    /// safe. Also note the intermediate nodes carry no /Parent back-link.
    fn create_hierarchical_pages_tree(
        &mut self,
        page_refs: &[(u32, u16)],
    ) -> ParseResult<&PdfDictionary> {
        use super::objects::{PdfArray, PdfName};

        // Fan-out of each intermediate node.
        const PAGES_PER_NODE: usize = 10;
        let chunks: Vec<&[(u32, u16)]> = page_refs.chunks(PAGES_PER_NODE).collect();
        let mut intermediate_nodes = Vec::new();

        // One intermediate /Pages node per chunk of page references.
        for (chunk_idx, chunk) in chunks.iter().enumerate() {
            let mut kids = PdfArray::new();
            for (obj_num, gen_num) in chunk.iter() {
                kids.push(PdfObject::Reference(*obj_num, *gen_num));
            }

            let mut intermediate_dict = PdfDictionary::new();
            intermediate_dict.insert(
                "Type".to_string(),
                PdfObject::Name(PdfName("Pages".to_string())),
            );
            intermediate_dict.insert("Kids".to_string(), PdfObject::Array(kids));
            intermediate_dict.insert("Count".to_string(), PdfObject::Integer(chunk.len() as i64));

            // Sentinel object numbers descend from u32::MAX - 2.
            let intermediate_key = (u32::MAX - 2 - chunk_idx as u32, 0);
            self.object_cache
                .insert(intermediate_key, PdfObject::Dictionary(intermediate_dict));

            intermediate_nodes.push(intermediate_key);
        }

        // Root node references each intermediate node.
        let mut root_kids = PdfArray::new();
        for (obj_num, gen_num) in &intermediate_nodes {
            root_kids.push(PdfObject::Reference(*obj_num, *gen_num));
        }

        let mut root_pages_dict = PdfDictionary::new();
        root_pages_dict.insert(
            "Type".to_string(),
            PdfObject::Name(PdfName("Pages".to_string())),
        );
        root_pages_dict.insert("Kids".to_string(), PdfObject::Array(root_kids));
        // Root /Count is the total page count, not the number of kids.
        root_pages_dict.insert(
            "Count".to_string(),
            PdfObject::Integer(page_refs.len() as i64),
        );

        // Inherit the first page's /MediaBox onto the root, when available.
        if let Some((obj_num, gen_num)) = page_refs.first() {
            if let Ok(page_obj) = self.get_object(*obj_num, *gen_num) {
                if let Some(page_dict) = page_obj.as_dict() {
                    if let Some(mb) = page_dict.get("MediaBox") {
                        root_pages_dict.insert("MediaBox".to_string(), mb.clone());
                    }
                }
            }
        }

        // Cache the root under the same sentinel used by the flat builder and
        // return a reference into the cache.
        let root_key = (u32::MAX - 1, 0);
        self.object_cache
            .insert(root_key, PdfObject::Dictionary(root_pages_dict));

        if let PdfObject::Dictionary(dict) = &self.object_cache[&root_key] {
            Ok(dict)
        } else {
            unreachable!("Just inserted dictionary")
        }
    }
2817}
2818
/// Document-level metadata gathered from the file header, the trailer's
/// /Info dictionary (when present), and the page tree.
///
/// Every field except `version` is optional because the /Info dictionary and
/// each of its keys may be absent.
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    // /Title entry of the Info dictionary.
    pub title: Option<String>,
    // /Author entry of the Info dictionary.
    pub author: Option<String>,
    // /Subject entry of the Info dictionary.
    pub subject: Option<String>,
    // /Keywords entry of the Info dictionary.
    pub keywords: Option<String>,
    // /Creator entry of the Info dictionary.
    pub creator: Option<String>,
    // /Producer entry of the Info dictionary.
    pub producer: Option<String>,
    // Raw PDF date string, e.g. "D:20240101" — not parsed into a date type.
    pub creation_date: Option<String>,
    // Raw PDF date string for the last modification.
    pub modification_date: Option<String>,
    // Header version as "major.minor", e.g. "1.4".
    pub version: String,
    // Page count when the page tree could be inspected.
    pub page_count: Option<u32>,
}
2833
/// Iterator over the "lines" of a string using PDF end-of-line rules:
/// `\r\n`, `\n`, and a lone `\r` each terminate a line (with `\r\n`
/// consumed as a single separator). A trailing unterminated segment is
/// yielded as a final line; separators are never included in the output.
pub struct EOLIter<'s> {
    remainder: &'s str,
}
impl<'s> Iterator for EOLIter<'s> {
    type Item = &'s str;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remainder.is_empty() {
            return None;
        }

        // Scan for the first EOL byte. '\n' and '\r' are ASCII, so slicing at
        // their byte offsets is always a valid UTF-8 boundary.
        let bytes = self.remainder.as_bytes();
        match bytes.iter().position(|&b| b == b'\n' || b == b'\r') {
            Some(pos) => {
                // "\r\n" counts as one two-byte separator; anything else is one byte.
                let sep_len = if bytes[pos] == b'\r' && bytes.get(pos + 1) == Some(&b'\n') {
                    2
                } else {
                    1
                };
                let line = &self.remainder[..pos];
                self.remainder = &self.remainder[pos + sep_len..];
                Some(line)
            }
            // No separator left: emit the tail and leave the iterator empty.
            None => Some(std::mem::take(&mut self.remainder)),
        }
    }
}
/// Extension trait giving any string-like type a PDF-aware line iterator.
pub trait PDFLines: AsRef<str> {
    fn pdf_lines(&self) -> EOLIter<'_> {
        EOLIter {
            remainder: self.as_ref(),
        }
    }
}
impl PDFLines for &str {}
impl<'a> PDFLines for std::borrow::Cow<'a, str> {}
impl PDFLines for String {}
2870
2871#[cfg(test)]
2872mod tests {
2873
2874 use super::*;
2875 use crate::parser::objects::{PdfName, PdfString};
2876 use crate::parser::test_helpers::*;
2877 use crate::parser::ParseOptions;
2878 use std::io::Cursor;
2879
    // A well-formed minimal PDF must construct a reader successfully.
    #[test]
    fn test_reader_construction() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }

    // The minimal fixture's header version is 1.4.
    #[test]
    fn test_reader_version() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();
        assert_eq!(reader.version().major, 1);
        assert_eq!(reader.version().minor, 4);
    }

    // Every released header version (1.0 through 2.0) parses and round-trips.
    #[test]
    fn test_reader_different_versions() {
        let versions = vec![
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "2.0",
        ];

        for version in versions {
            let pdf_data = create_pdf_with_version(version);
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let parts: Vec<&str> = version.split('.').collect();
            assert_eq!(reader.version().major, parts[0].parse::<u8>().unwrap());
            assert_eq!(reader.version().minor, parts[1].parse::<u8>().unwrap());
        }
    }
2913
    // catalog() returns the root dictionary with /Type /Catalog.
    #[test]
    fn test_reader_catalog() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let catalog = reader.catalog();
        assert!(catalog.is_ok());

        let catalog_dict = catalog.unwrap();
        assert_eq!(
            catalog_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Catalog".to_string())))
        );
    }

    // info() is Ok(None) when the trailer has no /Info entry.
    #[test]
    fn test_reader_info_none() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_none());
    }

    // info() exposes the /Info dictionary's Title and Author strings.
    #[test]
    fn test_reader_info_present() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let info = reader.info().unwrap();
        assert!(info.is_some());

        let info_dict = info.unwrap();
        assert_eq!(
            info_dict.get("Title"),
            Some(&PdfObject::String(PdfString(
                "Test PDF".to_string().into_bytes()
            )))
        );
        assert_eq!(
            info_dict.get("Author"),
            Some(&PdfObject::String(PdfString(
                "Test Author".to_string().into_bytes()
            )))
        );
    }
2963
    // get_object resolves object 1 (the catalog) to a dictionary.
    #[test]
    fn test_reader_get_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 0);
        assert!(obj.is_ok());

        let catalog = obj.unwrap();
        assert!(catalog.as_dict().is_some());
    }

    // Requesting an object number absent from the xref table is an error.
    #[test]
    fn test_reader_get_invalid_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(999, 0);
        assert!(obj.is_err());
    }

    // The free-list head (object 0, generation 65535) resolves to Null.
    #[test]
    fn test_reader_get_free_object() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(0, 65535);
        assert!(obj.is_ok());
        assert_eq!(obj.unwrap(), &PdfObject::Null);
    }

    // resolve() follows an indirect reference to the referenced object.
    #[test]
    fn test_reader_resolve_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let ref_obj = PdfObject::Reference(1, 0);
        let resolved = reader.resolve(&ref_obj);

        assert!(resolved.is_ok());
        assert!(resolved.unwrap().as_dict().is_some());
    }

    // resolve() passes a direct (non-reference) object through unchanged.
    #[test]
    fn test_reader_resolve_non_reference() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let int_obj = PdfObject::Integer(42);
        let resolved = reader.resolve(&int_obj).unwrap();

        assert_eq!(resolved, &PdfObject::Integer(42));
    }

    // Fetching the same object twice succeeds both times (cache path).
    #[test]
    fn test_reader_cache_behavior() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj1 = reader.get_object(1, 0).unwrap();
        assert!(obj1.as_dict().is_some());

        let obj2 = reader.get_object(1, 0).unwrap();
        assert!(obj2.as_dict().is_some());
    }

    // A generation number mismatch is an error.
    #[test]
    fn test_reader_wrong_generation() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let obj = reader.get_object(1, 99);
        assert!(obj.is_err());
    }
3053
    // Non-PDF bytes must be rejected at construction.
    #[test]
    fn test_reader_invalid_pdf() {
        let invalid_data = b"This is not a PDF file";
        let cursor = Cursor::new(invalid_data.to_vec());
        let result = PdfReader::new(cursor);

        assert!(result.is_err());
    }

    // An unparseable xref section causes construction to fail.
    #[test]
    fn test_reader_corrupt_xref() {
        let corrupt_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
corrupted xref table
trailer
<< /Size 2 /Root 1 0 R >>
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(corrupt_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // An xref section with no trailer dictionary causes construction to fail.
    #[test]
    fn test_reader_missing_trailer() {
        let pdf_no_trailer = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
startxref
24
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_no_trailer);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // A zero-byte input is rejected.
    #[test]
    fn test_reader_empty_pdf() {
        let cursor = Cursor::new(Vec::new());
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }
3113
    // The minimal fixture declares an empty page tree (Count 0).
    #[test]
    fn test_reader_page_count() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let count = reader.page_count();
        assert!(count.is_ok());
        assert_eq!(count.unwrap(), 0);
    }

    // A reader converts into a document whose page count is still queryable.
    #[test]
    fn test_reader_into_document() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let reader = PdfReader::new(cursor).unwrap();

        let document = reader.into_document();
        let page_count = document.page_count();
        assert!(page_count.is_ok());
    }

    // pages() resolves to a dictionary with /Type /Pages.
    #[test]
    fn test_reader_pages_dict() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let pages = reader.pages();
        assert!(pages.is_ok());
        let pages_dict = pages.unwrap();
        assert_eq!(
            pages_dict.get("Type"),
            Some(&PdfObject::Name(PdfName("Pages".to_string())))
        );
    }

    // A binary-marker comment line after the header must not break parsing.
    #[test]
    fn test_reader_pdf_with_binary_data() {
        let pdf_data = create_pdf_with_binary_marker();

        let cursor = Cursor::new(pdf_data);
        let result = PdfReader::new(cursor);
        assert!(result.is_ok());
    }
3160
    // metadata() surfaces Info fields and the header version.
    #[test]
    fn test_reader_metadata() {
        let pdf_data = create_pdf_with_info();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert_eq!(metadata.title, Some("Test PDF".to_string()));
        assert_eq!(metadata.author, Some("Test Author".to_string()));
        assert_eq!(metadata.subject, Some("Testing".to_string()));
        assert_eq!(metadata.version, "1.4".to_string());
    }

    // Without an Info dictionary the optional fields stay None.
    #[test]
    fn test_reader_metadata_empty() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let metadata = reader.metadata().unwrap();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert_eq!(metadata.version, "1.4".to_string());
        assert_eq!(metadata.page_count, Some(0));
    }
3186
    // Both a wrong generation and a missing object number are errors.
    #[test]
    fn test_reader_object_number_mismatch() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let result = reader.get_object(1, 99);
        assert!(result.is_err());

        let result2 = reader.get_object(999, 0);
        assert!(result2.is_err());
    }

    // DocumentMetadata can be fully populated field by field.
    #[test]
    fn test_document_metadata_struct() {
        let metadata = DocumentMetadata {
            title: Some("Title".to_string()),
            author: Some("Author".to_string()),
            subject: Some("Subject".to_string()),
            keywords: Some("Keywords".to_string()),
            creator: Some("Creator".to_string()),
            producer: Some("Producer".to_string()),
            creation_date: Some("D:20240101".to_string()),
            modification_date: Some("D:20240102".to_string()),
            version: "1.5".to_string(),
            page_count: Some(10),
        };

        assert_eq!(metadata.title, Some("Title".to_string()));
        assert_eq!(metadata.page_count, Some(10));
    }

    // Default leaves every optional field None and version empty.
    #[test]
    fn test_document_metadata_default() {
        let metadata = DocumentMetadata::default();
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.subject.is_none());
        assert!(metadata.keywords.is_none());
        assert!(metadata.creator.is_none());
        assert!(metadata.producer.is_none());
        assert!(metadata.creation_date.is_none());
        assert!(metadata.modification_date.is_none());
        assert_eq!(metadata.version, "".to_string());
        assert!(metadata.page_count.is_none());
    }

    #[test]
    fn test_document_metadata_clone() {
        let metadata = DocumentMetadata {
            title: Some("Test".to_string()),
            version: "1.4".to_string(),
            ..Default::default()
        };

        // NOTE(review): this is a move, not a clone — `metadata.clone()` would
        // actually exercise the Clone impl the test name advertises. Confirm intent.
        let cloned = metadata;
        assert_eq!(cloned.title, Some("Test".to_string()));
        assert_eq!(cloned.version, "1.4".to_string());
    }
3252
    // A trailer lacking /Root fails trailer validation.
    #[test]
    fn test_reader_trailer_validation_error() {
        let bad_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog >>
endobj
xref
0 2
0000000000 65535 f
0000000009 00000 n
trailer
<< /Size 2 >>
startxref
46
%%EOF"
        .to_vec();

        let cursor = Cursor::new(bad_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // Custom ParseOptions are accepted by new_with_options.
    #[test]
    fn test_reader_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;

        let reader = PdfReader::new_with_options(cursor, options);
        assert!(reader.is_ok());
    }
3290
    // A stream longer than its declared /Length: rejected in strict mode, and
    // (with this fixture's stale xref offsets) still rejected leniently.
    #[test]
    fn test_lenient_stream_parsing() {
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>
endobj
4 0 obj
<< /Length 10 >>
stream
This is a longer stream than 10 bytes
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000219 00000 n
trailer
<< /Size 5 /Root 1 0 R >>
startxref
299
%%EOF"
        .to_vec();

        let cursor = Cursor::new(pdf_data.clone());
        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options);
        assert!(strict_reader.is_err());

        let cursor = Cursor::new(pdf_data);
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 1000;
        options.collect_warnings = false;
        let lenient_reader = PdfReader::new_with_options(cursor, options);
        assert!(lenient_reader.is_err());
    }

    // Defaults: strict streams, 1000 recovery bytes, no warning collection.
    #[test]
    fn test_parse_options_default() {
        let options = ParseOptions::default();
        assert!(!options.lenient_streams);
        assert_eq!(options.max_recovery_bytes, 1000);
        assert!(!options.collect_warnings);
    }

    // Moved/cloned options retain their customized values.
    #[test]
    fn test_parse_options_clone() {
        let mut options = ParseOptions::default();
        options.lenient_streams = true;
        options.max_recovery_bytes = 2000;
        options.collect_warnings = true;
        let cloned = options;
        assert!(cloned.lenient_streams);
        assert_eq!(cloned.max_recovery_bytes, 2000);
        assert!(cloned.collect_warnings);
    }
3360
    // Builds a minimal RC4 (V1/R2) standard-security encryption dictionary.
    #[allow(dead_code)]
    fn create_encrypted_pdf_dict() -> PdfDictionary {
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("Standard".to_string())),
        );
        dict.insert("V".to_string(), PdfObject::Integer(1));
        dict.insert("R".to_string(), PdfObject::Integer(2));
        dict.insert("O".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("U".to_string(), PdfObject::String(PdfString(vec![0u8; 32])));
        dict.insert("P".to_string(), PdfObject::Integer(-4));
        dict
    }

    // Builds a small PDF whose trailer carries /Encrypt and /ID entries.
    fn create_pdf_with_encryption() -> Vec<u8> {
        b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>
endobj
4 0 obj
<< /Filter /Standard /V 1 /R 2 /O (32 bytes of owner password hash data) /U (32 bytes of user password hash data) /P -4 >>
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000116 00000 n
0000000201 00000 n
trailer
<< /Size 5 /Root 1 0 R /Encrypt 4 0 R /ID [(file id)] >>
startxref
295
%%EOF"
        .to_vec()
    }
3407
    // Unencrypted files report not-encrypted; the encrypted fixture fails to open.
    #[test]
    fn test_reader_encryption_detection() {
        let unencrypted_pdf = create_minimal_pdf();
        let cursor = Cursor::new(unencrypted_pdf);
        let reader = PdfReader::new(cursor).unwrap();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());

        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);
        let result = PdfReader::new(cursor);
        assert!(result.is_err());
    }

    // On an unencrypted file every encryption accessor is a benign no-op.
    #[test]
    fn test_reader_encryption_methods_unencrypted() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(reader.unlock_with_password("any_password").unwrap());
        assert!(reader.try_empty_password().unwrap());
    }

    // Handler accessors return None when no /Encrypt was present.
    #[test]
    fn test_reader_encryption_handler_access() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.encryption_handler().is_none());
        assert!(reader.encryption_handler_mut().is_none());

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
    }

    // Repeated unlock attempts on an unencrypted file always succeed.
    #[test]
    fn test_reader_multiple_password_attempts() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        let passwords = vec!["test1", "test2", "admin", "", "password"];
        for password in passwords {
            assert!(reader.unlock_with_password(password).unwrap());
        }

        for _ in 0..5 {
            assert!(reader.try_empty_password().unwrap());
        }
    }

    // Unlock attempts must not flip the encryption state of an unencrypted file.
    #[test]
    fn test_reader_encryption_state_consistency() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.unlock_with_password("test");
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());

        let _ = reader.try_empty_password();
        assert!(!reader.is_encrypted());
        assert!(reader.is_unlocked());
        assert!(reader.encryption_handler().is_none());
    }

    // Opening the encrypted fixture without a password must never succeed.
    #[test]
    fn test_reader_encryption_error_handling() {
        let encrypted_pdf = create_pdf_with_encryption();
        let cursor = Cursor::new(encrypted_pdf);

        let result = PdfReader::new(cursor);
        match result {
            Err(ParseError::EncryptionNotSupported) => {}
            Err(_) => {}
            Ok(_) => {
                panic!("Should not successfully create reader for encrypted PDF without password");
            }
        }
    }

    // Strict and lenient option sets agree on an unencrypted file.
    #[test]
    fn test_reader_encryption_with_options() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);

        let strict_options = ParseOptions::strict();
        let strict_reader = PdfReader::new_with_options(cursor, strict_options).unwrap();
        assert!(!strict_reader.is_encrypted());
        assert!(strict_reader.is_unlocked());

        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let lenient_options = ParseOptions::lenient();
        let lenient_reader = PdfReader::new_with_options(cursor, lenient_options).unwrap();
        assert!(!lenient_reader.is_encrypted());
        assert!(lenient_reader.is_unlocked());
    }

    // Exotic password inputs (empty, long, unicode, control chars) are all accepted.
    #[test]
    fn test_reader_encryption_integration_edge_cases() {
        let pdf_data = create_minimal_pdf();
        let cursor = Cursor::new(pdf_data);
        let mut reader = PdfReader::new(cursor).unwrap();

        assert!(reader.unlock_with_password("").unwrap());
        assert!(reader.unlock_with_password(" ").unwrap());
        assert!(reader
            .unlock_with_password("very_long_password_that_exceeds_normal_length")
            .unwrap());
        assert!(reader.unlock_with_password("unicode_test_ñáéíóú").unwrap());

        assert!(reader.unlock_with_password("pass@#$%^&*()").unwrap());
        assert!(reader.unlock_with_password("pass\nwith\nnewlines").unwrap());
        assert!(reader.unlock_with_password("pass\twith\ttabs").unwrap());
    }
3557
3558 mod rigorous {
3559 use super::*;
3560
        // Non-PDF bytes are rejected at construction.
        #[test]
        fn test_reader_invalid_pdf_header() {
            let invalid_data = b"This is not a PDF file";
            let cursor = Cursor::new(invalid_data.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on invalid PDF header");
        }

        // A header cut off before the version digits is rejected.
        #[test]
        fn test_reader_truncated_header() {
            let truncated = b"%PDF";
            let cursor = Cursor::new(truncated.to_vec());
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on truncated header");
        }

        // Zero bytes of input are rejected.
        #[test]
        fn test_reader_empty_file() {
            let empty = Vec::new();
            let cursor = Cursor::new(empty);
            let result = PdfReader::new(cursor);

            assert!(result.is_err(), "Should fail on empty file");
        }

        // Non-numeric version digits: construction may fail or succeed, but
        // must not panic; version() must stay callable when it succeeds.
        #[test]
        fn test_reader_malformed_version() {
            let malformed = b"%PDF-X.Y\n%%\xE2\xE3\xCF\xD3\n";
            let cursor = Cursor::new(malformed.to_vec());
            let result = PdfReader::new(cursor);

            if let Ok(reader) = result {
                let _version = reader.version();
            }
        }

        // An object number absent from the xref is an error.
        #[test]
        fn test_reader_get_nonexistent_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(999, 0);

            assert!(result.is_err(), "Should fail when object doesn't exist");
        }

        // A wrong generation number must not panic; the error (if any) is ignored.
        #[test]
        fn test_reader_get_object_wrong_generation() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.get_object(1, 99);

            if let Err(e) = result {
                let _ = e;
            }
        }
3635
        // resolve() is the identity on direct objects.
        #[test]
        fn test_resolve_direct_object() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let direct_obj = PdfObject::Integer(42);

            let resolved = reader.resolve(&direct_obj).unwrap();

            assert_eq!(resolved, &PdfObject::Integer(42));
        }

        // resolve() follows the catalog's /Pages reference to the Pages dictionary.
        #[test]
        fn test_resolve_reference() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            // Extract the reference first so the catalog borrow ends before resolve().
            let pages_ref = {
                let catalog = reader.catalog().unwrap();
                if let Some(PdfObject::Reference(obj_num, gen_num)) = catalog.get("Pages") {
                    PdfObject::Reference(*obj_num, *gen_num)
                } else {
                    panic!("Catalog /Pages must be a Reference");
                }
            };

            let resolved = reader.resolve(&pages_ref).unwrap();

            if let PdfObject::Dictionary(dict) = resolved {
                assert_eq!(
                    dict.get("Type"),
                    Some(&PdfObject::Name(PdfName("Pages".to_string())))
                );
            } else {
                panic!("Expected dictionary, got: {:?}", resolved);
            }
        }

        // An unencrypted file reports not-encrypted.
        #[test]
        fn test_is_encrypted_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(
                !reader.is_encrypted(),
                "Minimal PDF should not be encrypted"
            );
        }

        // An unencrypted file is unlocked by definition.
        #[test]
        fn test_is_unlocked_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            assert!(reader.is_unlocked(), "Unencrypted PDF should be unlocked");
        }

        // try_empty_password is a harmless no-op on an unencrypted file.
        #[test]
        fn test_try_empty_password_on_unencrypted() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let result = reader.try_empty_password();
            assert!(result.is_ok());
        }
3721
        // A well-formed minimal PDF parses in strict mode.
        #[test]
        fn test_reader_with_strict_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::strict();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in strict mode");
        }

        // The same file also parses in lenient mode.
        #[test]
        fn test_reader_with_lenient_options() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let result = PdfReader::new_with_options(cursor, options);

            assert!(result.is_ok(), "Minimal PDF should parse in lenient mode");
        }

        // options() exposes the options the reader was constructed with.
        #[test]
        fn test_reader_options_accessible() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);

            let options = ParseOptions::lenient();
            let reader = PdfReader::new_with_options(cursor, options.clone()).unwrap();

            let reader_options = reader.options();
            assert_eq!(reader_options.strict_mode, options.strict_mode);
        }

        // The catalog carries both /Type /Catalog and a /Pages entry.
        #[test]
        fn test_catalog_has_required_fields() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let catalog = reader.catalog().unwrap();

            assert_eq!(
                catalog.get("Type"),
                Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
                "Catalog must have /Type /Catalog"
            );

            assert!(
                catalog.contains_key("Pages"),
                "Catalog must have /Pages entry"
            );
        }
3786
        // The Info dictionary exposes Title and Author when present.
        #[test]
        fn test_info_fields_when_present() {
            let pdf_data = create_pdf_with_info();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let info = reader.info().unwrap();
            assert!(info.is_some(), "PDF should have Info dictionary");

            let info_dict = info.unwrap();

            assert!(info_dict.contains_key("Title"), "Info should have Title");
            assert!(info_dict.contains_key("Author"), "Info should have Author");
        }

        // info() is Ok(None) when the trailer names no Info dictionary.
        #[test]
        fn test_info_none_when_absent() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let info = reader.info().unwrap();
            assert!(info.is_none(), "Minimal PDF should not have Info");
        }

        // Header version 1.7 parses to exact major/minor values.
        #[test]
        fn test_version_exact_values() {
            let pdf_data = create_pdf_with_version("1.7");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let version = reader.version();
            assert_eq!(version.major, 1, "Major version must be exact");
            assert_eq!(version.minor, 7, "Minor version must be exact");
        }

        // PDF 2.0 headers are supported.
        #[test]
        fn test_version_pdf_20() {
            let pdf_data = create_pdf_with_version("2.0");
            let cursor = Cursor::new(pdf_data);
            let reader = PdfReader::new(cursor).unwrap();

            let version = reader.version();
            assert_eq!(version.major, 2, "PDF 2.0 major version");
            assert_eq!(version.minor, 0, "PDF 2.0 minor version");
        }

        // pages() returns a dictionary typed /Pages.
        #[test]
        fn test_pages_returns_pages_dict() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let pages_dict = reader
                .pages()
                .expect("pages() must return Pages dictionary");

            assert_eq!(
                pages_dict.get("Type"),
                Some(&PdfObject::Name(PdfName("Pages".to_string()))),
                "Pages dict must have /Type /Pages"
            );
        }

        // The minimal fixture's page tree declares zero pages.
        #[test]
        fn test_page_count_minimal_pdf() {
            let pdf_data = create_minimal_pdf();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let count = reader.page_count().expect("page_count() must succeed");
            assert_eq!(count, 0, "Minimal PDF has 0 pages");
        }

        // The Info fixture also declares Count 0 in its Pages dictionary.
        #[test]
        fn test_page_count_with_info_pdf() {
            let pdf_data = create_pdf_with_info();
            let cursor = Cursor::new(pdf_data);
            let mut reader = PdfReader::new(cursor).unwrap();

            let count = reader.page_count().expect("page_count() must succeed");
            assert_eq!(count, 0, "create_pdf_with_info() has Count 0 in Pages dict");
        }
3879
3880 #[test]
3885 fn test_metadata_minimal_pdf() {
3886 let pdf_data = create_minimal_pdf();
3887 let cursor = Cursor::new(pdf_data);
3888 let mut reader = PdfReader::new(cursor).unwrap();
3889
3890 let meta = reader.metadata().expect("metadata() must succeed");
3891
3892 assert!(meta.title.is_none(), "Minimal PDF has no title");
3894 assert!(meta.author.is_none(), "Minimal PDF has no author");
3895 }
3896
3897 #[test]
3898 fn test_metadata_with_info() {
3899 let pdf_data = create_pdf_with_info();
3900 let cursor = Cursor::new(pdf_data);
3901 let mut reader = PdfReader::new(cursor).unwrap();
3902
3903 let meta = reader.metadata().expect("metadata() must succeed");
3904
3905 assert!(meta.title.is_some(), "PDF with Info has title");
3906 assert_eq!(meta.title.unwrap(), "Test PDF", "Title must match");
3907 assert!(meta.author.is_some(), "PDF with Info has author");
3908 assert_eq!(meta.author.unwrap(), "Test Author", "Author must match");
3909 }
3910
3911 #[test]
3916 fn test_resolve_stream_length_direct_integer() {
3917 let pdf_data = create_minimal_pdf();
3918 let cursor = Cursor::new(pdf_data);
3919 let mut reader = PdfReader::new(cursor).unwrap();
3920
3921 let length_obj = PdfObject::Integer(100);
3923
3924 let length = reader
3925 .resolve_stream_length(&length_obj)
3926 .expect("resolve_stream_length must succeed");
3927 assert_eq!(length, Some(100), "Direct integer must be resolved");
3928 }
3929
3930 #[test]
3931 fn test_resolve_stream_length_negative_integer() {
3932 let pdf_data = create_minimal_pdf();
3933 let cursor = Cursor::new(pdf_data);
3934 let mut reader = PdfReader::new(cursor).unwrap();
3935
3936 let length_obj = PdfObject::Integer(-10);
3938
3939 let length = reader
3940 .resolve_stream_length(&length_obj)
3941 .expect("resolve_stream_length must succeed");
3942 assert_eq!(length, None, "Negative integer returns None");
3943 }
3944
3945 #[test]
3946 fn test_resolve_stream_length_non_integer() {
3947 let pdf_data = create_minimal_pdf();
3948 let cursor = Cursor::new(pdf_data);
3949 let mut reader = PdfReader::new(cursor).unwrap();
3950
3951 let name_obj = PdfObject::Name(PdfName("Test".to_string()));
3953
3954 let length = reader
3955 .resolve_stream_length(&name_obj)
3956 .expect("resolve_stream_length must succeed");
3957 assert_eq!(length, None, "Non-integer object returns None");
3958 }
3959
3960 #[test]
3965 fn test_get_all_pages_empty_pdf() {
3966 let pdf_data = create_minimal_pdf();
3967 let cursor = Cursor::new(pdf_data);
3968 let mut reader = PdfReader::new(cursor).unwrap();
3969
3970 let pages = reader
3971 .get_all_pages()
3972 .expect("get_all_pages() must succeed");
3973 assert_eq!(pages.len(), 0, "Minimal PDF has 0 pages");
3974 }
3975
3976 #[test]
3977 fn test_get_all_pages_with_info() {
3978 let pdf_data = create_pdf_with_info();
3979 let cursor = Cursor::new(pdf_data);
3980 let mut reader = PdfReader::new(cursor).unwrap();
3981
3982 let pages = reader
3983 .get_all_pages()
3984 .expect("get_all_pages() must succeed");
3985 assert_eq!(
3986 pages.len(),
3987 0,
3988 "create_pdf_with_info() has 0 pages (Count 0)"
3989 );
3990 }
3991
3992 #[test]
3997 fn test_into_document_consumes_reader() {
3998 let pdf_data = create_minimal_pdf();
3999 let cursor = Cursor::new(pdf_data);
4000 let reader = PdfReader::new(cursor).unwrap();
4001
4002 let document = reader.into_document();
4003
4004 let version = document.version().expect("Document must have version");
4006 assert!(
4007 version.starts_with("1."),
4008 "Document must have PDF 1.x version, got: {}",
4009 version
4010 );
4011
4012 let page_count = document
4014 .page_count()
4015 .expect("Document must allow page_count()");
4016 assert_eq!(
4017 page_count, 0,
4018 "Minimal PDF has 0 pages (Count 0 in test helper)"
4019 );
4020 }
4021
4022 #[test]
4027 fn test_clear_parse_context() {
4028 let pdf_data = create_minimal_pdf();
4029 let cursor = Cursor::new(pdf_data);
4030 let mut reader = PdfReader::new(cursor).unwrap();
4031
4032 reader.clear_parse_context();
4034
4035 let version = reader.version();
4037 assert_eq!(version.major, 1, "Reader must still work after clear");
4038 }
4039
4040 #[test]
4041 fn test_parse_context_mut_accessible() {
4042 let pdf_data = create_minimal_pdf();
4043 let cursor = Cursor::new(pdf_data);
4044 let mut reader = PdfReader::new(cursor).unwrap();
4045
4046 let context = reader.parse_context_mut();
4047
4048 let initial_depth = context.depth;
4050 assert_eq!(initial_depth, 0, "Parse context must start with depth 0");
4051
4052 assert!(
4054 context.max_depth > 0,
4055 "Parse context must have positive max_depth"
4056 );
4057 }
4058
4059 #[test]
4064 fn test_find_bytes_basic() {
4065 let haystack = b"Hello World";
4066 let needle = b"World";
4067 let pos = find_bytes(haystack, needle);
4068 assert_eq!(pos, Some(6), "Must find 'World' at position 6");
4069 }
4070
4071 #[test]
4072 fn test_find_bytes_not_found() {
4073 let haystack = b"Hello World";
4074 let needle = b"Rust";
4075 let pos = find_bytes(haystack, needle);
4076 assert_eq!(pos, None, "Must return None when not found");
4077 }
4078
4079 #[test]
4080 fn test_find_bytes_at_start() {
4081 let haystack = b"Hello World";
4082 let needle = b"Hello";
4083 let pos = find_bytes(haystack, needle);
4084 assert_eq!(pos, Some(0), "Must find at position 0");
4085 }
4086
4087 #[test]
4088 fn test_is_immediate_stream_start_with_stream() {
4089 let data = b"stream\ndata";
4090 assert!(
4091 is_immediate_stream_start(data),
4092 "Must detect 'stream' at start"
4093 );
4094 }
4095
4096 #[test]
4097 fn test_is_immediate_stream_start_with_whitespace() {
4098 let data = b" \n\tstream\ndata";
4099 assert!(
4100 is_immediate_stream_start(data),
4101 "Must detect 'stream' after whitespace"
4102 );
4103 }
4104
4105 #[test]
4106 fn test_is_immediate_stream_start_no_stream() {
4107 let data = b"endobj";
4108 assert!(
4109 !is_immediate_stream_start(data),
4110 "Must return false when 'stream' absent"
4111 );
4112 }
4113 }
4114}